codegraph-cli-ai 0.1.7__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/PKG-INFO +1 -1
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph/cli.py +121 -11
- codegraph_cli_ai-0.1.8/codegraph/graph/builder.py +190 -0
- codegraph_cli_ai-0.1.8/codegraph/parsers/database_parser.py +25 -0
- codegraph_cli_ai-0.1.8/codegraph/parsers/image_parser.py +39 -0
- codegraph_cli_ai-0.1.8/codegraph/parsers/multimodal_parser.py +86 -0
- codegraph_cli_ai-0.1.8/codegraph/parsers/pdf_parser.py +43 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph/parsers/python_parser.py +13 -1
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/PKG-INFO +1 -1
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/SOURCES.txt +4 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/pyproject.toml +1 -1
- codegraph_cli_ai-0.1.7/codegraph/graph/builder.py +0 -110
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/MANIFEST.in +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/README.md +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/dependency_links.txt +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/entry_points.txt +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/requires.txt +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/top_level.txt +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/setup.cfg +0 -0
|
@@ -9,6 +9,7 @@ import networkx as nx
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Optional
|
|
11
11
|
from codegraph.parsers.python_parser import PythonParser
|
|
12
|
+
from codegraph.parsers.multimodal_parser import MultiModalParser
|
|
12
13
|
from codegraph.graph.builder import GraphBuilder
|
|
13
14
|
|
|
14
15
|
app = typer.Typer()
|
|
@@ -20,6 +21,23 @@ def main():
|
|
|
20
21
|
pass
|
|
21
22
|
|
|
22
23
|
|
|
24
|
+
# Directory names that are never scanned (virtualenv layouts, VCS metadata,
# caches, and our own output directory).
IGNORE_DIRS = {
    "venv", ".venv", "env", "bin", "Scripts",
    ".git", "__pycache__", "node_modules", ".codegraph"
}
# Exact filenames to skip (our own graph output and OS cruft).
IGNORE_FILES = {"graph.json", ".DS_Store"}
# File extensions (compared lowercased) to skip: bytecode and logs.
IGNORE_EXTENSIONS = {".pyc", ".log", ".pyo", ".pyd"}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def is_virtualenv(path: Path) -> bool:
    """Heuristically detect whether *path* is a virtual environment.

    A directory counts as a venv when it contains a ``pyvenv.cfg`` file
    (PEP 405) or a Python interpreter in the usual POSIX (``bin/python``)
    or Windows (``Scripts/python.exe``) location.
    """
    markers = (
        path / "pyvenv.cfg",
        path / "bin" / "python",
        path / "Scripts" / "python.exe",
    )
    return any(marker.exists() for marker in markers)
|
|
39
|
+
|
|
40
|
+
|
|
23
41
|
@app.command()
|
|
24
42
|
def index(
|
|
25
43
|
path: str = typer.Argument(".", help="Path to the repo or folder to index")
|
|
@@ -32,31 +50,81 @@ def index(
|
|
|
32
50
|
raise typer.Exit(code=1)
|
|
33
51
|
|
|
34
52
|
typer.echo(f"Indexing: {root}\n")
|
|
35
|
-
|
|
53
|
+
|
|
54
|
+
# Phase A: Identify top-level custom virtualenvs
|
|
55
|
+
venv_dirs = set()
|
|
56
|
+
for item in root.iterdir():
|
|
57
|
+
if item.is_dir() and is_virtualenv(item):
|
|
58
|
+
venv_dirs.add(item)
|
|
59
|
+
|
|
60
|
+
# Phase B: Filtered Scan
|
|
61
|
+
all_files = []
|
|
62
|
+
for p in root.rglob("*"):
|
|
63
|
+
if not p.is_file():
|
|
64
|
+
continue
|
|
65
|
+
|
|
66
|
+
# 1. Skip if any parent directory is in IGNORE_DIRS
|
|
67
|
+
if any(part in IGNORE_DIRS for part in p.parts):
|
|
68
|
+
continue
|
|
69
|
+
|
|
70
|
+
# 2. Skip if inside a detected custom venv
|
|
71
|
+
if any(v_dir in p.parents for v_dir in venv_dirs):
|
|
72
|
+
continue
|
|
73
|
+
|
|
74
|
+
# 3. Skip if filename is ignored
|
|
75
|
+
if p.name in IGNORE_FILES:
|
|
76
|
+
continue
|
|
77
|
+
|
|
78
|
+
# 4. Skip if extension is ignored
|
|
79
|
+
if p.suffix.lower() in IGNORE_EXTENSIONS:
|
|
80
|
+
continue
|
|
81
|
+
|
|
82
|
+
all_files.append(p)
|
|
83
|
+
|
|
84
|
+
py_files = [f for f in all_files if f.suffix == ".py"]
|
|
85
|
+
asset_exts = {
|
|
86
|
+
".csv", ".json", ".db", ".sqlite",
|
|
87
|
+
".pdf",
|
|
88
|
+
".png", ".jpg", ".jpeg"
|
|
89
|
+
}
|
|
90
|
+
asset_files = [f for f in all_files if f.suffix.lower() in asset_exts]
|
|
36
91
|
|
|
37
|
-
if not py_files:
|
|
38
|
-
typer.echo("No
|
|
92
|
+
if not py_files and not asset_files:
|
|
93
|
+
typer.echo("No supported files found (everything might be ignored).")
|
|
39
94
|
raise typer.Exit()
|
|
40
95
|
|
|
41
|
-
typer.echo(f"Found {len(py_files)} Python file(s)\n")
|
|
96
|
+
typer.echo(f"Found {len(py_files)} Python file(s) and {len(asset_files)} asset(s)\n")
|
|
42
97
|
|
|
43
98
|
# Step 1 — Parse
|
|
44
|
-
|
|
99
|
+
py_parser = PythonParser()
|
|
100
|
+
mm_parser = MultiModalParser()
|
|
101
|
+
|
|
45
102
|
parsed_files = []
|
|
103
|
+
parsed_assets = []
|
|
46
104
|
failed_files = []
|
|
47
105
|
|
|
106
|
+
# Parse Python files
|
|
48
107
|
for filepath in py_files:
|
|
49
|
-
result =
|
|
108
|
+
result = py_parser.parse_file(str(filepath))
|
|
50
109
|
if result.errors:
|
|
51
110
|
failed_files.append((str(filepath), result.errors))
|
|
52
111
|
else:
|
|
53
|
-
typer.echo(f" ✔ {filepath.relative_to(root)}")
|
|
112
|
+
typer.echo(f" ✔ [code] {filepath.relative_to(root)}")
|
|
54
113
|
parsed_files.append(result)
|
|
55
114
|
|
|
115
|
+
# Parse assets
|
|
116
|
+
for filepath in asset_files:
|
|
117
|
+
try:
|
|
118
|
+
asset = mm_parser.parse(str(filepath))
|
|
119
|
+
typer.echo(f" ✔ [asset] {filepath.relative_to(root)}")
|
|
120
|
+
parsed_assets.append(asset)
|
|
121
|
+
except Exception as e:
|
|
122
|
+
failed_files.append((str(filepath), [str(e)]))
|
|
123
|
+
|
|
56
124
|
# Step 2 — Build graph
|
|
57
125
|
typer.echo("\nBuilding graph...")
|
|
58
126
|
builder = GraphBuilder()
|
|
59
|
-
builder.build(parsed_files)
|
|
127
|
+
builder.build(parsed_files, parsed_assets)
|
|
60
128
|
summary = builder.summary()
|
|
61
129
|
|
|
62
130
|
# Step 3 — Save to .codegraph/graph.json
|
|
@@ -163,9 +231,13 @@ def _build_premium_html(G: nx.DiGraph) -> str:
|
|
|
163
231
|
STYLES = {
|
|
164
232
|
"file": {"color": "#4A90E2", "shape": "diamond", "size": 28, "font_color": "#ffffff"},
|
|
165
233
|
"class": {"color": "#F5A623", "shape": "hexagon", "size": 24, "font_color": "#ffffff"},
|
|
166
|
-
"function": {"color": "#50C878", "shape": "dot",
|
|
167
|
-
"method": {"color": "#7ED6A8", "shape": "dot",
|
|
234
|
+
"function": {"color": "#50C878", "shape": "dot", "size": 16, "font_color": "#ffffff"},
|
|
235
|
+
"method": {"color": "#7ED6A8", "shape": "dot", "size": 14, "font_color": "#ffffff"},
|
|
168
236
|
"module": {"color": "#B0BEC5", "shape": "box", "size": 14, "font_color": "#ffffff"},
|
|
237
|
+
"dataset": {"color": "#FFD700", "shape": "diamond", "size": 22, "font_color": "#ffffff"},
|
|
238
|
+
"database": {"color": "#FF69B4", "shape": "database", "size": 24, "font_color": "#ffffff"},
|
|
239
|
+
"document": {"color": "#9B59B6", "shape": "diamond", "size": 24, "font_color": "#ffffff"},
|
|
240
|
+
"image": {"color": "#5DADE2", "shape": "diamond", "size": 24, "font_color": "#ffffff"},
|
|
169
241
|
}
|
|
170
242
|
|
|
171
243
|
EDGE_COLORS = {
|
|
@@ -173,6 +245,8 @@ def _build_premium_html(G: nx.DiGraph) -> str:
|
|
|
173
245
|
"calls": "#50C878",
|
|
174
246
|
"imports": "#B0BEC5",
|
|
175
247
|
"defined_in": "#E8E8E8",
|
|
248
|
+
"uses": "#FFD700",
|
|
249
|
+
"references": "#9B59B6",
|
|
176
250
|
}
|
|
177
251
|
|
|
178
252
|
nodes_js = []
|
|
@@ -188,10 +262,41 @@ def _build_premium_html(G: nx.DiGraph) -> str:
|
|
|
188
262
|
|
|
189
263
|
# Tooltip
|
|
190
264
|
tooltip_parts = [f"<b>{label}</b>", f"Kind: {kind}"]
|
|
191
|
-
if attrs.get("
|
|
265
|
+
if attrs.get("filename"):
|
|
266
|
+
tooltip_parts.append(f"File: {attrs['filename']}")
|
|
267
|
+
elif attrs.get("file"):
|
|
192
268
|
tooltip_parts.append(f"File: {attrs['file'].replace('file:', '')}")
|
|
193
269
|
if attrs.get("cls"):
|
|
194
270
|
tooltip_parts.append(f"Class: {attrs['cls']}")
|
|
271
|
+
|
|
272
|
+
# MultiModal Metadata for tooltips
|
|
273
|
+
metadata = attrs.get("metadata", {})
|
|
274
|
+
if kind == "dataset":
|
|
275
|
+
if "columns" in metadata: tooltip_parts.append(f"Columns: {', '.join(metadata['columns'])}")
|
|
276
|
+
if "keys" in metadata: tooltip_parts.append(f"Keys: {', '.join(metadata['keys'])}")
|
|
277
|
+
elif kind == "database":
|
|
278
|
+
if "tables" in metadata: tooltip_parts.append(f"Tables: {', '.join(metadata['tables'])}")
|
|
279
|
+
elif kind == "document":
|
|
280
|
+
if "num_pages" in metadata: tooltip_parts.append(f"Pages: {metadata['num_pages']}")
|
|
281
|
+
if "text_preview" in metadata: tooltip_parts.append(f"<br>Preview: <i>{metadata['text_preview']}</i>")
|
|
282
|
+
elif kind == "image":
|
|
283
|
+
if "text" in metadata and metadata["text"]: tooltip_parts.append(f"<br>OCR Text: <i>{metadata['text']}</i>")
|
|
284
|
+
|
|
285
|
+
# Add metadata for datasets/databases
|
|
286
|
+
metadata = attrs.get("metadata", {})
|
|
287
|
+
if metadata:
|
|
288
|
+
if "columns" in metadata:
|
|
289
|
+
cols = ", ".join(metadata["columns"][:5])
|
|
290
|
+
if len(metadata["columns"]) > 5: cols += "..."
|
|
291
|
+
tooltip_parts.append(f"Columns: {cols}")
|
|
292
|
+
if "tables" in metadata:
|
|
293
|
+
tbls = ", ".join(metadata["tables"])
|
|
294
|
+
tooltip_parts.append(f"Tables: {tbls}")
|
|
295
|
+
if "keys" in metadata:
|
|
296
|
+
keys = ", ".join(metadata["keys"][:5])
|
|
297
|
+
if len(metadata["keys"]) > 5: keys += "..."
|
|
298
|
+
tooltip_parts.append(f"Keys: {keys}")
|
|
299
|
+
|
|
195
300
|
if external:
|
|
196
301
|
tooltip_parts.append("<i>external</i>")
|
|
197
302
|
tooltip = "<br>".join(tooltip_parts)
|
|
@@ -337,10 +442,15 @@ def _build_premium_html(G: nx.DiGraph) -> str:
|
|
|
337
442
|
<div class="legend-item"><div class="legend-dot" style="background:#50C878"></div>Function</div>
|
|
338
443
|
<div class="legend-item"><div class="legend-dot" style="background:#7ED6A8"></div>Method</div>
|
|
339
444
|
<div class="legend-item"><div class="legend-dot" style="background:#B0BEC5;border-radius:2px"></div>Module</div>
|
|
445
|
+
<div class="legend-item"><div class="legend-dot" style="background:#FFD700;clip-path:polygon(50% 0%, 100% 50%, 50% 100%, 0% 50%)"></div>Dataset</div>
|
|
446
|
+
<div class="legend-item"><div class="legend-dot" style="background:#FF69B4;border-radius:2px"></div>Database</div>
|
|
447
|
+
<div class="legend-item"><div class="legend-dot" style="background:#9B59B6;clip-path:polygon(50% 0%, 100% 50%, 50% 100%, 0% 50%)"></div>Document</div>
|
|
448
|
+
<div class="legend-item"><div class="legend-dot" style="background:#5DADE2;clip-path:polygon(50% 0%, 100% 50%, 50% 100%, 0% 50%)"></div>Image</div>
|
|
340
449
|
<span style="font-size:11px; color:#546E7A; font-weight: 700; margin-left:12px; margin-right:4px; letter-spacing: 0.05em;">EDGES</span>
|
|
341
450
|
<div class="legend-item"><div style="width:20px;height:2px;background:#4A90E2;opacity:0.6"></div>contains</div>
|
|
342
451
|
<div class="legend-item"><div style="width:20px;height:2px;background:#50C878;opacity:0.6"></div>calls</div>
|
|
343
452
|
<div class="legend-item"><div style="width:20px;height:1px;background:#B0BEC5;border-top:1px dashed #B0BEC5"></div>imports</div>
|
|
453
|
+
<div class="legend-item"><div style="width:20px;height:2px;background:#FFD700;opacity:0.6"></div>uses</div>
|
|
344
454
|
</div>
|
|
345
455
|
|
|
346
456
|
<div id="graph-container">
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graph Builder for CodeGraph AI
|
|
3
|
+
|
|
4
|
+
Node types:
|
|
5
|
+
- file : a .py file
|
|
6
|
+
- function : top-level function
|
|
7
|
+
- class : a class
|
|
8
|
+
- method : a method belonging to a class
|
|
9
|
+
- module : an imported module/package
|
|
10
|
+
|
|
11
|
+
Edge types:
|
|
12
|
+
- contains : file → function, file → class, class → method
|
|
13
|
+
- calls : function/method → function/method
|
|
14
|
+
- imports : file → module
|
|
15
|
+
- defined_in : function/method → file
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import builtins
|
|
19
|
+
import networkx as nx
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from codegraph.parsers.python_parser import ParsedFile
|
|
22
|
+
from codegraph.parsers.multimodal_parser import ParsedAsset
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
BUILTIN_FUNCTIONS = set(dir(builtins))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class GraphBuilder:
    """Assembles a networkx DiGraph from parsed Python files and non-code assets.

    Node kinds: file, class, function, method, module, plus the asset kinds
    ("dataset", "database", "document", "image").
    Edge relations: contains, calls, imports, defined_in, and for assets
    "uses" (dataset/database) or "references" (document/image).
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        # name of a function/method -> id of the file node that defines it
        self._function_to_file: dict[str, str] = {}
        self._assets: list[ParsedAsset] = []
        self._parsed_files: list[ParsedFile] = []

    def add_file(self, parsed: ParsedFile) -> None:
        """Add one parsed .py file: its classes, functions, methods, imports and calls."""
        self._parsed_files.append(parsed)
        file_id = self._file_node_id(parsed.filepath)
        filename = Path(parsed.filepath).name

        self._add_node(file_id, kind="file", label=filename, external=False)

        for cls in parsed.classes:
            cls_id = f"class:{file_id}:{cls}"
            self._add_node(cls_id, kind="class", label=cls, external=False, file=file_id)
            self._add_edge(file_id, cls_id, relation="contains")

        for func in parsed.functions:
            func_id = f"func:{file_id}:{func}"
            self._add_node(func_id, kind="function", label=func, external=False, file=file_id)
            self._add_edge(file_id, func_id, relation="contains")
            self._add_edge(func_id, file_id, relation="defined_in")
            self._function_to_file[func] = file_id

        for cls_name, method_name in parsed.methods:
            cls_id = f"class:{file_id}:{cls_name}"
            method_id = f"func:{file_id}:{method_name}"
            self._add_node(method_id, kind="method", label=method_name,
                           external=False, file=file_id, cls=cls_name)
            self._add_edge(cls_id, method_id, relation="contains")
            self._add_edge(method_id, file_id, relation="defined_in")
            self._function_to_file[method_name] = file_id

        for imp in parsed.imports:
            mod_id = f"module:{imp}"
            self._add_node(mod_id, kind="module", label=imp, external=True)
            self._add_edge(file_id, mod_id, relation="imports")

        for caller, callee in parsed.calls:
            # Builtins (print, len, ...) would drown the graph in noise.
            if callee in BUILTIN_FUNCTIONS:
                continue

            # The caller is always scoped to the current file.
            caller_id = f"func:{file_id}:{caller}"

            # Resolve the callee through the global name->file map; anything
            # unresolved becomes an external function node.
            target_file_id = self._function_to_file.get(callee)
            if target_file_id:
                callee_id = f"func:{target_file_id}:{callee}"
            else:
                callee_id = f"func:external:{callee}"
                if not self.graph.has_node(callee_id):
                    self._add_node(callee_id, kind="function", label=callee, external=True)

            self._add_edge(caller_id, callee_id, relation="calls")

    def add_asset(self, asset: ParsedAsset) -> None:
        """Add a non-code asset (dataset/database/document/image) node.

        Bug fix: the node id is now f"{kind}:{filename}", matching the id
        scheme used by link_code_to_assets().  Previously the id contained a
        literal "(unknown)" placeholder, so every "uses"/"references" edge
        pointed at a node that was never created (networkx silently
        auto-created a bare node without kind/label/metadata).
        """
        self._assets.append(asset)
        filename = Path(asset.filepath).name
        node_id = f"{asset.kind}:{filename}"

        self._add_node(
            node_id,
            kind=asset.kind,
            label=filename,
            filename=filename,  # keep raw filename for linking
            external=False,
            metadata=asset.metadata,
        )

    def link_code_to_assets(self) -> None:
        """Connect function/method nodes to asset nodes.

        A link is created when the asset's filename appears in the function
        name, in the name of something the function calls, or in a string
        literal inside the function body.  Relation is "uses" for
        dataset/database assets, "references" otherwise.
        """
        for node_id, data in list(self.graph.nodes(data=True)):
            if data.get("kind") not in ("function", "method"):
                continue

            func_name = data.get("label")
            file_id = data.get("file")

            # Find the parsed file this function belongs to.
            parsed = next(
                (p for p in self._parsed_files if self._file_node_id(p.filepath) == file_id),
                None,
            )
            if not parsed:
                continue

            for asset in self._assets:
                asset_filename = Path(asset.filepath).name
                asset_id = f"{asset.kind}:{asset_filename}"
                relation = "uses" if asset.kind in ("dataset", "database") else "references"

                # Check 1: asset filename embedded in the function name.
                if asset_filename in func_name:
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

                # Check 2: asset filename embedded in a callee name.
                calls = [c[1] for c in parsed.calls if c[0] == func_name]
                if any(asset_filename in callee for callee in calls):
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

                # Check 3: asset filename mentioned in a string literal.
                strings = parsed.strings.get(func_name, [])
                if any(asset_filename in s for s in strings):
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

    def build(self, parsed_files: list[ParsedFile], assets: list[ParsedAsset] = None) -> nx.DiGraph:
        """Build the full graph: files first (populating the name map), then assets, then links."""
        # First pass: add all files so _function_to_file is complete before linking.
        for parsed in parsed_files:
            if not parsed.errors:
                self.add_file(parsed)

        # Second pass: add assets.
        if assets:
            for asset in assets:
                self.add_asset(asset)

        # Third pass: link code to assets.
        self.link_code_to_assets()

        return self.graph

    def summary(self) -> dict:
        """Return node/edge counts, broken down by kind and relation."""
        nodes_by_kind = {}
        for _, data in self.graph.nodes(data=True):
            kind = data.get("kind", "unknown")
            nodes_by_kind[kind] = nodes_by_kind.get(kind, 0) + 1

        edges_by_relation = {}
        for _, _, data in self.graph.edges(data=True):
            rel = data.get("relation", "unknown")
            edges_by_relation[rel] = edges_by_relation.get(rel, 0) + 1

        return {
            "total_nodes": self.graph.number_of_nodes(),
            "total_edges": self.graph.number_of_edges(),
            "nodes_by_kind": nodes_by_kind,
            "edges_by_relation": edges_by_relation,
        }

    def to_dict(self) -> dict:
        """Serialize the graph to a JSON-friendly dict of node and edge records."""
        return {
            "nodes": [{"id": node, **data} for node, data in self.graph.nodes(data=True)],
            "edges": [{"source": src, "target": dst, **data} for src, dst, data in self.graph.edges(data=True)],
        }

    def _file_node_id(self, filepath: str) -> str:
        """Stable node id for a source file (keyed by basename only)."""
        return f"file:{Path(filepath).name}"

    def _add_node(self, node_id: str, **attrs) -> None:
        """Add a node unless it already exists (first writer wins for attrs)."""
        if not self.graph.has_node(node_id):
            self.graph.add_node(node_id, **attrs)

    def _add_edge(self, src: str, dst: str, **attrs) -> None:
        """Add (or overwrite) a directed edge with the given attributes."""
        self.graph.add_edge(src, dst, **attrs)
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database Parser for CodeGraph AI
|
|
3
|
+
Extracts metadata from SQLite databases.
|
|
4
|
+
"""
|
|
5
|
+
import sqlite3
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
class DatabaseParser:
    """Parses SQLite files to extract table names.

    ``parse`` never raises: any failure is reported under the "error" key
    of the returned metadata.
    """

    def parse(self, filepath: str) -> dict:
        """Return {"tables": [...]} for the SQLite db at *filepath*.

        Fix: the connection is now closed in a ``finally`` block, so it is
        not leaked when the query raises (e.g. the file is not a valid
        database); the original only closed on the success path.
        """
        metadata = {"tables": []}
        conn = None
        try:
            conn = sqlite3.connect(filepath)
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            metadata["tables"] = [row[0] for row in cursor.fetchall()]
        except Exception as e:
            metadata["error"] = str(e)
        finally:
            if conn is not None:
                conn.close()

        return metadata
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Image Parser for CodeGraph AI
|
|
3
|
+
Extracts text from images using pytesseract (OCR).
|
|
4
|
+
"""
|
|
5
|
+
import pytesseract
|
|
6
|
+
from PIL import Image
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
class ImageParser:
    """Parses image files to extract text via OCR (pytesseract).

    Failures — a missing tesseract binary, an unreadable image — are
    reported under "error" while "text" stays empty, so callers never
    see an exception.
    """

    def parse(self, filepath: str) -> dict:
        """Return {"text": <ocr text, capped at 500 chars>} for *filepath*."""
        metadata = {"text": ""}

        try:
            # NOTE(review): TesseractNotFoundError is caught by the broad
            # handler below; a real system might handle it specifically.
            image = Image.open(filepath)
            extracted = pytesseract.image_to_string(image)

            # Cap the stored text so downstream tooltips stay small.
            if len(extracted) > 500:
                extracted = extracted[:500] + "..."

            metadata["text"] = extracted.strip()
        except Exception as e:
            # Graceful fallback: report the failure but still succeed.
            metadata["error"] = f"OCR failed or not available: {str(e)}"

        return metadata
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MultiModal Parser for CodeGraph AI
|
|
3
|
+
Extracts metadata from non-code assets like CSV, JSON, SQLite, PDF, and Images.
|
|
4
|
+
"""
|
|
5
|
+
import json
|
|
6
|
+
import csv
|
|
7
|
+
import sqlite3
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
|
|
11
|
+
from codegraph.parsers.pdf_parser import PDFParser
|
|
12
|
+
from codegraph.parsers.image_parser import ImageParser
|
|
13
|
+
from codegraph.parsers.database_parser import DatabaseParser
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class ParsedAsset:
    """Metadata extracted from a single non-code asset file."""
    # Path of the asset on disk, as passed to the parser.
    filepath: str
    kind: str  # "dataset" | "database" | "document" | "image"
    # Format-specific details (e.g. CSV columns, DB tables, OCR text).
    metadata: dict = field(default_factory=dict)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MultiModalParser:
    """Parses non-Python files (CSV, JSON, SQLite, PDF, Image) into ParsedAssets.

    Dispatch is driven by the (lowercased) file suffix; unrecognized
    suffixes produce an asset of kind "unknown".
    """

    def __init__(self):
        self.pdf_parser = PDFParser()
        self.image_parser = ImageParser()
        self.database_parser = DatabaseParser()

    def parse(self, filepath: str) -> ParsedAsset:
        """Route *filepath* to the handler for its suffix and return the result."""
        path = Path(filepath)
        handlers = {
            ".csv": self._parse_csv,
            ".json": self._parse_json,
            ".db": self._parse_sqlite,
            ".sqlite": self._parse_sqlite,
            ".pdf": self._parse_pdf,
            ".png": self._parse_image,
            ".jpg": self._parse_image,
            ".jpeg": self._parse_image,
        }
        handler = handlers.get(path.suffix.lower())
        if handler is None:
            return ParsedAsset(filepath=str(path), kind="unknown")
        return handler(path)

    def _parse_csv(self, path: Path) -> ParsedAsset:
        """Dataset metadata: the header row as "columns"."""
        metadata = {"columns": []}
        try:
            with path.open("r", encoding="utf-8") as handle:
                metadata["columns"] = next(csv.reader(handle), [])
        except Exception as e:
            metadata["error"] = str(e)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=metadata)

    def _parse_json(self, path: Path) -> ParsedAsset:
        """Dataset metadata: top-level keys (or the first element's keys for a list of dicts)."""
        metadata = {"keys": []}
        try:
            with path.open("r", encoding="utf-8") as handle:
                payload = json.load(handle)
            if isinstance(payload, dict):
                metadata["keys"] = list(payload.keys())
            elif isinstance(payload, list) and payload and isinstance(payload[0], dict):
                metadata["keys"] = list(payload[0].keys())
        except Exception as e:
            metadata["error"] = str(e)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=metadata)

    def _parse_sqlite(self, path: Path) -> ParsedAsset:
        """Database metadata, delegated to DatabaseParser."""
        return ParsedAsset(
            filepath=str(path), kind="database",
            metadata=self.database_parser.parse(str(path)),
        )

    def _parse_pdf(self, path: Path) -> ParsedAsset:
        """Document metadata, delegated to PDFParser."""
        return ParsedAsset(
            filepath=str(path), kind="document",
            metadata=self.pdf_parser.parse(str(path)),
        )

    def _parse_image(self, path: Path) -> ParsedAsset:
        """Image metadata (OCR text), delegated to ImageParser."""
        return ParsedAsset(
            filepath=str(path), kind="image",
            metadata=self.image_parser.parse(str(path)),
        )
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Parser for CodeGraph AI
|
|
3
|
+
Extracts text and metadata from PDF files using pypdf.
|
|
4
|
+
"""
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from pypdf import PdfReader
|
|
7
|
+
|
|
8
|
+
class PDFParser:
    """Parses PDF files to extract a page count and a short text preview.

    Any failure (unreadable file, broken PDF) is reported under the
    "error" key; the method itself never raises.
    """

    def parse(self, filepath: str) -> dict:
        """Return {"num_pages": int, "text_preview": str} for *filepath*.

        The preview is built from at most the first two pages and capped
        at 500 characters to keep downstream tooltips small.
        """
        metadata = {"num_pages": 0, "text_preview": ""}

        try:
            reader = PdfReader(filepath)
            page_count = len(reader.pages)
            metadata["num_pages"] = page_count

            chunks = []
            for index in range(min(2, page_count)):
                page_text = reader.pages[index].extract_text()
                if page_text:
                    chunks.append(page_text.strip())

            preview = "\n---\n".join(chunks)
            if len(preview) > 500:
                preview = preview[:500] + "..."

            metadata["text_preview"] = preview
        except Exception as e:
            metadata["error"] = str(e)

        return metadata
|
|
@@ -18,6 +18,7 @@ class ParsedFile:
|
|
|
18
18
|
methods: list[tuple[str, str]] = field(default_factory=list) # (class_name, method_name)
|
|
19
19
|
imports: list[str] = field(default_factory=list)
|
|
20
20
|
calls: list[tuple[str, str]] = field(default_factory=list) # (caller, callee)
|
|
21
|
+
strings: dict[str, list[str]] = field(default_factory=list) # func_name -> list of strings
|
|
21
22
|
errors: list[str] = field(default_factory=list)
|
|
22
23
|
|
|
23
24
|
|
|
@@ -28,7 +29,7 @@ class PythonParser:
|
|
|
28
29
|
"""
|
|
29
30
|
|
|
30
31
|
def parse_file(self, filepath: str) -> ParsedFile:
|
|
31
|
-
result = ParsedFile(filepath=filepath)
|
|
32
|
+
result = ParsedFile(filepath=filepath, strings={})
|
|
32
33
|
source = self._read_file(filepath, result)
|
|
33
34
|
|
|
34
35
|
if source is None:
|
|
@@ -93,12 +94,14 @@ class PythonParser:
|
|
|
93
94
|
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
94
95
|
result.functions.append(node.name)
|
|
95
96
|
self._extract_calls_in_func(node, node.name, result)
|
|
97
|
+
self._extract_strings_in_func(node, node.name, result)
|
|
96
98
|
|
|
97
99
|
elif isinstance(node, ast.ClassDef):
|
|
98
100
|
for item in node.body:
|
|
99
101
|
if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
100
102
|
result.methods.append((node.name, item.name))
|
|
101
103
|
self._extract_calls_in_func(item, item.name, result)
|
|
104
|
+
self._extract_strings_in_func(item, item.name, result)
|
|
102
105
|
|
|
103
106
|
def _extract_calls_in_func(self, func_node: ast.AST, func_name: str, result: ParsedFile):
|
|
104
107
|
for child in ast.walk(func_node):
|
|
@@ -107,6 +110,15 @@ class PythonParser:
|
|
|
107
110
|
if callee:
|
|
108
111
|
result.calls.append((func_name, callee))
|
|
109
112
|
|
|
113
|
+
def _extract_strings_in_func(self, func_node: ast.AST, func_name: str, result: ParsedFile):
    """Collect every string literal inside *func_node* into result.strings[func_name]."""
    bucket = result.strings.setdefault(func_name, [])
    for node in ast.walk(func_node):
        # Python 3.8+ represents string literals as ast.Constant.
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            bucket.append(node.value)
|
|
121
|
+
|
|
110
122
|
def _extract_classes(self, tree: ast.AST, result: ParsedFile):
|
|
111
123
|
for node in ast.walk(tree):
|
|
112
124
|
if isinstance(node, ast.ClassDef):
|
|
@@ -3,6 +3,10 @@ README.md
|
|
|
3
3
|
pyproject.toml
|
|
4
4
|
codegraph/cli.py
|
|
5
5
|
codegraph/graph/builder.py
|
|
6
|
+
codegraph/parsers/database_parser.py
|
|
7
|
+
codegraph/parsers/image_parser.py
|
|
8
|
+
codegraph/parsers/multimodal_parser.py
|
|
9
|
+
codegraph/parsers/pdf_parser.py
|
|
6
10
|
codegraph/parsers/python_parser.py
|
|
7
11
|
codegraph_cli_ai.egg-info/PKG-INFO
|
|
8
12
|
codegraph_cli_ai.egg-info/SOURCES.txt
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "codegraph-cli-ai"
|
|
7
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.8"
|
|
8
8
|
description = "CLI tool to analyze codebases and visualize knowledge graphs using AST"
|
|
9
9
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -1,110 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Graph Builder for CodeGraph AI
|
|
3
|
-
|
|
4
|
-
Node types:
|
|
5
|
-
- file : a .py file
|
|
6
|
-
- function : top-level function
|
|
7
|
-
- class : a class
|
|
8
|
-
- method : a method belonging to a class
|
|
9
|
-
- module : an imported module/package
|
|
10
|
-
|
|
11
|
-
Edge types:
|
|
12
|
-
- contains : file → function, file → class, class → method
|
|
13
|
-
- calls : function/method → function/method
|
|
14
|
-
- imports : file → module
|
|
15
|
-
- defined_in : function/method → file
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
import builtins
|
|
19
|
-
import networkx as nx
|
|
20
|
-
from pathlib import Path
|
|
21
|
-
from codegraph.parsers.python_parser import ParsedFile
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
BUILTIN_FUNCTIONS = set(dir(builtins))
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class GraphBuilder:
|
|
28
|
-
def __init__(self):
|
|
29
|
-
self.graph = nx.DiGraph()
|
|
30
|
-
self._function_to_file: dict[str, str] = {}
|
|
31
|
-
|
|
32
|
-
def add_file(self, parsed: ParsedFile) -> None:
|
|
33
|
-
file_id = self._file_node_id(parsed.filepath)
|
|
34
|
-
filename = Path(parsed.filepath).name
|
|
35
|
-
|
|
36
|
-
self._add_node(file_id, kind="file", label=filename, external=False)
|
|
37
|
-
|
|
38
|
-
for cls in parsed.classes:
|
|
39
|
-
cls_id = f"class:{cls}"
|
|
40
|
-
self._add_node(cls_id, kind="class", label=cls, external=False, file=file_id)
|
|
41
|
-
self._add_edge(file_id, cls_id, relation="contains")
|
|
42
|
-
|
|
43
|
-
for func in parsed.functions:
|
|
44
|
-
func_id = f"func:{func}"
|
|
45
|
-
self._add_node(func_id, kind="function", label=func, external=False, file=file_id)
|
|
46
|
-
self._add_edge(file_id, func_id, relation="contains")
|
|
47
|
-
self._add_edge(func_id, file_id, relation="defined_in")
|
|
48
|
-
self._function_to_file[func] = file_id
|
|
49
|
-
|
|
50
|
-
for cls_name, method_name in parsed.methods:
|
|
51
|
-
cls_id = f"class:{cls_name}"
|
|
52
|
-
method_id = f"func:{method_name}"
|
|
53
|
-
self._add_node(method_id, kind="method", label=method_name, external=False, file=file_id, cls=cls_name)
|
|
54
|
-
self._add_edge(cls_id, method_id, relation="contains")
|
|
55
|
-
self._add_edge(method_id, file_id, relation="defined_in")
|
|
56
|
-
self._function_to_file[method_name] = file_id
|
|
57
|
-
|
|
58
|
-
for imp in parsed.imports:
|
|
59
|
-
mod_id = f"module:{imp}"
|
|
60
|
-
self._add_node(mod_id, kind="module", label=imp, external=True)
|
|
61
|
-
self._add_edge(file_id, mod_id, relation="imports")
|
|
62
|
-
|
|
63
|
-
for caller, callee in parsed.calls:
|
|
64
|
-
if callee in BUILTIN_FUNCTIONS:
|
|
65
|
-
continue
|
|
66
|
-
caller_id = f"func:{caller}"
|
|
67
|
-
callee_id = f"func:{callee}"
|
|
68
|
-
if not self.graph.has_node(callee_id):
|
|
69
|
-
self._add_node(callee_id, kind="function", label=callee, external=True)
|
|
70
|
-
self._add_edge(caller_id, callee_id, relation="calls")
|
|
71
|
-
|
|
72
|
-
def build(self, parsed_files: list[ParsedFile]) -> nx.DiGraph:
|
|
73
|
-
for parsed in parsed_files:
|
|
74
|
-
if not parsed.errors:
|
|
75
|
-
self.add_file(parsed)
|
|
76
|
-
return self.graph
|
|
77
|
-
|
|
78
|
-
def summary(self) -> dict:
|
|
79
|
-
nodes_by_kind = {}
|
|
80
|
-
for _, data in self.graph.nodes(data=True):
|
|
81
|
-
kind = data.get("kind", "unknown")
|
|
82
|
-
nodes_by_kind[kind] = nodes_by_kind.get(kind, 0) + 1
|
|
83
|
-
|
|
84
|
-
edges_by_relation = {}
|
|
85
|
-
for _, _, data in self.graph.edges(data=True):
|
|
86
|
-
rel = data.get("relation", "unknown")
|
|
87
|
-
edges_by_relation[rel] = edges_by_relation.get(rel, 0) + 1
|
|
88
|
-
|
|
89
|
-
return {
|
|
90
|
-
"total_nodes": self.graph.number_of_nodes(),
|
|
91
|
-
"total_edges": self.graph.number_of_edges(),
|
|
92
|
-
"nodes_by_kind": nodes_by_kind,
|
|
93
|
-
"edges_by_relation": edges_by_relation,
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
def to_dict(self) -> dict:
|
|
97
|
-
return {
|
|
98
|
-
"nodes": [{"id": node, **data} for node, data in self.graph.nodes(data=True)],
|
|
99
|
-
"edges": [{"source": src, "target": dst, **data} for src, dst, data in self.graph.edges(data=True)],
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
def _file_node_id(self, filepath: str) -> str:
|
|
103
|
-
return f"file:{Path(filepath).name}"
|
|
104
|
-
|
|
105
|
-
def _add_node(self, node_id: str, **attrs) -> None:
|
|
106
|
-
if not self.graph.has_node(node_id):
|
|
107
|
-
self.graph.add_node(node_id, **attrs)
|
|
108
|
-
|
|
109
|
-
def _add_edge(self, src: str, dst: str, **attrs) -> None:
|
|
110
|
-
self.graph.add_edge(src, dst, **attrs)
|
|
File without changes
|
|
File without changes
|
{codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|