codegraph-cli-ai 0.1.7__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19) hide show
  1. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/PKG-INFO +1 -1
  2. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph/cli.py +121 -11
  3. codegraph_cli_ai-0.1.8/codegraph/graph/builder.py +190 -0
  4. codegraph_cli_ai-0.1.8/codegraph/parsers/database_parser.py +25 -0
  5. codegraph_cli_ai-0.1.8/codegraph/parsers/image_parser.py +39 -0
  6. codegraph_cli_ai-0.1.8/codegraph/parsers/multimodal_parser.py +86 -0
  7. codegraph_cli_ai-0.1.8/codegraph/parsers/pdf_parser.py +43 -0
  8. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph/parsers/python_parser.py +13 -1
  9. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/PKG-INFO +1 -1
  10. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/SOURCES.txt +4 -0
  11. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/pyproject.toml +1 -1
  12. codegraph_cli_ai-0.1.7/codegraph/graph/builder.py +0 -110
  13. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/MANIFEST.in +0 -0
  14. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/README.md +0 -0
  15. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/dependency_links.txt +0 -0
  16. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/entry_points.txt +0 -0
  17. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/requires.txt +0 -0
  18. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/codegraph_cli_ai.egg-info/top_level.txt +0 -0
  19. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codegraph-cli-ai
3
- Version: 0.1.7
3
+ Version: 0.1.8
4
4
  Summary: CLI tool to analyze codebases and visualize knowledge graphs using AST
5
5
  Author: Aditya Jogdand
6
6
  License: MIT
@@ -9,6 +9,7 @@ import networkx as nx
9
9
  from pathlib import Path
10
10
  from typing import Optional
11
11
  from codegraph.parsers.python_parser import PythonParser
12
+ from codegraph.parsers.multimodal_parser import MultiModalParser
12
13
  from codegraph.graph.builder import GraphBuilder
13
14
 
14
15
  app = typer.Typer()
@@ -20,6 +21,23 @@ def main():
20
21
  pass
21
22
 
22
23
 
24
+ IGNORE_DIRS = {
25
+ "venv", ".venv", "env", "bin", "Scripts",
26
+ ".git", "__pycache__", "node_modules", ".codegraph"
27
+ }
28
+ IGNORE_FILES = {"graph.json", ".DS_Store"}
29
+ IGNORE_EXTENSIONS = {".pyc", ".log", ".pyo", ".pyd"}
30
+
31
+
32
def is_virtualenv(path: Path) -> bool:
    """Report whether ``path`` has the on-disk layout of a virtual environment.

    A directory counts as a virtualenv when it contains a ``pyvenv.cfg``
    file, a ``bin/python`` entry, or a ``Scripts/python.exe`` entry.
    """
    markers = (
        path / "pyvenv.cfg",
        path / "bin" / "python",
        path / "Scripts" / "python.exe",
    )
    # any() stops at the first marker that exists, like the chained ``or``.
    return any(marker.exists() for marker in markers)
39
+
40
+
23
41
  @app.command()
24
42
  def index(
25
43
  path: str = typer.Argument(".", help="Path to the repo or folder to index")
@@ -32,31 +50,81 @@ def index(
32
50
  raise typer.Exit(code=1)
33
51
 
34
52
  typer.echo(f"Indexing: {root}\n")
35
- py_files = list(root.rglob("*.py"))
53
+
54
+ # Phase A: Identify top-level custom virtualenvs
55
+ venv_dirs = set()
56
+ for item in root.iterdir():
57
+ if item.is_dir() and is_virtualenv(item):
58
+ venv_dirs.add(item)
59
+
60
+ # Phase B: Filtered Scan
61
+ all_files = []
62
+ for p in root.rglob("*"):
63
+ if not p.is_file():
64
+ continue
65
+
66
+ # 1. Skip if any parent directory is in IGNORE_DIRS
67
+ if any(part in IGNORE_DIRS for part in p.parts):
68
+ continue
69
+
70
+ # 2. Skip if inside a detected custom venv
71
+ if any(v_dir in p.parents for v_dir in venv_dirs):
72
+ continue
73
+
74
+ # 3. Skip if filename is ignored
75
+ if p.name in IGNORE_FILES:
76
+ continue
77
+
78
+ # 4. Skip if extension is ignored
79
+ if p.suffix.lower() in IGNORE_EXTENSIONS:
80
+ continue
81
+
82
+ all_files.append(p)
83
+
84
+ py_files = [f for f in all_files if f.suffix == ".py"]
85
+ asset_exts = {
86
+ ".csv", ".json", ".db", ".sqlite",
87
+ ".pdf",
88
+ ".png", ".jpg", ".jpeg"
89
+ }
90
+ asset_files = [f for f in all_files if f.suffix.lower() in asset_exts]
36
91
 
37
- if not py_files:
38
- typer.echo("No Python files found.")
92
+ if not py_files and not asset_files:
93
+ typer.echo("No supported files found (everything might be ignored).")
39
94
  raise typer.Exit()
40
95
 
41
- typer.echo(f"Found {len(py_files)} Python file(s)\n")
96
+ typer.echo(f"Found {len(py_files)} Python file(s) and {len(asset_files)} asset(s)\n")
42
97
 
43
98
  # Step 1 — Parse
44
- parser = PythonParser()
99
+ py_parser = PythonParser()
100
+ mm_parser = MultiModalParser()
101
+
45
102
  parsed_files = []
103
+ parsed_assets = []
46
104
  failed_files = []
47
105
 
106
+ # Parse Python files
48
107
  for filepath in py_files:
49
- result = parser.parse_file(str(filepath))
108
+ result = py_parser.parse_file(str(filepath))
50
109
  if result.errors:
51
110
  failed_files.append((str(filepath), result.errors))
52
111
  else:
53
- typer.echo(f" ✔ {filepath.relative_to(root)}")
112
+ typer.echo(f" ✔ [code] {filepath.relative_to(root)}")
54
113
  parsed_files.append(result)
55
114
 
115
+ # Parse assets
116
+ for filepath in asset_files:
117
+ try:
118
+ asset = mm_parser.parse(str(filepath))
119
+ typer.echo(f" ✔ [asset] {filepath.relative_to(root)}")
120
+ parsed_assets.append(asset)
121
+ except Exception as e:
122
+ failed_files.append((str(filepath), [str(e)]))
123
+
56
124
  # Step 2 — Build graph
57
125
  typer.echo("\nBuilding graph...")
58
126
  builder = GraphBuilder()
59
- builder.build(parsed_files)
127
+ builder.build(parsed_files, parsed_assets)
60
128
  summary = builder.summary()
61
129
 
62
130
  # Step 3 — Save to .codegraph/graph.json
@@ -163,9 +231,13 @@ def _build_premium_html(G: nx.DiGraph) -> str:
163
231
  STYLES = {
164
232
  "file": {"color": "#4A90E2", "shape": "diamond", "size": 28, "font_color": "#ffffff"},
165
233
  "class": {"color": "#F5A623", "shape": "hexagon", "size": 24, "font_color": "#ffffff"},
166
- "function": {"color": "#50C878", "shape": "dot", "size": 16, "font_color": "#ffffff"},
167
- "method": {"color": "#7ED6A8", "shape": "dot", "size": 14, "font_color": "#ffffff"},
234
+ "function": {"color": "#50C878", "shape": "dot", "size": 16, "font_color": "#ffffff"},
235
+ "method": {"color": "#7ED6A8", "shape": "dot", "size": 14, "font_color": "#ffffff"},
168
236
  "module": {"color": "#B0BEC5", "shape": "box", "size": 14, "font_color": "#ffffff"},
237
+ "dataset": {"color": "#FFD700", "shape": "diamond", "size": 22, "font_color": "#ffffff"},
238
+ "database": {"color": "#FF69B4", "shape": "database", "size": 24, "font_color": "#ffffff"},
239
+ "document": {"color": "#9B59B6", "shape": "diamond", "size": 24, "font_color": "#ffffff"},
240
+ "image": {"color": "#5DADE2", "shape": "diamond", "size": 24, "font_color": "#ffffff"},
169
241
  }
170
242
 
171
243
  EDGE_COLORS = {
@@ -173,6 +245,8 @@ def _build_premium_html(G: nx.DiGraph) -> str:
173
245
  "calls": "#50C878",
174
246
  "imports": "#B0BEC5",
175
247
  "defined_in": "#E8E8E8",
248
+ "uses": "#FFD700",
249
+ "references": "#9B59B6",
176
250
  }
177
251
 
178
252
  nodes_js = []
@@ -188,10 +262,41 @@ def _build_premium_html(G: nx.DiGraph) -> str:
188
262
 
189
263
  # Tooltip
190
264
  tooltip_parts = [f"<b>{label}</b>", f"Kind: {kind}"]
191
- if attrs.get("file"):
265
+ if attrs.get("filename"):
266
+ tooltip_parts.append(f"File: {attrs['filename']}")
267
+ elif attrs.get("file"):
192
268
  tooltip_parts.append(f"File: {attrs['file'].replace('file:', '')}")
193
269
  if attrs.get("cls"):
194
270
  tooltip_parts.append(f"Class: {attrs['cls']}")
271
+
272
+ # MultiModal Metadata for tooltips
273
+ metadata = attrs.get("metadata", {})
274
+ if kind == "dataset":
275
+ if "columns" in metadata: tooltip_parts.append(f"Columns: {', '.join(metadata['columns'])}")
276
+ if "keys" in metadata: tooltip_parts.append(f"Keys: {', '.join(metadata['keys'])}")
277
+ elif kind == "database":
278
+ if "tables" in metadata: tooltip_parts.append(f"Tables: {', '.join(metadata['tables'])}")
279
+ elif kind == "document":
280
+ if "num_pages" in metadata: tooltip_parts.append(f"Pages: {metadata['num_pages']}")
281
+ if "text_preview" in metadata: tooltip_parts.append(f"<br>Preview: <i>{metadata['text_preview']}</i>")
282
+ elif kind == "image":
283
+ if "text" in metadata and metadata["text"]: tooltip_parts.append(f"<br>OCR Text: <i>{metadata['text']}</i>")
284
+
285
+ # Add metadata for datasets/databases
286
+ metadata = attrs.get("metadata", {})
287
+ if metadata:
288
+ if "columns" in metadata:
289
+ cols = ", ".join(metadata["columns"][:5])
290
+ if len(metadata["columns"]) > 5: cols += "..."
291
+ tooltip_parts.append(f"Columns: {cols}")
292
+ if "tables" in metadata:
293
+ tbls = ", ".join(metadata["tables"])
294
+ tooltip_parts.append(f"Tables: {tbls}")
295
+ if "keys" in metadata:
296
+ keys = ", ".join(metadata["keys"][:5])
297
+ if len(metadata["keys"]) > 5: keys += "..."
298
+ tooltip_parts.append(f"Keys: {keys}")
299
+
195
300
  if external:
196
301
  tooltip_parts.append("<i>external</i>")
197
302
  tooltip = "<br>".join(tooltip_parts)
@@ -337,10 +442,15 @@ def _build_premium_html(G: nx.DiGraph) -> str:
337
442
  <div class="legend-item"><div class="legend-dot" style="background:#50C878"></div>Function</div>
338
443
  <div class="legend-item"><div class="legend-dot" style="background:#7ED6A8"></div>Method</div>
339
444
  <div class="legend-item"><div class="legend-dot" style="background:#B0BEC5;border-radius:2px"></div>Module</div>
445
+ <div class="legend-item"><div class="legend-dot" style="background:#FFD700;clip-path:polygon(50% 0%, 100% 50%, 50% 100%, 0% 50%)"></div>Dataset</div>
446
+ <div class="legend-item"><div class="legend-dot" style="background:#FF69B4;border-radius:2px"></div>Database</div>
447
+ <div class="legend-item"><div class="legend-dot" style="background:#9B59B6;clip-path:polygon(50% 0%, 100% 50%, 50% 100%, 0% 50%)"></div>Document</div>
448
+ <div class="legend-item"><div class="legend-dot" style="background:#5DADE2;clip-path:polygon(50% 0%, 100% 50%, 50% 100%, 0% 50%)"></div>Image</div>
340
449
  <span style="font-size:11px; color:#546E7A; font-weight: 700; margin-left:12px; margin-right:4px; letter-spacing: 0.05em;">EDGES</span>
341
450
  <div class="legend-item"><div style="width:20px;height:2px;background:#4A90E2;opacity:0.6"></div>contains</div>
342
451
  <div class="legend-item"><div style="width:20px;height:2px;background:#50C878;opacity:0.6"></div>calls</div>
343
452
  <div class="legend-item"><div style="width:20px;height:1px;background:#B0BEC5;border-top:1px dashed #B0BEC5"></div>imports</div>
453
+ <div class="legend-item"><div style="width:20px;height:2px;background:#FFD700;opacity:0.6"></div>uses</div>
344
454
  </div>
345
455
 
346
456
  <div id="graph-container">
@@ -0,0 +1,190 @@
1
+ """
2
+ Graph Builder for CodeGraph AI
3
+
4
+ Node types:
5
+ - file : a .py file
6
+ - function : top-level function
7
+ - class : a class
8
+ - method : a method belonging to a class
9
+ - module : an imported module/package
10
+
11
+ Edge types:
12
+ - contains : file → function, file → class, class → method
13
+ - calls : function/method → function/method
14
+ - imports : file → module
15
+ - defined_in : function/method → file
16
+ """
17
+
18
+ import builtins
19
+ import networkx as nx
20
+ from pathlib import Path
21
+ from codegraph.parsers.python_parser import ParsedFile
22
+ from codegraph.parsers.multimodal_parser import ParsedAsset
23
+
24
+
25
+ BUILTIN_FUNCTIONS = set(dir(builtins))
26
+
27
+
28
class GraphBuilder:
    """Assemble a directed graph from parsed Python files and parsed assets.

    Node kinds created here: ``file``, ``class``, ``function``, ``method``,
    ``module``, plus whatever ``kind`` each asset carries.
    Edge relations created here: ``contains``, ``defined_in``, ``imports``,
    ``calls``, ``uses`` and ``references``.

    NOTE(review): node ids are derived from the file's base name only, so two
    files (or assets) with the same name in different directories share
    nodes. The id scheme is left unchanged because consumers of the saved
    graph may depend on it.
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        # Function/method name -> id of the file node that defines it.
        # Last definition registered wins when names repeat across files.
        self._function_to_file: dict[str, str] = {}
        self._assets: list[ParsedAsset] = []
        self._parsed_files: list[ParsedFile] = []

    def add_file(self, parsed: ParsedFile) -> None:
        """Add one parsed file: its definitions first, then its call edges.

        Calls are resolved against the definitions known at this moment, so
        callers that add files one by one get the same order-dependent
        resolution as before; ``build`` avoids that by registering every
        file's definitions before resolving any call.
        """
        self._add_definitions(parsed)
        self._add_calls(parsed)

    def _add_definitions(self, parsed: ParsedFile) -> None:
        """Create the file, class, function, method and module nodes of a file."""
        self._parsed_files.append(parsed)
        file_id = self._file_node_id(parsed.filepath)
        filename = Path(parsed.filepath).name

        self._add_node(file_id, kind="file", label=filename, external=False)

        for cls in parsed.classes:
            cls_id = f"class:{file_id}:{cls}"
            self._add_node(cls_id, kind="class", label=cls, external=False, file=file_id)
            self._add_edge(file_id, cls_id, relation="contains")

        for func in parsed.functions:
            func_id = f"func:{file_id}:{func}"
            self._add_node(func_id, kind="function", label=func, external=False, file=file_id)
            self._add_edge(file_id, func_id, relation="contains")
            self._add_edge(func_id, file_id, relation="defined_in")
            self._function_to_file[func] = file_id

        for cls_name, method_name in parsed.methods:
            cls_id = f"class:{file_id}:{cls_name}"
            method_id = f"func:{file_id}:{method_name}"
            self._add_node(
                method_id,
                kind="method",
                label=method_name,
                external=False,
                file=file_id,
                cls=cls_name,
            )
            self._add_edge(cls_id, method_id, relation="contains")
            self._add_edge(method_id, file_id, relation="defined_in")
            self._function_to_file[method_name] = file_id

        for imp in parsed.imports:
            mod_id = f"module:{imp}"
            self._add_node(mod_id, kind="module", label=imp, external=True)
            self._add_edge(file_id, mod_id, relation="imports")

    def _add_calls(self, parsed: ParsedFile) -> None:
        """Create ``calls`` edges for a file, resolving each callee to a node id."""
        file_id = self._file_node_id(parsed.filepath)
        # A name defined in this very file always resolves locally, even if
        # another file registered the same name in the global map afterwards.
        local_names = set(parsed.functions)
        local_names.update(method_name for _, method_name in parsed.methods)

        for caller, callee in parsed.calls:
            if callee in BUILTIN_FUNCTIONS:
                continue

            caller_id = f"func:{file_id}:{caller}"

            if callee in local_names:
                target_file_id = file_id
            else:
                target_file_id = self._function_to_file.get(callee)

            if target_file_id:
                callee_id = f"func:{target_file_id}:{callee}"
            else:
                # Unknown callee: represent it as an external function node.
                # _add_node is a no-op when the node already exists.
                callee_id = f"func:external:{callee}"
                self._add_node(callee_id, kind="function", label=callee, external=True)

            self._add_edge(caller_id, callee_id, relation="calls")

    def add_asset(self, asset: ParsedAsset) -> None:
        """Add a node for one parsed asset, keyed by its kind and base name."""
        self._assets.append(asset)
        filename = Path(asset.filepath).name
        node_id = f"{asset.kind}:{filename}"

        self._add_node(
            node_id,
            kind=asset.kind,
            label=filename,
            filename=filename,  # keep raw filename for linking
            external=False,
            metadata=asset.metadata,
        )

    def link_code_to_assets(self) -> None:
        """Connect function/method nodes to the assets they mention.

        An edge is added when the asset's file name is a substring of the
        function's name, of any callee recorded for that function, or of any
        string literal recorded for that function. Datasets and databases get
        a ``uses`` edge; every other asset kind gets ``references``.
        """
        if not self._assets:
            return

        # First parsed file wins for a given file id, matching a first-match scan.
        files_by_id: dict[str, ParsedFile] = {}
        for parsed in self._parsed_files:
            files_by_id.setdefault(self._file_node_id(parsed.filepath), parsed)

        # (file name to search for, asset node id, edge relation) per asset.
        asset_links = []
        for asset in self._assets:
            asset_filename = Path(asset.filepath).name
            relation = "uses" if asset.kind in ("dataset", "database") else "references"
            asset_links.append((asset_filename, f"{asset.kind}:{asset_filename}", relation))

        # file id -> {caller name -> [callee names]}, filled on first use.
        callees_by_file: dict[str, dict[str, list[str]]] = {}

        for node_id, data in list(self.graph.nodes(data=True)):
            if data.get("kind") not in ("function", "method"):
                continue

            func_name = data.get("label")
            file_id = data.get("file")
            parsed = files_by_id.get(file_id)
            if parsed is None:
                # External functions carry no file attribute and are skipped.
                continue

            if file_id not in callees_by_file:
                grouped: dict[str, list[str]] = {}
                for caller, callee in parsed.calls:
                    grouped.setdefault(caller, []).append(callee)
                callees_by_file[file_id] = grouped

            # ParsedFile.strings may not be a mapping if the file was built
            # with a non-dict default; treat that as "no strings recorded".
            strings_map = parsed.strings if isinstance(parsed.strings, dict) else {}
            searchable = [
                func_name,
                *callees_by_file[file_id].get(func_name, []),
                *strings_map.get(func_name, []),
            ]

            for asset_filename, asset_id, relation in asset_links:
                if any(asset_filename in text for text in searchable):
                    self._add_edge(node_id, asset_id, relation=relation)

    def build(self, parsed_files: list[ParsedFile], assets: "list[ParsedAsset] | None" = None) -> nx.DiGraph:
        """Build the whole graph and return it.

        Files whose ``errors`` list is non-empty are ignored. Definitions of
        every file are registered before any call is resolved, so a call to a
        function defined in a later file still links to that function's node.
        """
        usable = [parsed for parsed in parsed_files if not parsed.errors]

        # Pass 1: definitions, which populate the function-to-file map.
        for parsed in usable:
            self._add_definitions(parsed)

        # Pass 2: calls, resolved against the now-complete map.
        for parsed in usable:
            self._add_calls(parsed)

        # Pass 3: assets.
        if assets:
            for asset in assets:
                self.add_asset(asset)

        # Pass 4: link code to assets.
        self.link_code_to_assets()

        return self.graph

    def summary(self) -> dict:
        """Return node/edge totals plus counts per node kind and edge relation."""
        nodes_by_kind = {}
        for _, data in self.graph.nodes(data=True):
            kind = data.get("kind", "unknown")
            nodes_by_kind[kind] = nodes_by_kind.get(kind, 0) + 1

        edges_by_relation = {}
        for _, _, data in self.graph.edges(data=True):
            rel = data.get("relation", "unknown")
            edges_by_relation[rel] = edges_by_relation.get(rel, 0) + 1

        return {
            "total_nodes": self.graph.number_of_nodes(),
            "total_edges": self.graph.number_of_edges(),
            "nodes_by_kind": nodes_by_kind,
            "edges_by_relation": edges_by_relation,
        }

    def to_dict(self) -> dict:
        """Return the graph as ``{"nodes": [...], "edges": [...]}`` of plain dicts."""
        return {
            "nodes": [{"id": node, **data} for node, data in self.graph.nodes(data=True)],
            "edges": [
                {"source": src, "target": dst, **data}
                for src, dst, data in self.graph.edges(data=True)
            ],
        }

    def _file_node_id(self, filepath: str) -> str:
        """Return the node id of a file, built from its base name only."""
        return f"file:{Path(filepath).name}"

    def _add_node(self, node_id: str, **attrs) -> None:
        """Add a node unless it already exists; the first attributes are kept."""
        if not self.graph.has_node(node_id):
            self.graph.add_node(node_id, **attrs)

    def _add_edge(self, src: str, dst: str, **attrs) -> None:
        """Add (or update) the edge ``src -> dst`` with the given attributes."""
        self.graph.add_edge(src, dst, **attrs)
@@ -0,0 +1,25 @@
1
+ """
2
+ Database Parser for CodeGraph AI
3
+ Extracts metadata from SQLite databases.
4
+ """
5
+ import sqlite3
6
+ from pathlib import Path
7
+
8
class DatabaseParser:
    """Parse SQLite files to extract table names."""

    def parse(self, filepath: str) -> dict:
        """Return metadata for the SQLite database at ``filepath``.

        The result always has a ``"tables"`` key holding the names found in
        ``sqlite_master`` with type ``table``. If anything fails, ``"tables"``
        stays empty and an ``"error"`` key carries the exception message.
        """
        metadata = {"tables": []}
        conn = None
        try:
            conn = sqlite3.connect(filepath)
            rows = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table';"
            ).fetchall()
            metadata["tables"] = [row[0] for row in rows]
        except Exception as e:
            metadata["error"] = str(e)
        finally:
            # Close even when the query raised, so the handle is not leaked.
            if conn is not None:
                conn.close()

        return metadata
@@ -0,0 +1,39 @@
1
+ """
2
+ Image Parser for CodeGraph AI
3
+ Extracts text from images using pytesseract (OCR).
4
+ """
5
+ import pytesseract
6
+ from PIL import Image
7
+ from pathlib import Path
8
+
9
class ImageParser:
    """Parse image files to extract text via OCR."""

    def parse(self, filepath: str) -> dict:
        """Run OCR on the image at ``filepath`` and return the extracted text.

        The result always has a ``"text"`` key. Text longer than 500
        characters is cut to 500 and suffixed with ``"..."``, then stripped.
        On any failure ``"text"`` stays empty and an ``"error"`` key holds a
        message, so callers never see an exception from OCR.
        """
        metadata = {
            "text": ""
        }

        try:
            # The context manager closes the image file once OCR is done,
            # including when image_to_string raises.
            with Image.open(filepath) as img:
                ocr_text = pytesseract.image_to_string(img)

            # Limit the text length
            if len(ocr_text) > 500:
                ocr_text = ocr_text[:500] + "..."

            metadata["text"] = ocr_text.strip()

        except Exception as e:
            # Deliberate best-effort: a missing tesseract binary or an
            # unreadable image is reported in the metadata, not raised.
            metadata["error"] = f"OCR failed or not available: {str(e)}"

        return metadata
@@ -0,0 +1,86 @@
1
+ """
2
+ MultiModal Parser for CodeGraph AI
3
+ Extracts metadata from non-code assets like CSV, JSON, SQLite, PDF, and Images.
4
+ """
5
+ import json
6
+ import csv
7
+ import sqlite3
8
+ from pathlib import Path
9
+ from dataclasses import dataclass, field
10
+
11
+ from codegraph.parsers.pdf_parser import PDFParser
12
+ from codegraph.parsers.image_parser import ImageParser
13
+ from codegraph.parsers.database_parser import DatabaseParser
14
+
15
+
16
@dataclass
class ParsedAsset:
    """Metadata extracted from one non-code asset file."""

    filepath: str
    # "dataset" | "database" | "document" | "image"; MultiModalParser.parse
    # uses "unknown" for unsupported suffixes.
    kind: str
    metadata: dict = field(default_factory=dict)


class MultiModalParser:
    """Parse non-Python files (CSV, JSON, SQLite, PDF, image) into ParsedAsset."""

    def __init__(self):
        self.pdf_parser = PDFParser()
        self.image_parser = ImageParser()
        self.database_parser = DatabaseParser()

    def parse(self, filepath: str) -> ParsedAsset:
        """Dispatch on the lower-cased file suffix and return the parsed asset.

        Unsupported suffixes yield a ParsedAsset of kind ``"unknown"`` with
        empty metadata.
        """
        path = Path(filepath)
        suffix = path.suffix.lower()

        if suffix == ".csv":
            return self._parse_csv(path)
        if suffix == ".json":
            return self._parse_json(path)
        if suffix in (".db", ".sqlite"):
            return self._parse_sqlite(path)
        if suffix == ".pdf":
            return self._parse_pdf(path)
        if suffix in (".png", ".jpg", ".jpeg"):
            return self._parse_image(path)
        return ParsedAsset(filepath=str(path), kind="unknown")

    def _parse_csv(self, path: Path) -> ParsedAsset:
        """Read the header row of a CSV file into ``metadata["columns"]``."""
        metadata = {"columns": []}
        try:
            # utf-8-sig drops a leading BOM so it does not end up inside the
            # first column name; newline="" is what the csv module expects.
            with path.open("r", encoding="utf-8-sig", newline="") as f:
                metadata["columns"] = next(csv.reader(f), [])
        except Exception as e:
            metadata["error"] = str(e)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=metadata)

    def _parse_json(self, path: Path) -> ParsedAsset:
        """Record the top-level keys of a JSON object, or of the first object in a JSON array."""
        metadata = {"keys": []}
        try:
            # utf-8-sig: json.load rejects text that still starts with a BOM.
            with path.open("r", encoding="utf-8-sig") as f:
                data = json.load(f)
            if isinstance(data, dict):
                metadata["keys"] = list(data.keys())
            elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
                metadata["keys"] = list(data[0].keys())
        except Exception as e:
            metadata["error"] = str(e)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=metadata)

    def _parse_sqlite(self, path: Path) -> ParsedAsset:
        """Delegate to the database parser and wrap the result as a ``database`` asset."""
        metadata = self.database_parser.parse(str(path))
        return ParsedAsset(filepath=str(path), kind="database", metadata=metadata)

    def _parse_pdf(self, path: Path) -> ParsedAsset:
        """Delegate to the PDF parser and wrap the result as a ``document`` asset."""
        metadata = self.pdf_parser.parse(str(path))
        return ParsedAsset(filepath=str(path), kind="document", metadata=metadata)

    def _parse_image(self, path: Path) -> ParsedAsset:
        """Delegate to the image parser and wrap the result as an ``image`` asset."""
        metadata = self.image_parser.parse(str(path))
        return ParsedAsset(filepath=str(path), kind="image", metadata=metadata)
@@ -0,0 +1,43 @@
1
+ """
2
+ PDF Parser for CodeGraph AI
3
+ Extracts text and metadata from PDF files using pypdf.
4
+ """
5
+ from pathlib import Path
6
+ from pypdf import PdfReader
7
+
8
class PDFParser:
    """Parse PDF files into a page count and a short text preview."""

    def parse(self, filepath: str) -> dict:
        """Return ``num_pages`` and ``text_preview`` for the PDF at ``filepath``.

        The preview joins the stripped text of up to the first two pages with
        a ``---`` separator line and is cut to 500 characters plus ``"..."``
        when longer. On any failure an ``"error"`` key holds the message.
        """
        metadata = {"num_pages": 0, "text_preview": ""}

        try:
            reader = PdfReader(filepath)
            page_count = len(reader.pages)
            metadata["num_pages"] = page_count

            # Only the first two pages feed the preview.
            snippets = []
            for index in range(min(2, page_count)):
                page_text = reader.pages[index].extract_text()
                if page_text:
                    snippets.append(page_text.strip())

            preview = "\n---\n".join(snippets)
            # Cap the preview so tooltips stay small.
            metadata["text_preview"] = (
                preview[:500] + "..." if len(preview) > 500 else preview
            )
        except Exception as e:
            metadata["error"] = str(e)

        return metadata
@@ -18,6 +18,7 @@ class ParsedFile:
18
18
  methods: list[tuple[str, str]] = field(default_factory=list) # (class_name, method_name)
19
19
  imports: list[str] = field(default_factory=list)
20
20
  calls: list[tuple[str, str]] = field(default_factory=list) # (caller, callee)
21
strings: dict[str, list[str]] = field(default_factory=dict)  # func_name -> list of strings (the default must be a dict, not a list)
21
22
  errors: list[str] = field(default_factory=list)
22
23
 
23
24
 
@@ -28,7 +29,7 @@ class PythonParser:
28
29
  """
29
30
 
30
31
  def parse_file(self, filepath: str) -> ParsedFile:
31
- result = ParsedFile(filepath=filepath)
32
+ result = ParsedFile(filepath=filepath, strings={})
32
33
  source = self._read_file(filepath, result)
33
34
 
34
35
  if source is None:
@@ -93,12 +94,14 @@ class PythonParser:
93
94
  if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
94
95
  result.functions.append(node.name)
95
96
  self._extract_calls_in_func(node, node.name, result)
97
+ self._extract_strings_in_func(node, node.name, result)
96
98
 
97
99
  elif isinstance(node, ast.ClassDef):
98
100
  for item in node.body:
99
101
  if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
100
102
  result.methods.append((node.name, item.name))
101
103
  self._extract_calls_in_func(item, item.name, result)
104
+ self._extract_strings_in_func(item, item.name, result)
102
105
 
103
106
  def _extract_calls_in_func(self, func_node: ast.AST, func_name: str, result: ParsedFile):
104
107
  for child in ast.walk(func_node):
@@ -107,6 +110,15 @@ class PythonParser:
107
110
  if callee:
108
111
  result.calls.append((func_name, callee))
109
112
 
113
+ def _extract_strings_in_func(self, func_node: ast.AST, func_name: str, result: ParsedFile):
114
+ if func_name not in result.strings:
115
+ result.strings[func_name] = []
116
+
117
+ for child in ast.walk(func_node):
118
+ # Python 3.8+ handles strings as ast.Constant
119
+ if isinstance(child, ast.Constant) and isinstance(child.value, str):
120
+ result.strings[func_name].append(child.value)
121
+
110
122
  def _extract_classes(self, tree: ast.AST, result: ParsedFile):
111
123
  for node in ast.walk(tree):
112
124
  if isinstance(node, ast.ClassDef):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codegraph-cli-ai
3
- Version: 0.1.7
3
+ Version: 0.1.8
4
4
  Summary: CLI tool to analyze codebases and visualize knowledge graphs using AST
5
5
  Author: Aditya Jogdand
6
6
  License: MIT
@@ -3,6 +3,10 @@ README.md
3
3
  pyproject.toml
4
4
  codegraph/cli.py
5
5
  codegraph/graph/builder.py
6
+ codegraph/parsers/database_parser.py
7
+ codegraph/parsers/image_parser.py
8
+ codegraph/parsers/multimodal_parser.py
9
+ codegraph/parsers/pdf_parser.py
6
10
  codegraph/parsers/python_parser.py
7
11
  codegraph_cli_ai.egg-info/PKG-INFO
8
12
  codegraph_cli_ai.egg-info/SOURCES.txt
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codegraph-cli-ai"
7
- version = "0.1.7"
7
+ version = "0.1.8"
8
8
  description = "CLI tool to analyze codebases and visualize knowledge graphs using AST"
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  requires-python = ">=3.9"
@@ -1,110 +0,0 @@
1
- """
2
- Graph Builder for CodeGraph AI
3
-
4
- Node types:
5
- - file : a .py file
6
- - function : top-level function
7
- - class : a class
8
- - method : a method belonging to a class
9
- - module : an imported module/package
10
-
11
- Edge types:
12
- - contains : file → function, file → class, class → method
13
- - calls : function/method → function/method
14
- - imports : file → module
15
- - defined_in : function/method → file
16
- """
17
-
18
- import builtins
19
- import networkx as nx
20
- from pathlib import Path
21
- from codegraph.parsers.python_parser import ParsedFile
22
-
23
-
24
- BUILTIN_FUNCTIONS = set(dir(builtins))
25
-
26
-
27
- class GraphBuilder:
28
- def __init__(self):
29
- self.graph = nx.DiGraph()
30
- self._function_to_file: dict[str, str] = {}
31
-
32
- def add_file(self, parsed: ParsedFile) -> None:
33
- file_id = self._file_node_id(parsed.filepath)
34
- filename = Path(parsed.filepath).name
35
-
36
- self._add_node(file_id, kind="file", label=filename, external=False)
37
-
38
- for cls in parsed.classes:
39
- cls_id = f"class:{cls}"
40
- self._add_node(cls_id, kind="class", label=cls, external=False, file=file_id)
41
- self._add_edge(file_id, cls_id, relation="contains")
42
-
43
- for func in parsed.functions:
44
- func_id = f"func:{func}"
45
- self._add_node(func_id, kind="function", label=func, external=False, file=file_id)
46
- self._add_edge(file_id, func_id, relation="contains")
47
- self._add_edge(func_id, file_id, relation="defined_in")
48
- self._function_to_file[func] = file_id
49
-
50
- for cls_name, method_name in parsed.methods:
51
- cls_id = f"class:{cls_name}"
52
- method_id = f"func:{method_name}"
53
- self._add_node(method_id, kind="method", label=method_name, external=False, file=file_id, cls=cls_name)
54
- self._add_edge(cls_id, method_id, relation="contains")
55
- self._add_edge(method_id, file_id, relation="defined_in")
56
- self._function_to_file[method_name] = file_id
57
-
58
- for imp in parsed.imports:
59
- mod_id = f"module:{imp}"
60
- self._add_node(mod_id, kind="module", label=imp, external=True)
61
- self._add_edge(file_id, mod_id, relation="imports")
62
-
63
- for caller, callee in parsed.calls:
64
- if callee in BUILTIN_FUNCTIONS:
65
- continue
66
- caller_id = f"func:{caller}"
67
- callee_id = f"func:{callee}"
68
- if not self.graph.has_node(callee_id):
69
- self._add_node(callee_id, kind="function", label=callee, external=True)
70
- self._add_edge(caller_id, callee_id, relation="calls")
71
-
72
- def build(self, parsed_files: list[ParsedFile]) -> nx.DiGraph:
73
- for parsed in parsed_files:
74
- if not parsed.errors:
75
- self.add_file(parsed)
76
- return self.graph
77
-
78
- def summary(self) -> dict:
79
- nodes_by_kind = {}
80
- for _, data in self.graph.nodes(data=True):
81
- kind = data.get("kind", "unknown")
82
- nodes_by_kind[kind] = nodes_by_kind.get(kind, 0) + 1
83
-
84
- edges_by_relation = {}
85
- for _, _, data in self.graph.edges(data=True):
86
- rel = data.get("relation", "unknown")
87
- edges_by_relation[rel] = edges_by_relation.get(rel, 0) + 1
88
-
89
- return {
90
- "total_nodes": self.graph.number_of_nodes(),
91
- "total_edges": self.graph.number_of_edges(),
92
- "nodes_by_kind": nodes_by_kind,
93
- "edges_by_relation": edges_by_relation,
94
- }
95
-
96
- def to_dict(self) -> dict:
97
- return {
98
- "nodes": [{"id": node, **data} for node, data in self.graph.nodes(data=True)],
99
- "edges": [{"source": src, "target": dst, **data} for src, dst, data in self.graph.edges(data=True)],
100
- }
101
-
102
- def _file_node_id(self, filepath: str) -> str:
103
- return f"file:{Path(filepath).name}"
104
-
105
- def _add_node(self, node_id: str, **attrs) -> None:
106
- if not self.graph.has_node(node_id):
107
- self.graph.add_node(node_id, **attrs)
108
-
109
- def _add_edge(self, src: str, dst: str, **attrs) -> None:
110
- self.graph.add_edge(src, dst, **attrs)