codegraph-cli-ai 0.1.7__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/PKG-INFO +4 -1
  2. codegraph_cli_ai-0.1.9/codegraph/cli.py +279 -0
  3. codegraph_cli_ai-0.1.9/codegraph/graph/builder.py +190 -0
  4. codegraph_cli_ai-0.1.9/codegraph/parsers/database_parser.py +25 -0
  5. codegraph_cli_ai-0.1.9/codegraph/parsers/image_parser.py +39 -0
  6. codegraph_cli_ai-0.1.9/codegraph/parsers/multimodal_parser.py +86 -0
  7. codegraph_cli_ai-0.1.9/codegraph/parsers/pdf_parser.py +43 -0
  8. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph/parsers/python_parser.py +13 -1
  9. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/PKG-INFO +4 -1
  10. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/SOURCES.txt +4 -0
  11. codegraph_cli_ai-0.1.9/codegraph_cli_ai.egg-info/requires.txt +6 -0
  12. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/pyproject.toml +5 -2
  13. codegraph_cli_ai-0.1.7/codegraph/cli.py +0 -454
  14. codegraph_cli_ai-0.1.7/codegraph/graph/builder.py +0 -110
  15. codegraph_cli_ai-0.1.7/codegraph_cli_ai.egg-info/requires.txt +0 -3
  16. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/MANIFEST.in +0 -0
  17. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/README.md +0 -0
  18. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/dependency_links.txt +0 -0
  19. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/entry_points.txt +0 -0
  20. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/top_level.txt +0 -0
  21. {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codegraph-cli-ai
3
- Version: 0.1.7
3
+ Version: 0.1.9
4
4
  Summary: CLI tool to analyze codebases and visualize knowledge graphs using AST
5
5
  Author: Aditya Jogdand
6
6
  License: MIT
@@ -15,6 +15,9 @@ Description-Content-Type: text/markdown
15
15
  Requires-Dist: typer>=0.9.0
16
16
  Requires-Dist: networkx>=3.0
17
17
  Requires-Dist: pyvis>=0.3.2
18
+ Requires-Dist: pypdf>=3.0.0
19
+ Requires-Dist: pytesseract>=0.3.10
20
+ Requires-Dist: Pillow>=9.0.0
18
21
 
19
22
  # CodeGraph AI
20
23
 
@@ -0,0 +1,279 @@
1
+ """
2
+ CodeGraph AI - CLI Entry Point
3
+ """
4
+
5
+ import json
6
+ import typer
7
+ import webbrowser
8
+ import networkx as nx
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Optional, List, Set, Dict
12
+ from collections import deque
13
+ from codegraph.parsers.python_parser import PythonParser
14
+ from codegraph.parsers.multimodal_parser import MultiModalParser
15
+ from codegraph.graph.builder import GraphBuilder
16
+
17
+ app = typer.Typer()
18
+
19
+
20
@app.callback()
def main():
    """CodeGraph AI - Understand your codebase using graphs and AI."""
    # Intentionally empty: registering a callback makes Typer treat this app
    # as a multi-command group (subcommands: index, plot, ask).
    pass
24
+
25
+
26
+ # Dynamic Ignore Defaults
27
+ IGNORE_FILES = {"graph.json", ".DS_Store"}
28
+ IGNORE_EXTENSIONS = {".pyc", ".log", ".pyo", ".pyd"}
29
+
30
+
31
def is_virtualenv(path: Path) -> bool:
    """Return True when *path* looks like a virtual environment directory."""
    if not path.is_dir():
        return False
    # Characteristic files created by venv/virtualenv on POSIX and Windows.
    markers = (
        ("pyvenv.cfg",),
        ("bin", "python"),
        ("Scripts", "python.exe"),
        ("bin", "pip"),
    )
    return any(path.joinpath(*parts).exists() for parts in markers)
41
+
42
+
43
def should_ignore_dir(name: str, path: Path, root: Path) -> bool:
    """Decide whether a directory should be skipped while walking the tree."""
    # Hidden directories (except the walk root itself) and dunder dirs
    # such as __pycache__ are never worth indexing.
    hidden = name.startswith(".") and path != root
    dunder = name.startswith("__")
    # Common dependency / build-output directory names.
    noisy = {"node_modules", "bin", "Scripts", "lib", "obj", "target", "build", "dist"}
    # Same short-circuit order as before: cheap name checks first, then the
    # filesystem probe for virtualenv markers, then the name denylist.
    return hidden or dunder or is_virtualenv(path) or name in noisy
54
+
55
+
56
@app.command()
def index(
    path: str = typer.Argument(".", help="Path to the repo or folder to index")
):
    """Scan a directory, parse all Python files, and save the knowledge graph.

    Walks *path* (pruning virtualenvs, hidden and build directories), parses
    every ``.py`` file plus supported data/document assets, builds the
    knowledge graph, and writes it to ``<path>/.codegraph/graph.json``.
    """
    root = Path(path).resolve()

    if not root.exists():
        typer.echo(f"[error] Path does not exist: {root}", err=True)
        raise typer.Exit(code=1)

    typer.echo(f"Indexing: {root}\n")

    all_files = []

    for dirpath, dirnames, filenames in os.walk(root):
        dpath = Path(dirpath)
        # Prune in place so os.walk never descends into ignored directories.
        dirnames[:] = [d for d in dirnames if not should_ignore_dir(d, dpath / d, root)]

        for f in filenames:
            if f in IGNORE_FILES:
                continue
            p = dpath / f
            if p.suffix.lower() in IGNORE_EXTENSIONS:
                continue
            all_files.append(p)

    py_files = [f for f in all_files if f.suffix == ".py"]
    asset_exts = {".csv", ".json", ".db", ".sqlite", ".pdf", ".png", ".jpg", ".jpeg"}
    asset_files = [f for f in all_files if f.suffix.lower() in asset_exts]

    if not py_files and not asset_files:
        typer.echo("No supported files found (everything might be ignored).")
        raise typer.Exit()

    typer.echo(f"Found {len(py_files)} Python file(s) and {len(asset_files)} asset(s)\n")

    py_parser = PythonParser()
    mm_parser = MultiModalParser()
    parsed_files = []
    parsed_assets = []
    failed_files = []

    for filepath in py_files:
        result = py_parser.parse_file(str(filepath))
        if result.errors:
            failed_files.append((str(filepath), result.errors))
        else:
            typer.echo(f" ✔ [code] {filepath.relative_to(root)}")
            parsed_files.append(result)

    for filepath in asset_files:
        try:
            asset = mm_parser.parse(str(filepath))
            typer.echo(f" ✔ [asset] {filepath.relative_to(root)}")
            parsed_assets.append(asset)
        except Exception as e:
            failed_files.append((str(filepath), [str(e)]))

    # BUG FIX: failures were collected but never shown, so broken files were
    # silently dropped from the graph. Surface them before building.
    if failed_files:
        typer.echo("\nSkipped (parse errors):", err=True)
        for fname, errors in failed_files:
            typer.echo(f" ✘ {fname}: {'; '.join(errors)}", err=True)

    typer.echo("\nBuilding graph...")
    builder = GraphBuilder()
    builder.build(parsed_files, parsed_assets)
    summary = builder.summary()

    output_dir = root / ".codegraph"
    output_dir.mkdir(exist_ok=True)
    output_file = output_dir / "graph.json"
    with output_file.open("w", encoding="utf-8") as fp:
        json.dump(builder.to_dict(), fp, indent=2)

    typer.echo("\n" + "=" * 50)
    typer.echo("Index complete")
    typer.echo(f" Graph saved : {output_file}")
    typer.echo(f" Graph nodes : {summary['total_nodes']}")
    typer.echo(f" Graph edges : {summary['total_edges']}")
131
+
132
+
133
@app.command()
def plot(
    hide_external: bool = typer.Option(False, "--hide-external", help="Hide external/stdlib nodes"),
    level: Optional[str] = typer.Option(None, "--level", help="Show only: file, function, class, method"),
    focus: Optional[str] = typer.Option(None, "--focus", help="Focus on a specific file"),
    edge_type: Optional[str] = typer.Option(None, "--edge-type", help="Filter edges"),
):
    """Visualize the knowledge graph as a premium interactive HTML file.

    Loads ``.codegraph/graph.json`` from the current directory, applies the
    requested filters (external nodes, node kinds, focus file, edge types),
    renders an interactive vis-network HTML page, and opens it in a browser.
    """
    root = Path(".").resolve()
    graph_file = root / ".codegraph" / "graph.json"

    if not graph_file.exists():
        typer.echo("[error] No graph found. Run 'codegraph index' first.", err=True)
        raise typer.Exit(code=1)

    typer.echo(f"Loading graph from {graph_file}...")
    with graph_file.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # Rebuild the directed graph from the serialized node/edge lists.
    G_orig = nx.DiGraph()
    for node in data.get("nodes", []):
        G_orig.add_node(node["id"], **{k: v for k, v in node.items() if k != "id"})
    for edge in data.get("edges", []):
        G_orig.add_edge(edge["source"], edge["target"], **{k: v for k, v in edge.items() if k not in ["source", "target"]})

    # Work on a copy so filters never mutate the loaded graph.
    G = G_orig.copy()

    if hide_external:
        remove = [n for n, d in G.nodes(data=True) if d.get("external", False)]
        G.remove_nodes_from(remove)

    if level:
        # Friendly level names map onto one or more node "kind" values.
        level_map = {
            "file": {"file", "dataset", "database", "document", "image"},
            "function": {"function", "method"},
            "class": {"class"}
        }
        requested_levels = level.split(",")
        allowed_kinds = set()
        for r in requested_levels:
            if r in level_map:
                allowed_kinds.update(level_map[r])
            else:
                # Unknown names are treated as raw node kinds.
                allowed_kinds.add(r)

        # For coarse views, contract paths between kept nodes instead of just
        # dropping intermediates, so file-level connectivity is preserved.
        if ("file" in requested_levels or "document" in allowed_kinds) and len(requested_levels) < 3:
            G = _collapse_to_level(G, allowed_kinds)
        else:
            remove = [n for n, d in G.nodes(data=True) if d.get("kind") not in allowed_kinds]
            G.remove_nodes_from(remove)

    if focus:
        # Keep only the focused file node and its direct neighbours.
        focus_id = f"file:{focus}"
        if focus_id in G:
            keep = {focus_id} | set(G.successors(focus_id)) | set(G.predecessors(focus_id))
            remove = [n for n in G.nodes if n not in keep]
            G.remove_nodes_from(remove)

    if edge_type and edge_type != "all":
        allowed_edges = set(edge_type.split(","))
        remove_edges = [(s, t) for s, t, d in G.edges(data=True) if d.get("relation") not in allowed_edges]
        G.remove_edges_from(remove_edges)
        # Drop nodes left without any edges after the relation filter.
        G.remove_nodes_from(list(nx.isolates(G)))

    typer.echo(f"Rendering {G.number_of_nodes()} nodes, {G.number_of_edges()} edges...")

    html = _build_premium_html(G)
    output_path = root / "graph.html"
    output_path.write_text(html, encoding="utf-8")
    typer.echo(f"Saved to: {output_path}")
    webbrowser.open(f"file://{output_path.resolve()}")
204
+
205
+
206
def _collapse_to_level(G: nx.DiGraph, allowed_kinds: set) -> nx.DiGraph:
    """Contract the graph so only nodes whose kind is in *allowed_kinds* remain.

    Two kept nodes become directly connected when the source can reach the
    target through intermediate (non-kept) nodes within a depth-limited BFS;
    the contracted edge carries the relation of the final hop.
    """
    MAX_DEPTH = 5  # hop budget per source node

    collapsed = nx.DiGraph()
    kept = [node for node, attrs in G.nodes(data=True) if attrs.get("kind") in allowed_kinds]

    # Seed the result with every kept node and its attributes.
    for node in kept:
        collapsed.add_node(node, **G.nodes[node])

    # Depth-limited BFS from each kept node through the non-kept ones.
    for source in kept:
        frontier = deque([(source, 0)])
        seen = {source}

        while frontier:
            node, dist = frontier.popleft()
            if dist >= MAX_DEPTH:
                continue

            for nxt in G.successors(node):
                if nxt in seen:
                    continue
                seen.add(nxt)

                if nxt in collapsed:
                    # Reached another kept node: record one contracted edge
                    # (labelled with the last hop's relation) and stop
                    # expanding along this branch.
                    hop = G.get_edge_data(node, nxt)
                    collapsed.add_edge(source, nxt, relation=hop.get("relation", "calls"))
                else:
                    # Pass through a non-kept node, spending one hop.
                    frontier.append((nxt, dist + 1))

    return collapsed
243
+
244
+
245
def _build_premium_html(G: nx.DiGraph) -> str:
    """Generate a self-contained premium HTML visualization.

    Serializes nodes/edges as vis-network JS object literals and embeds them
    in a dark-themed page that loads vis-network from a CDN.
    """
    # json.dumps on each value gives correctly escaped JS string literals.
    nodes_js = [
        f"{{id: {json.dumps(node)}, label: {json.dumps(attrs.get('label', node))}}}"
        for node, attrs in G.nodes(data=True)
    ]
    edges_js = [
        f"{{from: {json.dumps(src)}, to: {json.dumps(dst)}, label: {json.dumps(attrs.get('relation', ''))}, arrows: 'to'}}"
        for src, dst, attrs in G.edges(data=True)
    ]

    nodes_str = ','.join(nodes_js)
    edges_str = ','.join(edges_js)

    return f"""<!DOCTYPE html>
<html>
<head>
<script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
<style>body{{background:#0F1117;color:#fff;margin:0;overflow:hidden;}}#graph{{height:100vh;}}</style>
</head>
<body>
<div id="graph"></div>
<script>
const nodes = new vis.DataSet([{nodes_str}]);
const edges = new vis.DataSet([{edges_str}]);
new vis.Network(document.getElementById('graph'), {{nodes, edges}}, {{physics:{{solver:'forceAtlas2Based'}} }});
</script>
</body>
</html>"""
272
+
273
@app.command()
def ask(query: str = typer.Argument(..., help="Question about your codebase")):
    """Answer a natural-language question about the indexed codebase.

    Placeholder: AI-powered Q&A over the graph is planned but not built yet.
    """
    typer.echo("ask: coming soon")
276
+
277
+
278
if __name__ == "__main__":
    # Allow running this module directly, e.g. `python -m codegraph.cli`.
    app()
@@ -0,0 +1,190 @@
1
+ """
2
+ Graph Builder for CodeGraph AI
3
+
4
+ Node types:
5
+ - file : a .py file
6
+ - function : top-level function
7
+ - class : a class
8
+ - method : a method belonging to a class
9
+ - module : an imported module/package
10
+
11
+ Edge types:
12
+ - contains : file → function, file → class, class → method
13
+ - calls : function/method → function/method
14
+ - imports : file → module
15
+ - defined_in : function/method → file
16
+ """
17
+
18
+ import builtins
19
+ import networkx as nx
20
+ from pathlib import Path
21
+ from codegraph.parsers.python_parser import ParsedFile
22
+ from codegraph.parsers.multimodal_parser import ParsedAsset
23
+
24
+
25
+ BUILTIN_FUNCTIONS = set(dir(builtins))
26
+
27
+
28
class GraphBuilder:
    """Builds a networkx knowledge graph from parsed Python files and assets.

    Nodes carry ``kind``/``label``/``external`` attributes; edges carry a
    ``relation`` attribute (contains/calls/imports/defined_in/uses/references).
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        # Maps a bare function/method name to the file node that defines it;
        # used to resolve cross-file call targets in add_file().
        self._function_to_file: dict[str, str] = {}
        self._assets: list[ParsedAsset] = []
        self._parsed_files: list[ParsedFile] = []

    def add_file(self, parsed: ParsedFile) -> None:
        """Add one parsed Python file: classes, functions, methods, imports, calls."""
        self._parsed_files.append(parsed)
        file_id = self._file_node_id(parsed.filepath)
        filename = Path(parsed.filepath).name

        self._add_node(file_id, kind="file", label=filename, external=False)

        for cls in parsed.classes:
            cls_id = f"class:{file_id}:{cls}"
            self._add_node(cls_id, kind="class", label=cls, external=False, file=file_id)
            self._add_edge(file_id, cls_id, relation="contains")

        for func in parsed.functions:
            func_id = f"func:{file_id}:{func}"
            self._add_node(func_id, kind="function", label=func, external=False, file=file_id)
            self._add_edge(file_id, func_id, relation="contains")
            self._add_edge(func_id, file_id, relation="defined_in")
            self._function_to_file[func] = file_id

        for cls_name, method_name in parsed.methods:
            cls_id = f"class:{file_id}:{cls_name}"
            # NOTE: method ids share the "func:" namespace, so a method and a
            # top-level function with the same name in one file collide.
            method_id = f"func:{file_id}:{method_name}"
            self._add_node(method_id, kind="method", label=method_name, external=False, file=file_id, cls=cls_name)
            self._add_edge(cls_id, method_id, relation="contains")
            self._add_edge(method_id, file_id, relation="defined_in")
            self._function_to_file[method_name] = file_id

        for imp in parsed.imports:
            mod_id = f"module:{imp}"
            self._add_node(mod_id, kind="module", label=imp, external=True)
            self._add_edge(file_id, mod_id, relation="imports")

        for caller, callee in parsed.calls:
            # Skip builtin calls (print, len, ...) to keep the graph focused.
            if callee in BUILTIN_FUNCTIONS:
                continue

            # Use file context for caller
            caller_id = f"func:{file_id}:{caller}"

            # Resolve callee ID using global map or default to external
            target_file_id = self._function_to_file.get(callee)
            if target_file_id:
                callee_id = f"func:{target_file_id}:{callee}"
            else:
                callee_id = f"func:external:{callee}"
                if not self.graph.has_node(callee_id):
                    self._add_node(callee_id, kind="function", label=callee, external=True)

            self._add_edge(caller_id, callee_id, relation="calls")

    def add_asset(self, asset: ParsedAsset) -> None:
        """Add one non-code asset (dataset/database/document/image) as a node."""
        self._assets.append(asset)
        filename = Path(asset.filepath).name
        # BUG FIX: the node id previously used a hard-coded "(unknown)"
        # placeholder instead of the filename, so all assets of one kind
        # collapsed into a single node and never matched the
        # "{kind}:{filename}" ids that link_code_to_assets() targets.
        node_id = f"{asset.kind}:{filename}"

        self._add_node(
            node_id,
            kind=asset.kind,
            label=filename,
            filename=filename,  # keep raw filename for linking
            external=False,
            metadata=asset.metadata
        )

    def link_code_to_assets(self) -> None:
        """
        Connect code nodes to assets if the filename appears in:
        - function name
        - call list
        - string usage
        """
        for node_id, data in list(self.graph.nodes(data=True)):
            if data.get("kind") not in ("function", "method"):
                continue

            # Get function metadata from parsed files
            func_name = data.get("label")
            file_id = data.get("file")

            # Find the parsed file this function belongs to
            parsed = next((p for p in self._parsed_files if self._file_node_id(p.filepath) == file_id), None)
            if not parsed:
                continue

            # Check each asset
            for asset in self._assets:
                asset_filename = Path(asset.filepath).name
                asset_id = f"{asset.kind}:{asset_filename}"
                relation = "uses" if asset.kind in ("dataset", "database") else "references"

                # Check 1: function name mentions the asset
                if asset_filename in func_name:
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

                # Check 2: call list for this function mentions the asset
                calls = [c[1] for c in parsed.calls if c[0] == func_name]
                if any(asset_filename in callee for callee in calls):
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

                # Check 3: string literals inside the function mention the asset
                strings = parsed.strings.get(func_name, [])
                if any(asset_filename in s for s in strings):
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

    def build(self, parsed_files: list[ParsedFile], assets: list[ParsedAsset] = None) -> nx.DiGraph:
        """Build the full graph: code first (so calls resolve), then assets, then links."""
        # First pass: add all files to populate the function-to-file map
        for parsed in parsed_files:
            if not parsed.errors:
                self.add_file(parsed)

        # Second pass: add assets
        if assets:
            for asset in assets:
                self.add_asset(asset)

        # Third pass: link code to assets
        self.link_code_to_assets()

        return self.graph

    def summary(self) -> dict:
        """Return node/edge totals plus per-kind and per-relation counts."""
        nodes_by_kind = {}
        for _, data in self.graph.nodes(data=True):
            kind = data.get("kind", "unknown")
            nodes_by_kind[kind] = nodes_by_kind.get(kind, 0) + 1

        edges_by_relation = {}
        for _, _, data in self.graph.edges(data=True):
            rel = data.get("relation", "unknown")
            edges_by_relation[rel] = edges_by_relation.get(rel, 0) + 1

        return {
            "total_nodes": self.graph.number_of_nodes(),
            "total_edges": self.graph.number_of_edges(),
            "nodes_by_kind": nodes_by_kind,
            "edges_by_relation": edges_by_relation,
        }

    def to_dict(self) -> dict:
        """Serialize the graph to a JSON-friendly {"nodes": [...], "edges": [...]} dict."""
        return {
            "nodes": [{"id": node, **data} for node, data in self.graph.nodes(data=True)],
            "edges": [{"source": src, "target": dst, **data} for src, dst, data in self.graph.edges(data=True)],
        }

    def _file_node_id(self, filepath: str) -> str:
        # Files are identified by basename only; two files with the same name
        # in different directories would collide.
        return f"file:{Path(filepath).name}"

    def _add_node(self, node_id: str, **attrs) -> None:
        # First writer wins: never overwrite an existing node's attributes.
        if not self.graph.has_node(node_id):
            self.graph.add_node(node_id, **attrs)

    def _add_edge(self, src: str, dst: str, **attrs) -> None:
        # networkx auto-creates missing endpoints; callers rely on that.
        self.graph.add_edge(src, dst, **attrs)
@@ -0,0 +1,25 @@
1
+ """
2
+ Database Parser for CodeGraph AI
3
+ Extracts metadata from SQLite databases.
4
+ """
5
+ import sqlite3
6
+ from pathlib import Path
7
+
8
class DatabaseParser:
    """
    Parses SQLite files to extract table names.
    """

    def parse(self, filepath: str) -> dict:
        """Return {"tables": [...]} for the SQLite database at *filepath*.

        On any failure the metadata gains an "error" key; the connection is
        always closed (the original leaked it when the query raised).
        """
        metadata = {"tables": []}
        try:
            conn = sqlite3.connect(filepath)
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
                metadata["tables"] = [row[0] for row in cursor.fetchall()]
            finally:
                # BUG FIX: close even when the query fails (e.g. the file is
                # not a valid SQLite database) so the handle is not leaked.
                conn.close()
        except Exception as e:
            metadata["error"] = str(e)

        return metadata
@@ -0,0 +1,39 @@
1
+ """
2
+ Image Parser for CodeGraph AI
3
+ Extracts text from images using pytesseract (OCR).
4
+ """
5
+ import pytesseract
6
+ from PIL import Image
7
+ from pathlib import Path
8
+
9
class ImageParser:
    """
    Parses Image files to extract text via OCR.
    """

    def parse(self, filepath: str) -> dict:
        """
        Attempts OCR on the image. Falls back gracefully if OCR is unavailable.

        Returns a dict with a "text" key (possibly empty, capped at 500
        chars) and, on failure, an "error" key describing what went wrong.
        """
        metadata = {
            "text": ""
        }

        try:
            # BUG FIX: Image.open is lazy and keeps the file handle open;
            # use a context manager so the handle is released after OCR.
            with Image.open(filepath) as img:
                ocr_text = pytesseract.image_to_string(img)

            # Cap the extracted text so graph tooltips stay small.
            if len(ocr_text) > 500:
                ocr_text = ocr_text[:500] + "..."

            metadata["text"] = ocr_text.strip()

        except Exception as e:
            # Missing tesseract binary, unreadable image, etc. — degrade
            # gracefully instead of failing the whole indexing run.
            metadata["error"] = f"OCR failed or not available: {str(e)}"

        return metadata
@@ -0,0 +1,86 @@
1
+ """
2
+ MultiModal Parser for CodeGraph AI
3
+ Extracts metadata from non-code assets like CSV, JSON, SQLite, PDF, and Images.
4
+ """
5
+ import json
6
+ import csv
7
+ import sqlite3
8
+ from pathlib import Path
9
+ from dataclasses import dataclass, field
10
+
11
+ from codegraph.parsers.pdf_parser import PDFParser
12
+ from codegraph.parsers.image_parser import ImageParser
13
+ from codegraph.parsers.database_parser import DatabaseParser
14
+
15
+
16
@dataclass
class ParsedAsset:
    """Metadata extracted from a single non-code asset file."""
    # Path to the asset on disk, as passed to the parser.
    filepath: str
    kind: str  # "dataset" | "database" | "document" | "image"
    # Parser-specific details (e.g. CSV columns, DB tables, OCR text).
    metadata: dict = field(default_factory=dict)
21
+
22
+
23
class MultiModalParser:
    """
    Parses non-Python files (CSV, JSON, SQLite, PDF, Image) to extract metadata.
    """

    def __init__(self):
        self.pdf_parser = PDFParser()
        self.image_parser = ImageParser()
        self.database_parser = DatabaseParser()

    def parse(self, filepath: str) -> ParsedAsset:
        """Dispatch to the right sub-parser based on the file extension."""
        path = Path(filepath)
        handlers = {
            ".csv": self._parse_csv,
            ".json": self._parse_json,
            ".db": self._parse_sqlite,
            ".sqlite": self._parse_sqlite,
            ".pdf": self._parse_pdf,
            ".png": self._parse_image,
            ".jpg": self._parse_image,
            ".jpeg": self._parse_image,
        }
        handler = handlers.get(path.suffix.lower())
        if handler is None:
            # Unrecognized extension: still return an asset so callers can
            # decide what to do with it.
            return ParsedAsset(filepath=str(path), kind="unknown")
        return handler(path)

    def _parse_csv(self, path: Path) -> ParsedAsset:
        """Read only the header row to record the dataset's column names."""
        metadata = {"columns": []}
        try:
            with path.open("r", encoding="utf-8") as fh:
                metadata["columns"] = next(csv.reader(fh), [])
        except Exception as exc:
            metadata["error"] = str(exc)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=metadata)

    def _parse_json(self, path: Path) -> ParsedAsset:
        """Record the top-level keys (or the first object's keys for a list)."""
        metadata = {"keys": []}
        try:
            with path.open("r", encoding="utf-8") as fh:
                payload = json.load(fh)
                if isinstance(payload, dict):
                    metadata["keys"] = list(payload.keys())
                elif isinstance(payload, list) and len(payload) > 0 and isinstance(payload[0], dict):
                    metadata["keys"] = list(payload[0].keys())
        except Exception as exc:
            metadata["error"] = str(exc)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=metadata)

    def _parse_sqlite(self, path: Path) -> ParsedAsset:
        """Delegate to DatabaseParser for table extraction."""
        return ParsedAsset(filepath=str(path), kind="database",
                           metadata=self.database_parser.parse(str(path)))

    def _parse_pdf(self, path: Path) -> ParsedAsset:
        """Delegate to PDFParser for page count and text preview."""
        return ParsedAsset(filepath=str(path), kind="document",
                           metadata=self.pdf_parser.parse(str(path)))

    def _parse_image(self, path: Path) -> ParsedAsset:
        """Delegate to ImageParser for OCR text extraction."""
        return ParsedAsset(filepath=str(path), kind="image",
                           metadata=self.image_parser.parse(str(path)))
@@ -0,0 +1,43 @@
1
+ """
2
+ PDF Parser for CodeGraph AI
3
+ Extracts text and metadata from PDF files using pypdf.
4
+ """
5
+ from pathlib import Path
6
+ from pypdf import PdfReader
7
+
8
class PDFParser:
    """
    Parses PDF files to extract basic text preview and page count.
    """

    def parse(self, filepath: str) -> dict:
        """
        Extracts metadata from the first few pages of a PDF.

        Returns {"num_pages": int, "text_preview": str}; on failure the dict
        also gains an "error" key and the other fields keep their defaults.
        """
        metadata = {
            "num_pages": 0,
            "text_preview": ""
        }

        try:
            reader = PdfReader(filepath)
            pages = reader.pages
            metadata["num_pages"] = len(pages)

            # Preview is built from at most the first two pages.
            snippets = []
            for idx in range(min(2, len(pages))):
                text = pages[idx].extract_text()
                if text:
                    snippets.append(text.strip())

            # Cap the preview length to avoid huge tooltips.
            preview = "\n---\n".join(snippets)
            if len(preview) > 500:
                preview = preview[:500] + "..."

            metadata["text_preview"] = preview

        except Exception as exc:
            metadata["error"] = str(exc)

        return metadata