codegraphy 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repolens/db/store.py ADDED
@@ -0,0 +1,162 @@
1
+ import json
2
+ import sqlite3
3
+ from urllib.parse import urlparse
4
+ from contextlib import contextmanager
5
+
6
+ from .schema import get_schema
7
+
8
+ try:
9
+ import psycopg2
10
+ from psycopg2.extras import Json
11
+ except ImportError:
12
+ psycopg2 = None
13
+
14
class Store:
    """Persistence layer for indexed files, symbols and edges.

    PostgreSQL is used when ``db_url`` starts with ``"postgres"``; any other
    URL is treated as a SQLite database (``sqlite:///path``). Every public
    method opens its own short-lived connection through ``get_connection``.
    """

    def __init__(self, db_url: str):
        """Record how to connect to ``db_url``; no connection is opened here.

        Raises:
            ImportError: a postgres URL was given but ``psycopg2`` could not
                be imported.
        """
        self.db_url = db_url
        self.is_postgres = db_url.startswith("postgres")

        if self.is_postgres and psycopg2 is None:
            raise ImportError("psycopg2-binary is required for postgres support")

        if self.is_postgres:
            parsed = urlparse(db_url)
            candidate_kwargs = {
                'dbname': parsed.path[1:],  # drop the leading "/" of the URL path
                'user': parsed.username,
                'password': parsed.password,
                'host': parsed.hostname,
                'port': parsed.port,
            }
            # Omit components missing from the URL so psycopg2 uses its defaults.
            self.conn_kwargs = {
                k: v for k, v in candidate_kwargs.items() if v is not None
            }
        else:
            # Handle sqlite:///path; an empty path falls back to a default file.
            self.db_path = db_url.replace("sqlite:///", "") or "codegraphy.db"

    @contextmanager
    def get_connection(self):
        """Yield a new connection; commit on success, roll back on error.

        The connection is always closed when the ``with`` block exits.
        """
        if self.is_postgres:
            conn = psycopg2.connect(**self.conn_kwargs)
        else:
            conn = sqlite3.connect(self.db_path)

        try:
            if not self.is_postgres:
                # Issued inside the try so the connection is still closed if
                # the pragma itself fails. SQLite needs this per connection
                # for foreign-key actions to be enforced.
                conn.execute("PRAGMA foreign_keys = ON")
            yield conn
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()

    def init_schema(self):
        """Execute the schema SQL returned by ``get_schema`` for this URL."""
        schema_sql = get_schema(self.db_url)
        with self.get_connection() as conn:
            cursor = conn.cursor()
            if self.is_postgres:
                cursor.execute(schema_sql)
            else:
                # sqlite3's execute() accepts one statement only; the schema
                # may contain several.
                cursor.executescript(schema_sql)

    def get_file_hash(self, file_path: str) -> "str | None":
        """Return the stored ``git_hash`` for ``file_path``, or ``None`` if absent."""
        p = "%s" if self.is_postgres else "?"
        with self.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                f"SELECT git_hash FROM cg_files WHERE file_path = {p}",
                (file_path,),
            )
            row = cursor.fetchone()
            return row[0] if row else None

    def upsert_file(self, file_path: str, git_hash: str, symbols: list, edges: list):
        """Replace the stored symbols and edges for one file.

        Args:
            file_path: key of the ``cg_files`` row to insert or update.
            git_hash: content hash stored alongside the file.
            symbols: objects exposing ``name``, ``qualified_name``, ``kind``,
                ``file_path``, ``line_start``, ``line_end``, ``summary``,
                ``raw_signature`` and ``extra``.
            edges: objects exposing ``from_qualified``, ``to_qualified`` and
                ``relation``.

        An edge is stored only when both of its qualified names resolve to a
        row in ``cg_symbols``; unresolved edges are dropped silently.
        Everything runs in a single transaction.
        """
        # Parameter placeholder for the active driver.
        p = "%s" if self.is_postgres else "?"

        with self.get_connection() as conn:
            cursor = conn.cursor()

            # Upsert the file row.
            if self.is_postgres:
                cursor.execute(f"""
                    INSERT INTO cg_files (file_path, git_hash, symbol_count, last_indexed)
                    VALUES ({p}, {p}, {p}, NOW())
                    ON CONFLICT (file_path) DO UPDATE
                    SET git_hash = EXCLUDED.git_hash, symbol_count = EXCLUDED.symbol_count, last_indexed = NOW()
                """, (file_path, git_hash, len(symbols)))
            else:
                cursor.execute(f"""
                    INSERT INTO cg_files (file_path, git_hash, symbol_count, last_indexed)
                    VALUES ({p}, {p}, {p}, CURRENT_TIMESTAMP)
                    ON CONFLICT(file_path) DO UPDATE
                    SET git_hash=excluded.git_hash, symbol_count=excluded.symbol_count, last_indexed=CURRENT_TIMESTAMP
                """, (file_path, git_hash, len(symbols)))

            # Delete old symbols. Their edges are expected to go with them
            # through the schema's foreign keys (schema not visible here --
            # confirm it declares ON DELETE CASCADE).
            cursor.execute(f"DELETE FROM cg_symbols WHERE file_path = {p}", (file_path,))

            # Insert new symbols.
            if symbols:
                symbol_records = [
                    (
                        s.name, s.qualified_name, s.kind, s.file_path,
                        s.line_start, s.line_end, s.summary, s.raw_signature,
                        # psycopg2 adapts JSON via its wrapper; SQLite stores text.
                        Json(s.extra) if self.is_postgres else json.dumps(s.extra),
                    )
                    for s in symbols
                ]
                cursor.executemany(f"""
                    INSERT INTO cg_symbols (name, qualified_name, kind, file_path, line_start, line_end, summary, raw_signature, extra)
                    VALUES ({p}, {p}, {p}, {p}, {p}, {p}, {p}, {p}, {p})
                """, symbol_records)

            if not edges:
                return

            # Edges reference symbols by row id, so map qualified_name -> id
            # for every name any edge mentions.
            quals_list = list({
                qual
                for e in edges
                for qual in (e.from_qualified, e.to_qualified)
            })
            qual_to_id = {}

            # Query in batches to stay below SQLite's bound-variable limit.
            batch_size = 500
            for i in range(0, len(quals_list), batch_size):
                batch = quals_list[i:i + batch_size]
                placeholders = ",".join([p] * len(batch))
                cursor.execute(
                    f"SELECT id, qualified_name FROM cg_symbols WHERE qualified_name IN ({placeholders})",
                    tuple(batch),
                )
                for row in cursor.fetchall():
                    qual_to_id[row[1]] = row[0]

            edge_records = []
            for e in edges:
                from_id = qual_to_id.get(e.from_qualified)
                to_id = qual_to_id.get(e.to_qualified)
                # Compare with None: a row id of 0 is a valid id but falsy.
                if from_id is not None and to_id is not None:
                    edge_records.append((from_id, to_id, e.relation))

            if edge_records:
                cursor.executemany(f"""
                    INSERT INTO cg_edges (from_id, to_id, relation)
                    VALUES ({p}, {p}, {p})
                    ON CONFLICT DO NOTHING
                """, edge_records)
@@ -0,0 +1,5 @@
1
+ from .base import BaseIndexer, Symbol, Edge
2
+ from .python import PythonIndexer
3
+ from .walker import index_path
4
+
5
+ __all__ = ["BaseIndexer", "Symbol", "Edge", "PythonIndexer", "index_path"]
@@ -0,0 +1,27 @@
1
+ from dataclasses import dataclass, field
2
+
3
@dataclass
class Symbol:
    """A named entity discovered in a source file."""

    # Required, positional fields.
    name: str            # short name as written in the source
    qualified_name: str  # dotted name used to identify the symbol
    kind: str            # category label chosen by the indexer
    file_path: str       # file the symbol was found in
    line_start: int      # first line of the definition
    line_end: int        # last line of the definition

    # Optional fields.
    summary: str = ""
    raw_signature: str = ""
    # Free-form metadata; a new dict is created for every instance.
    extra: dict = field(default_factory=dict)
14
+
15
@dataclass
class Edge:
    """A directed, labelled relationship between two qualified names."""

    from_qualified: str  # qualified name the relationship starts at
    to_qualified: str    # qualified name the relationship points to
    relation: str        # label describing the relationship
20
+
21
class BaseIndexer:
    """Interface for language indexers; both methods must be overridden."""

    def can_handle(self, file_path: str) -> bool:
        """Tell whether this indexer is able to process ``file_path``."""
        raise NotImplementedError()

    def index_file(self, file_path: str, source: str) -> tuple[list[Symbol], list[Edge]]:
        """Returns symbols and edges found in the file."""
        raise NotImplementedError()
@@ -0,0 +1,177 @@
1
+ import ast
2
+ import os
3
+ from .base import BaseIndexer, Symbol, Edge
4
+
5
+ def _get_module_path(file_path: str) -> str:
6
+ # Basic conversion from file path to dotted module path
7
+ # e.g., src/foo/bar.py -> src.foo.bar
8
+ # This is a simple approximation.
9
+ base = os.path.splitext(file_path)[0]
10
+ return base.replace(os.sep, '.')
11
+
12
class PythonIndexer(BaseIndexer):
    """Indexer that extracts symbols and edges from Python source via ``ast``."""

    def can_handle(self, file_path: str) -> bool:
        """Return True for paths ending in ``.py``."""
        return file_path.endswith('.py')

    def index_file(self, file_path: str, source: str) -> tuple[list[Symbol], list[Edge]]:
        """Parse ``source`` and return the symbols and edges found in it.

        Symbols are emitted for classes, functions/methods and imports, plus
        one ``"file"`` symbol for the module itself. Edges are emitted for
        base classes (``"inherits"``), imports (``"imports"``) and calls made
        inside a class or function (``"calls"``). Edge targets are the names
        as written in the source; they are not resolved.

        Returns ``([], [])`` when the source cannot be parsed.
        """
        try:
            tree = ast.parse(source, filename=file_path)
        except (SyntaxError, ValueError):
            # ValueError: older interpreters raise it (not SyntaxError) when
            # the source contains NUL bytes.
            return [], []

        module_path = _get_module_path(file_path)

        symbols = []
        edges = []

        def first_doc_line(node):
            """First line of the node's docstring, or "" when it has none."""
            doc = ast.get_docstring(node) or ""
            return doc.splitlines()[0] if doc else ""

        class Visitor(ast.NodeVisitor):
            """Collects symbols and edges into the enclosing lists."""

            def __init__(self):
                # Names of the enclosing classes/functions, outermost first.
                self.current_scope = []
                # Name of the class whose body is being visited directly;
                # None at module level and inside function bodies.
                self.current_class = None

            def _get_dotted_name(self, node):
                """Render a Name/Attribute chain as ``a.b.c``; None otherwise."""
                if isinstance(node, ast.Name):
                    return node.id
                elif isinstance(node, ast.Attribute):
                    val = self._get_dotted_name(node.value)
                    if val:
                        return f"{val}.{node.attr}"
                return None

            def get_qualname(self, name):
                """Prefix ``name`` with the module path and the current scope."""
                if not self.current_scope:
                    return f"{module_path}.{name}" if module_path else name
                return f"{module_path}.{'.'.join(self.current_scope)}.{name}"

            def visit_ClassDef(self, node):
                qualname = self.get_qualname(node.name)

                symbols.append(Symbol(
                    name=node.name,
                    qualified_name=qualname,
                    kind="class",
                    file_path=file_path,
                    line_start=node.lineno,
                    line_end=node.end_lineno,
                    summary=first_doc_line(node),
                ))

                # Inherits edges
                for base in node.bases:
                    target = self._get_dotted_name(base)
                    if target:
                        edges.append(Edge(
                            from_qualified=qualname,
                            to_qualified=target,  # Approximated base name
                            relation="inherits",
                        ))

                self.current_scope.append(node.name)
                prev_class = self.current_class
                self.current_class = node.name
                self.generic_visit(node)
                self.current_class = prev_class
                self.current_scope.pop()

            def visit_FunctionDef(self, node):
                self._visit_func(node)

            def visit_AsyncFunctionDef(self, node):
                self._visit_func(node)

            def _visit_func(self, node):
                """Record a function or method, then visit its body."""
                qualname = self.get_qualname(node.name)
                kind = "method" if self.current_class else "function"

                symbols.append(Symbol(
                    name=node.name,
                    qualified_name=qualname,
                    kind=kind,
                    file_path=file_path,
                    line_start=node.lineno,
                    line_end=node.end_lineno,
                    summary=first_doc_line(node),
                ))

                self.current_scope.append(node.name)
                # Functions nested in this body are plain functions, even
                # when this function is itself a method.
                prev_class = self.current_class
                self.current_class = None
                self.generic_visit(node)
                self.current_class = prev_class
                self.current_scope.pop()

            def visit_Import(self, node):
                for alias in node.names:
                    # e.g., import os
                    bound_name = alias.asname or alias.name
                    symbols.append(Symbol(
                        name=bound_name,
                        qualified_name=self.get_qualname(bound_name),
                        kind="import",
                        file_path=file_path,
                        line_start=node.lineno,
                        line_end=node.end_lineno,
                        extra={"module": alias.name},
                    ))

                    edges.append(Edge(
                        from_qualified=module_path,
                        to_qualified=alias.name,
                        relation="imports",
                    ))

            def visit_ImportFrom(self, node):
                # ``from . import x`` has no module name and is skipped.
                if not node.module:
                    return
                for alias in node.names:
                    name = alias.asname or alias.name
                    symbols.append(Symbol(
                        name=name,
                        qualified_name=self.get_qualname(name),
                        kind="import",
                        file_path=file_path,
                        line_start=node.lineno,
                        line_end=node.end_lineno,
                        extra={"module": node.module, "original_name": alias.name},
                    ))

                    edges.append(Edge(
                        from_qualified=module_path,
                        to_qualified=f"{node.module}.{alias.name}",
                        relation="imports",
                    ))

            def visit_Call(self, node):
                # Extract simple calls; module-level calls are not recorded.
                target = self._get_dotted_name(node.func)

                if target and self.current_scope:
                    caller = self.get_qualname("")[:-1]  # strip trailing dot
                    edges.append(Edge(
                        from_qualified=caller,
                        to_qualified=target,  # unresolved; name as written
                        relation="calls",
                    ))

                self.generic_visit(node)

        visitor = Visitor()
        visitor.visit(tree)

        # Add a symbol for the file itself
        symbols.append(Symbol(
            name=os.path.basename(file_path),
            qualified_name=module_path,
            kind="file",
            file_path=file_path,
            line_start=1,
            line_end=len(source.splitlines()),
            summary="",
        ))

        return symbols, edges
@@ -0,0 +1,77 @@
1
+ import os
2
+ import hashlib
3
+ import subprocess
4
+ from .python import PythonIndexer
5
+ from ..db.store import Store
6
+
7
+ INDEXERS = [PythonIndexer()]
8
+
9
def sha256(content: bytes) -> str:
    """Return the SHA-256 digest of ``content`` as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(content)
    return hasher.hexdigest()
11
+
12
def get_files_to_index(root: str, exclude: list[str]) -> list[str]:
    """List the files under ``root`` that should be considered for indexing.

    ``git ls-files`` is used when it succeeds in ``root``; otherwise the tree
    is walked with ``os.walk``, skipping directories that start with ``.`` or
    whose name is listed in ``exclude``.

    Args:
        root: directory to scan.
        exclude: substrings; a file is dropped when its path *relative to
            root* contains any of them. ``None`` or empty disables filtering.

    Returns:
        File paths, each prefixed with ``root``.
    """
    exclude = exclude or []

    # Each candidate is (path prefixed with root, path relative to root).
    try:
        # -z gives NUL-separated, unquoted paths; without it git C-quotes
        # names containing unusual or non-ASCII characters.
        result = subprocess.run(
            ['git', 'ls-files', '-z'],
            cwd=root,
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Not a git checkout, or git is unavailable: fall back to os.walk.
        candidates = []
        for dirpath, dirnames, filenames in os.walk(root):
            # Prune in place so os.walk does not descend into these.
            dirnames[:] = [
                d for d in dirnames
                if d not in exclude and not d.startswith('.')
            ]
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                candidates.append((full_path, os.path.relpath(full_path, root)))
    else:
        candidates = [
            (os.path.join(root, rel), rel)
            for rel in result.stdout.split('\0')
            if rel
        ]

    if not exclude:
        return [full_path for full_path, _ in candidates]

    # Match on the relative path so the location of ``root`` itself can
    # never cause every file to be excluded.
    return [
        full_path
        for full_path, rel in candidates
        if not any(ex in rel for ex in exclude)
    ]
43
+
44
def index_path(root: str, store: Store, plugins: list, exclude: list[str] = None):
    """Index every changed, supported file under ``root`` into ``store``.

    A file is skipped when no registered indexer handles it, when it cannot
    be read, or when its content hash equals the hash already stored. Each
    plugin may rewrite symbols (``on_symbol``) and contribute additional
    edges (``extra_edges``) before the file is written.

    Returns:
        The number of files written to the store.
    """
    if not exclude:
        exclude = ['.git', 'node_modules', '__pycache__', '.venv']

    processed = 0
    for candidate in get_files_to_index(root, exclude):
        # The first registered indexer that accepts the path wins.
        handler = None
        for registered in INDEXERS:
            if registered.can_handle(candidate):
                handler = registered
                break
        if not handler:
            continue

        try:
            with open(candidate, 'rb') as stream:
                raw = stream.read()
        except OSError:
            continue

        digest = sha256(raw)

        # Nothing to do when the stored hash matches the current content.
        if store.get_file_hash(candidate) == digest:
            continue

        text = raw.decode('utf-8', errors='replace')
        symbols, edges = handler.index_file(candidate, text)

        # Apply plugins
        for plugin in plugins:
            symbols = [plugin.on_symbol(sym) for sym in symbols]
            edges.extend(plugin.extra_edges(symbols))

        store.upsert_file(candidate, digest, symbols, edges)
        processed += 1

    return processed
@@ -0,0 +1,3 @@
1
+ from .server import start_server
2
+
3
+ __all__ = ["start_server"]