srcodex 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- srcodex/__init__.py +0 -0
- srcodex/backend/__init__.py +0 -0
- srcodex/backend/chat.py +79 -0
- srcodex/backend/main.py +98 -0
- srcodex/backend/services/__init__.py +0 -0
- srcodex/backend/services/claude_service.py +754 -0
- srcodex/backend/services/config_loader.py +113 -0
- srcodex/backend/services/file_access_tools.py +279 -0
- srcodex/backend/services/file_tree.py +480 -0
- srcodex/backend/services/graph_tools.py +874 -0
- srcodex/backend/services/logger_setup.py +91 -0
- srcodex/backend/services/session_manager.py +81 -0
- srcodex/backend/services/status_tracker.py +91 -0
- srcodex/cli.py +255 -0
- srcodex/core/__init__.py +0 -0
- srcodex/core/config.py +113 -0
- srcodex/core/logger.py +23 -0
- srcodex/indexer/__init__.py +0 -0
- srcodex/indexer/cscope_client.py +183 -0
- srcodex/indexer/ctags_compat.py +223 -0
- srcodex/indexer/ctags_parser.py +456 -0
- srcodex/indexer/explorer.py +135 -0
- srcodex/indexer/field_access_analyzer.py +436 -0
- srcodex/indexer/indexer.py +664 -0
- srcodex/indexer/reference_ingestor.py +293 -0
- srcodex/indexer/reference_resolver.py +544 -0
- srcodex/tui/__init__.py +0 -0
- srcodex/tui/app.py +103 -0
- srcodex/tui/app.tcss +24 -0
- srcodex/tui/components/__init__.py +0 -0
- srcodex/tui/components/bars/__init__.py +0 -0
- srcodex/tui/components/bars/chat_header.py +48 -0
- srcodex/tui/components/bars/code_tab_bar.py +157 -0
- srcodex/tui/components/bars/footer_bar.py +128 -0
- srcodex/tui/components/bars/left_tab.py +54 -0
- srcodex/tui/components/logger.py +57 -0
- srcodex/tui/components/panels/__init__.py +0 -0
- srcodex/tui/components/panels/chat_panel.py +523 -0
- srcodex/tui/components/panels/code_panel.py +229 -0
- srcodex/tui/components/panels/side_panel.py +128 -0
- srcodex/tui/components/views/__init__.py +0 -0
- srcodex/tui/components/views/explorer_view.py +20 -0
- srcodex/tui/components/views/search_view.py +148 -0
- srcodex/tui/components/widgets/__init__.py +0 -0
- srcodex/tui/components/widgets/file_browser.py +16 -0
- srcodex/tui/components/widgets/find_box.py +85 -0
- srcodex-0.2.0.dist-info/METADATA +170 -0
- srcodex-0.2.0.dist-info/RECORD +52 -0
- srcodex-0.2.0.dist-info/WHEEL +5 -0
- srcodex-0.2.0.dist-info/entry_points.txt +2 -0
- srcodex-0.2.0.dist-info/licenses/LICENSE +21 -0
- srcodex-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,664 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Code Explorer - Main Indexer
|
|
4
|
+
Scans source code and builds a searchable database
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sqlite3
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
import hashlib
|
|
11
|
+
import subprocess
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List, Optional, Dict
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
import click
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
|
|
19
|
+
from .ctags_parser import CTagsParser
|
|
20
|
+
from .explorer import FileDiscovery
|
|
21
|
+
from .reference_ingestor import ReferenceIngestor
|
|
22
|
+
from .reference_resolver import ReferenceResolver
|
|
23
|
+
from .field_access_analyzer import FieldAccessAnalyzer
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Indexer:
|
|
27
|
+
def __init__(self, db_path: str, verbose: bool = False):
    """
    Prepare an indexer bound to one SQLite database file.

    No connection is opened here; connect_db() does that.

    Args:
        db_path: Path to SQLite database
        verbose: When True, extra progress/diagnostic messages are printed
    """
    # Symbol extraction backend
    self.ctags = CTagsParser()

    # State populated later
    self.conn = None         # opened by connect_db()
    self.source_root = None  # assigned once a source directory is known

    # Caller-supplied configuration
    self.verbose = verbose
    self.db_path = db_path
|
|
37
|
+
|
|
38
|
+
def connect_db(self):
    """Open the SQLite connection and apply the schema script"""
    connection = sqlite3.connect(self.db_path)
    # Row objects allow access by column name (e.g. row['count'])
    connection.row_factory = sqlite3.Row
    self.conn = connection

    connection.execute("PRAGMA foreign_keys = ON")

    # The schema file is looked up next to this module
    schema_file = Path(__file__).parent / "db_schema.sql"
    with open(schema_file, 'r') as handle:
        connection.executescript(handle.read())

    connection.commit()

    if not self.verbose:
        return
    print(f"✓ Database initialized: {self.db_path}")
|
|
55
|
+
|
|
56
|
+
def close_db(self):
    """Close the database connection, if one is open"""
    connection = self.conn
    if not connection:
        return
    connection.close()
|
|
60
|
+
|
|
61
|
+
def index_directory(self, source_dir: str, extensions: Optional[List[str]] = None, force_clear: bool = False):
    """
    Index all files in a directory (Stage 1).

    Discovers files with FileDiscovery, parses symbols for the whole tree
    with a single CTagsParser.parse_root() call, stores file metadata and
    symbols inside one transaction, then records statistics through
    _update_metadata().

    Files that raise while being indexed are skipped; they are counted and
    reported separately instead of being included in "Files indexed".

    Args:
        source_dir: Root directory to scan
        extensions: File extensions to index
            (default: ['.c', '.h', '.cpp', '.py'])
        force_clear: If True, clear database without prompting; otherwise
            the user is asked interactively via click.confirm()
    """
    if extensions is None:
        extensions = ['.c', '.h', '.cpp', '.py']

    source_path = Path(source_dir).resolve()

    # Store source root for relative path computation
    self.source_root = source_path

    print(f"Scanning directory: {source_dir}")

    # Use unified FileDiscovery module
    discovery = FileDiscovery(source_dir, extensions)
    files_to_index = discovery.discover_files_absolute()

    print(f"Found {len(files_to_index)} files to index")

    if force_clear:
        self._clear_database()
        if self.verbose:
            print("Database cleared (--force)")
    elif click.confirm("Clear existing database?", default=True):
        self._clear_database()

    # Parse ALL files with SINGLE ctags invocation (with progress bar)
    file_to_symbols = self.ctags.parse_root(str(source_path), extensions, source_root=str(source_path))

    total_symbols = 0
    failed_files = []  # files skipped because indexing them raised

    # Index each file (store metadata + symbols) in ONE transaction
    self.conn.execute("BEGIN")
    try:
        with tqdm(total=len(files_to_index), desc="Indexing", unit="file") as pbar:
            for file_path in files_to_index:
                try:
                    # Normalize to canonical form: rel_posix (same as parse_root() keys)
                    file_path_canonical = Path(file_path).relative_to(source_path).as_posix()
                    symbols = file_to_symbols.get(file_path_canonical, [])
                    total_symbols += self._index_file_with_symbols(str(file_path), symbols)
                    pbar.set_postfix({"symbols": total_symbols})
                except Exception as e:
                    # Best effort: one bad file must not abort the whole run,
                    # but it must not be reported as indexed either.
                    failed_files.append(str(file_path))
                    if self.verbose:
                        print(f"\nError indexing {file_path}: {e}")
                finally:
                    pbar.update(1)

        self.conn.commit()
    except Exception:
        self.conn.rollback()
        raise

    indexed_count = len(files_to_index) - len(failed_files)

    # Update metadata with the number of files that were actually stored
    self._update_metadata(total_symbols, indexed_count)

    print(f"\nIndexing complete!")
    print(f" Files indexed: {indexed_count}")
    print(f" Symbols found: {total_symbols}")
    if failed_files:
        # Always surface skipped files, even without --verbose
        print(f" Files skipped (errors): {len(failed_files)}")
|
|
125
|
+
|
|
126
|
+
def _clear_database(self):
    """
    Delete every indexed row and commit.

    Clears symbol_edges, file_edges, raw_references, symbols, files and
    symbols_fts. file_edges is included so that edges from a previous run
    do not survive a "clear" (print_stats() reports that table).
    """
    cursor = self.conn.cursor()

    # Edge/reference tables first, then symbols and files.
    # NOTE(review): presumably the edge tables reference symbols/files;
    # deleting them first is safe either way - confirm against db_schema.sql.
    for table in (
        "symbol_edges",
        "file_edges",
        "raw_references",
        "symbols",
        "files",
        "symbols_fts",
    ):
        # Table names come from the fixed tuple above, never from user input
        cursor.execute(f"DELETE FROM {table}")
    self.conn.commit()

    if self.verbose:
        print("✓ Database cleared")
|
|
137
|
+
|
|
138
|
+
def _index_file_with_symbols(self, file_path: str, symbols: List[Dict]) -> int:
    """
    Index a single file with PRE-PARSED symbols (from batch ctags call).
    RECOMMENDED method - symbols already parsed by parse_root().

    All fallible preparation (reading the file, building the symbol rows)
    happens before the database is touched, so a malformed symbol dict or
    an unreadable file cannot leave the file's old symbols deleted inside
    the caller's transaction.

    Args:
        file_path: Path to source file (absolute, must be under source_root)
        symbols: Pre-parsed symbols from ctags; each dict must provide
            'name', 'type' and 'line', other keys are optional
    Returns:
        Number of symbols indexed
    Raises:
        ValueError: file_path is not located under self.source_root
        OSError: the file cannot be read
        KeyError: a symbol lacks one of the required keys
    """
    source_file = Path(file_path)

    # Compute relative path for storage (use POSIX for cross-platform)
    file_path_rel = source_file.relative_to(self.source_root).as_posix()

    # Read file for metadata (sha1, mtime, size)
    with open(file_path, 'rb') as f:
        content_bytes = f.read()

    file_size = len(content_bytes)
    sha1_hash = hashlib.sha1(content_bytes).hexdigest()
    mtime = os.path.getmtime(file_path)

    # Determine language from extension
    language_map = {
        '.c': 'c',
        '.h': 'c',
        '.cpp': 'cpp',
        '.cc': 'cpp',
        '.cxx': 'cpp',
        '.hpp': 'cpp',
        '.hxx': 'cpp',
        '.py': 'python',
        '.mk': 'makefile',
        '.java': 'java',
        '.rs': 'rust',
    }
    language = language_map.get(source_file.suffix.lower(), 'unknown')

    # Build the rows up front: a KeyError here must happen before any write
    symbol_rows = [
        (
            symbol['name'],
            symbol['type'],
            symbol.get('kind_raw'),
            file_path_rel,  # RELATIVE path
            symbol['line'],
            symbol.get('signature'),
            symbol.get('typeref'),
            symbol.get('scope', 'global'),
            symbol.get('scope_kind'),
            symbol.get('scope_name'),
            symbol.get('is_file_scope'),
        )
        for symbol in symbols
    ]

    cursor = self.conn.cursor()

    # Delete existing symbols for this file (per-file refresh)
    cursor.execute("DELETE FROM symbols WHERE file_path = ?", (file_path_rel,))

    # Store file metadata
    cursor.execute(
        """INSERT OR REPLACE INTO files (path, size, language, sha1, last_modified)
        VALUES (?, ?, ?, ?, ?)""",
        (file_path_rel, file_size, language, sha1_hash, mtime)
    )

    # Store symbols (already parsed!) - use executemany for batch insert
    if symbol_rows:
        cursor.executemany(
            """
            INSERT INTO symbols (name, type, kind_raw, file_path, line_number, signature, typeref, scope, scope_kind, scope_name, is_file_scope)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            symbol_rows
        )

    return len(symbol_rows)
|
|
218
|
+
|
|
219
|
+
def _index_file(self, file_path: str) -> int:
    """
    Index a single file with per-file ctags invocation.

    DEPRECATED for bulk indexing - use _index_file_with_symbols() instead.
    Kept for:
    - Incremental updates of single files
    - Debugging

    The file is read and parsed before the database is modified, so a
    failure while parsing cannot leave the file's old symbols deleted.

    Args:
        file_path: Path to source file (absolute)
    Returns:
        Number of symbols found (0 if the file could not be read)
    """
    # Convert to relative path for storage (POSIX format for cross-platform)
    file_path_obj = Path(file_path)
    file_path_rel = file_path_obj.as_posix()
    if self.source_root:
        try:
            file_path_rel = file_path_obj.relative_to(self.source_root).as_posix()
        except ValueError:
            # File is outside source_root, keep the path as given (POSIX)
            pass

    # Read raw bytes (use the path as given for actual file access);
    # only size and SHA1 are derived from them, no text decoding is needed
    try:
        with open(file_path, 'rb') as f:
            content_bytes = f.read()
    except OSError as e:
        if self.verbose:
            print(f"Warning: Could not read {file_path}: {e}")
        return 0

    # Compute metadata
    file_size = len(content_bytes)
    sha1_hash = hashlib.sha1(content_bytes).hexdigest()
    mtime = os.path.getmtime(file_path)

    # Determine language from extension
    language_map = {
        '.c': 'c',
        '.h': 'c',
        '.cpp': 'cpp',
        '.cc': 'cpp',
        '.cxx': 'cpp',
        '.hpp': 'cpp',
        '.hxx': 'cpp',
        '.py': 'python',
        '.mk': 'makefile',
        '.java': 'java',
        '.rs': 'rust',
    }
    language = language_map.get(file_path_obj.suffix.lower(), 'unknown')

    # Parse symbols with ctags (use the path as given for ctags)
    symbols = self.ctags.parse_file(file_path)

    # Override file_path with the relative path for portability
    symbol_rows = [
        (
            symbol['name'],
            symbol['type'],                 # Normalized type
            symbol.get('kind_raw'),         # Raw ctags kind
            file_path_rel,
            symbol['line'],
            symbol.get('signature'),        # NULL if not available
            symbol.get('typeref'),          # NULL if not available
            symbol.get('scope', 'global'),
            symbol.get('scope_kind'),
            symbol.get('scope_name'),
            symbol.get('is_file_scope'),
        )
        for symbol in symbols
    ]

    cursor = self.conn.cursor()

    # Delete existing symbols for this file (per-file refresh) - use relative path
    cursor.execute("DELETE FROM symbols WHERE file_path = ?", (file_path_rel,))

    # Store file METADATA only (not content) - use relative path
    cursor.execute(
        """INSERT OR REPLACE INTO files (path, size, language, sha1, last_modified)
        VALUES (?, ?, ?, ?, ?)""",
        (file_path_rel, file_size, language, sha1_hash, mtime)
    )

    # Batch insert, same as _index_file_with_symbols()
    if symbol_rows:
        cursor.executemany(
            """
            INSERT INTO symbols (name, type, kind_raw, file_path, line_number, signature, typeref, scope, scope_kind, scope_name, is_file_scope)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            symbol_rows
        )

    return len(symbol_rows)
|
|
315
|
+
|
|
316
|
+
def _update_metadata(self, total_symbols: int, total_files: int):
    """Write indexing statistics into the metadata table and commit"""
    entries = [
        ('total_symbols', str(total_symbols)),
        ('total_files', str(total_files)),
        ('indexed_at', datetime.now().isoformat()),
    ]
    # Source root is stored for path resolution, when known
    if self.source_root:
        entries.append(('source_root', str(self.source_root)))

    cursor = self.conn.cursor()
    for entry in entries:
        cursor.execute(
            "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
            entry
        )
    self.conn.commit()
|
|
338
|
+
|
|
339
|
+
def build_cscope_database(self, output_dir: Optional[str] = None):
    """
    Build cscope database for cross-reference queries.

    CRITICAL: Builds cscope with cwd=source_root and rel_posix paths to ensure
    cscope output paths match DB canonical paths exactly. All cscope files
    (cscope.out, cscope.files, etc.) are stored in output_dir.

    Paths containing whitespace are written to cscope.files in double
    quotes (with embedded quotes/backslashes escaped), as the cscope
    name-file format requires; otherwise cscope would split them.

    Errors (missing source_root/output_dir, no indexed files, cscope
    failing or not installed) are reported on stdout and the method
    returns without raising.

    Args:
        output_dir: Directory to store cscope files (default: None, must be provided)
    """
    print("\n[Stage 2a] Building cscope database...")

    if not self.source_root:
        print("Error: source_root not set. Cannot build cscope database.")
        return

    if output_dir is None:
        print("Error: output_dir must be provided (e.g., 'data/cscope')")
        return

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get all indexed files from database
    cursor = self.conn.cursor()
    cursor.execute("SELECT path FROM files")
    files = cursor.fetchall()

    if not files:
        print("Warning: No files found in database. Run indexing first.")
        return

    def namefile_entry(path: str) -> str:
        """Return path as it must appear in a cscope name file."""
        if not any(ch.isspace() for ch in path):
            return path
        escaped = path.replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    # Write cscope.files with RELATIVE paths (same as DB canonical format)
    cscope_files_path = output_dir / "cscope.files"
    with open(cscope_files_path, 'w') as f:
        f.writelines(f"{namefile_entry(file_row['path'])}\n" for file_row in files)

    print(f" Wrote {len(files)} files to {cscope_files_path}")

    # Run cscope with cwd=source_root to force rel_posix output paths
    # Use -f flag to specify output location in output_dir
    # Use absolute paths for -i and -f since cwd is source_root
    try:
        cscope_out = output_dir / "cscope.out"
        subprocess.run(
            ['cscope', '-b', '-q', '-k',
             '-i', str(cscope_files_path.absolute()),
             '-f', str(cscope_out.absolute())],
            cwd=self.source_root,  # KEY: Run from source_root for relative paths!
            capture_output=True,
            text=True,
            check=True
        )

        # Check that output files were created in output_dir
        if cscope_out.exists():
            size_mb = cscope_out.stat().st_size / (1024 * 1024)
            print(f"Cscope database built: {cscope_out} ({size_mb:.2f} MB)")

            # Store cscope_dir for later use
            self.cscope_dir = output_dir
        else:
            print(f"Warning: cscope.out not found at {cscope_out}")

    except subprocess.CalledProcessError as e:
        print(f"Error building cscope database: {e}")
        if e.stderr:
            print(f" stderr: {e.stderr}")
    except FileNotFoundError:
        print("Error: cscope command not found. Install with: sudo apt install cscope")
|
|
412
|
+
|
|
413
|
+
def analyze_field_accesses(self):
    """
    Stage 1.5: Analyze field accesses in function bodies
    Creates ACCESSES edges in symbol_edges table

    Returns:
        The statistics object returned by the analyzer
    """
    print("\n[Stage 1.5] Analyzing field accesses...")

    field_analyzer = FieldAccessAnalyzer(db_conn=self.conn, source_root=self.source_root)

    # Parallel variant is used for better performance; previous results are cleared
    return field_analyzer.analyze_all_functions_parallel(clear_existing=True)
|
|
429
|
+
|
|
430
|
+
def ingest_raw_references(self, cscope_dir: Optional[str] = None):
    """
    Stage 2: Load raw cscope output into the raw_references table

    Prints an error/warning and returns early when cscope_dir is missing
    or does not contain cscope.out.

    Args:
        cscope_dir: Directory containing cscope.out (required, typically data/cscope/)
    """
    if cscope_dir is None:
        print("Error: cscope_dir must be provided (e.g., 'data/cscope')")
        return

    cscope_path = Path(cscope_dir)
    database_file = cscope_path / "cscope.out"
    if not database_file.exists():
        print("Warning: Cscope database not found. Run with --cscope flag first.")
        return

    print("\n[Stage 2b] Ingesting raw references from cscope...")

    ingestor = ReferenceIngestor(
        db_conn=self.conn,
        source_root=self.source_root,
        cscope_dir=cscope_path
    )

    # Three kinds of references, ingested in this order:
    # callees (functions called by each function)
    callees_count = ingestor.ingest_callees(clear_existing=True)
    # callers (functions that call each function)
    callers_count = ingestor.ingest_callers(clear_existing=True)
    # includes (files that include each header)
    includes_count = ingestor.ingest_includes(clear_existing=True)

    total_refs = sum((callees_count, callers_count, includes_count))

    print(f"\nIngested {total_refs} total raw references:")
    print(f" - Callees: {callees_count}")
    print(f" - Callers: {callers_count}")
    print(f" - Includes: {includes_count}")
|
|
473
|
+
|
|
474
|
+
def resolve_semantic_edges(self):
    """
    Stage 3: Turn raw references into semantic graph edges
    Converts (file, function) names → symbol IDs and stores typed edges
    """
    print("\n[Stage 3] Resolving semantic edges...")

    resolver = ReferenceResolver(db_conn=self.conn)

    # callees → CALLS edges (symbol-to-symbol), then
    # includes → INCLUDES edges (file-to-file)
    symbol_stats = resolver.resolve_callees(clear_existing=True)
    file_stats = resolver.resolve_includes(clear_existing=True)

    symbol_edges = symbol_stats['resolved_edges']
    file_edges = file_stats['resolved_edges']
    print(f"\nResolved {symbol_edges} symbol edges + {file_edges} file edges")
|
|
490
|
+
|
|
491
|
+
def print_stats(self):
    """Print row counts per table and a per-type symbol breakdown"""
    cursor = self.conn.cursor()

    def count_rows(sql):
        # Each query exposes its result under the column alias 'count'
        cursor.execute(sql)
        return cursor.fetchone()['count']

    symbol_count = count_rows("SELECT COUNT(*) as count FROM symbols")
    file_count = count_rows("SELECT COUNT(*) as count FROM files")
    raw_ref_count = count_rows("SELECT COUNT(*) as count FROM raw_references")
    symbol_edge_count = count_rows("SELECT COUNT(*) as count FROM symbol_edges")
    file_edge_count = count_rows("SELECT COUNT(*) as count FROM file_edges")

    # Symbol type breakdown, most frequent type first
    cursor.execute("SELECT type, COUNT(*) as count FROM symbols GROUP BY type ORDER BY count DESC")
    type_counts = cursor.fetchall()

    report = [
        "\nDatabase Statistics:",
        f" Files: {file_count}",
        f" Symbols: {symbol_count}",
        f" Raw refs: {raw_ref_count}",
        f" Symbol edges: {symbol_edge_count}",
        f" File edges: {file_edge_count}",
        "\n Symbol Types:",
    ]
    report.extend(f" {row['type']:15} {row['count']:6}" for row in type_counts)
    print("\n".join(report))
|
|
524
|
+
|
|
525
|
+
@click.command()
@click.argument('source_dir', type=click.Path(exists=True))
@click.option('--db', default='data/pmfw.db', help='Database path')
@click.option('--extensions', default='.c,.h,.cpp,.cc,.py,.mk', help='File extensions (comma-separated)')
@click.option('--refs', is_flag=True, help='[PIPELINE] Build cscope + ingest + resolve (full reference pipeline)')
@click.option('--build-cscope', is_flag=True, help='[STAGE] Build cscope database only')
@click.option('--ingest-refs', is_flag=True, help='[STAGE] Ingest raw references only (requires existing cscope DB)')
@click.option('--resolve-refs', is_flag=True, help='[STAGE] Resolve semantic edges only (requires raw_references)')
@click.option('--force', '-f', is_flag=True, help='Force clear database without prompting')
@click.option('--verbose', '-v', is_flag=True, help='Verbose output')
def main(source_dir, db, extensions, refs, build_cscope, ingest_refs, resolve_refs, force, verbose):
    """
    Index source code and build semantic graph

    PIPELINE STAGES:
    1. Index symbols (always runs)
    2. Build cscope DB (optional, --build-cscope or --refs)
    3. Ingest raw refs (optional, --ingest-refs or --refs)
    4. Resolve edges (optional, --resolve-refs or --refs)

    Examples:
    # Symbols only (Stage 1):
    python indexer.py test_code --db data/test.db --force

    # Full pipeline (Stages 1-4):
    python indexer.py test_code --db data/test.db --force --refs

    # Debug Stage 2 only (requires existing cscope DB):
    python indexer.py test_code --db data/test.db --ingest-refs

    # Build pipeline piece by piece:
    python indexer.py test_code --db data/test.db --force
    python indexer.py test_code --db data/test.db --build-cscope
    python indexer.py test_code --db data/test.db --ingest-refs
    python indexer.py test_code --db data/test.db --resolve-refs
    """
    # Normalize each entry to ".ext"; empty entries (e.g. from a trailing or
    # doubled comma) are dropped instead of becoming a bare "." extension.
    stripped = (ext.strip().lstrip('.') for ext in extensions.split(','))
    ext_list = [f".{name}" for name in stripped if name]

    # Find project root (parent of the directory containing this script)
    script_dir = Path(__file__).parent.resolve()
    project_root = script_dir.parent

    # Resolve database path: always relative to project_root/data/
    db_path = Path(db)
    if not db_path.is_absolute():
        # If relative path given, resolve it relative to project_root/data/
        if db_path.parts[0] == 'data':
            db_path = project_root / db_path
        else:
            db_path = project_root / "data" / db_path

    # Ensure parent directory exists
    db_path.parent.mkdir(parents=True, exist_ok=True)

    # Derive cscope directory from database path
    # If db is data/test.db, cscope_dir is data/cscope/
    # If db is data/pmfw.db, cscope_dir is data/cscope/
    cscope_dir = db_path.parent / "cscope"

    print(f" Source Code Explorer - Indexer")
    print(f" Source: {source_dir}")
    print(f" Database: {db_path}")
    print(f" Cscope: {cscope_dir}")
    print(f" Extensions: {', '.join(ext_list)}\n")

    # Create indexer with resolved absolute path
    indexer = Indexer(str(db_path), verbose=verbose)

    # Track timing for each stage
    stage_times = {}
    total_start = time.time()

    try:
        # Connect to database
        indexer.connect_db()

        # Determine pipeline stages to run
        run_build_cscope = refs or build_cscope
        run_ingest = refs or ingest_refs
        run_resolve = refs or resolve_refs
        # Stage 1 and 1.5 are skipped only for the stage-specific
        # --ingest-refs / --resolve-refs invocations
        run_index = not (ingest_refs or resolve_refs)

        if run_index:
            # Stage 1: Index files (CTags → symbols, files tables)
            stage1_start = time.time()
            indexer.index_directory(source_dir, ext_list, force_clear=force)
            stage_times['Stage 1 (Symbol Extraction)'] = time.time() - stage1_start

            # Stage 1.5: Analyze field accesses (only right after Stage 1)
            stage15_start = time.time()
            indexer.analyze_field_accesses()
            stage_times['Stage 1.5 (Field Access)'] = time.time() - stage15_start
        else:
            # If skipping Stage 1, still need to set source_root for Stage 2/3
            indexer.source_root = Path(source_dir).resolve()

        # Stage 2a: Build cscope database
        if run_build_cscope:
            stage2a_start = time.time()
            indexer.build_cscope_database(output_dir=str(cscope_dir))
            stage_times['Stage 2a (Build Cscope)'] = time.time() - stage2a_start

        # Stage 2b: Ingest raw references (cscope → raw_references table)
        if run_ingest:
            stage2b_start = time.time()
            indexer.ingest_raw_references(cscope_dir=str(cscope_dir))
            stage_times['Stage 2b (Ingest References)'] = time.time() - stage2b_start

        # Stage 3: Resolve semantic edges (raw_references → symbol_edges table)
        if run_resolve:
            stage3_start = time.time()
            indexer.resolve_semantic_edges()
            stage_times['Stage 3 (Resolve Edges)'] = time.time() - stage3_start

        # Print statistics
        indexer.print_stats()

    finally:
        indexer.close_db()

    total_time = time.time() - total_start

    # Print timing summary
    if stage_times:
        print("\nTiming:")
        for stage_name, duration in stage_times.items():
            print(f" {stage_name}: {duration:.2f}s")
        print(f" Total: {total_time:.2f}s")

    # Print database size
    db_size_mb = db_path.stat().st_size / (1024 * 1024)
    print(f"\nDatabase: {db_path} ({db_size_mb:.2f} MB)")

    print(f"\nDone!")


# Run the CLI only when executed as a script
if __name__ == "__main__":
    main()
|