code-explore-by-sql 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,284 @@
1
+ """Build the three-table index for source code.
2
+
3
+ Pipeline per file:
4
+ 1. Dispatch by file extension -> LanguageConfig
5
+ 2. Read file content -> upsert into file_content (FTS5)
6
+ 3. bracket_scanner -> bracket blocks
7
+ 4. symbol_analyzer -> SymbolDef[] with QN normalization + decoration metadata
8
+ 5. edge_extractor -> StrictEdge[] (4 types only)
9
+ 6. Write symbol_index + strict_edges
10
+
11
+ Usage:
12
+ python -m code_source_sql.build_db <source_root> <db_path> [options]
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import hashlib
19
+ import time
20
+ from pathlib import Path
21
+
22
+ from .configs import FrameworkConfig, LanguageConfig, ProjectConfig, get_language
23
+ from .unreal_rules import make_unreal_framework
24
+
25
# Path component names treated as non-source directories.
# NOTE(review): nothing visible in this module reads this constant --
# build_index() takes its exclusions from ProjectConfig.exclude_parts.
# Confirm whether other modules still import it.
EXCLUDE_PARTS = {
    ".git",
    ".vs",
    "Binaries",
    "Build",
    "DerivedDataCache",
    "Intermediate",
    "Saved",
    "ThirdParty",
}

# Path components that infer_module_name() skips over when it searches
# for a module name after the source marker.
_INVALID_MODULE_NAMES = frozenset(
    (
        "Private",
        "Public",
        "Classes",
        "Inc",
        "Src",
        "Source",
        "Include",
        "Internal",
        "Tests",
        "Test",
    )
)
31
+
32
+
33
def iter_source_files(
    root: Path,
    extensions: set[str],
    exclude_parts: frozenset[str],
    limit: int | None = None,
) -> list[Path]:
    """Collect the files under *root* that should be indexed.

    Args:
        root: Directory that is searched recursively.
        extensions: File suffixes to keep (compared against the lower-cased
            ``Path.suffix``, so entries must be lower case and include the
            leading dot).
        exclude_parts: Names that disqualify a path when they occur as any
            component of the path *below* ``root``.
        limit: When truthy, keep only the first ``limit`` matches.  ``None``
            and ``0`` both mean "no limit".

    Returns:
        The matching paths, in the order ``Path.rglob`` yields them.
    """
    files: list[Path] = []
    for path in root.rglob("*"):
        # Cheapest test first: the suffix check needs no filesystem access.
        if path.suffix.lower() not in extensions:
            continue
        # Only components below ``root`` may exclude a file.  Matching the
        # full path would drop every file whenever the root itself happens
        # to live under a directory named e.g. "Build" or "Saved".
        if not exclude_parts.isdisjoint(path.relative_to(root).parts):
            continue
        if not path.is_file():
            continue
        files.append(path)
    if limit:
        files = files[:limit]
    return files
49
+
50
+
51
def infer_module_name(path: Path, source_marker: str = "Source", categories: set[str] | None = None) -> str | None:
    """Derive a module name from the directory that follows *source_marker*.

    The components between the first occurrence of ``source_marker`` and the
    final path component are scanned in order; the first one that is neither
    in ``_INVALID_MODULE_NAMES`` nor in ``categories`` is returned.

    Args:
        path: Path of the file being indexed.
        source_marker: Path component after which the search starts.
        categories: Extra component names to skip; ``None`` means none.

    Returns:
        The module name, or ``None`` when the marker is absent or no
        acceptable component follows it.
    """
    segments = path.parts
    try:
        marker_pos = segments.index(source_marker)
    except ValueError:
        return None

    skipped = categories or set()
    # The slice stops before the last component, so the final path
    # component (the file name) is never considered.
    for name in segments[marker_pos + 1:-1]:
        if name in _INVALID_MODULE_NAMES or name in skipped:
            continue
        return name
    return None
66
+
67
+
68
def _get_configs(
    project: ProjectConfig,
) -> dict[str, tuple[LanguageConfig, FrameworkConfig]]:
    """Map each configured file extension to its (language, framework) pair.

    The framework is chosen per language:
    - "cpp" extensions in a project whose framework_name is "unreal" are
      paired with the Unreal framework;
    - every other extension (including C# in an Unreal project, where the
      UE C++ rules such as UFUNCTION/UCLASS do not apply) is paired with
      the generic framework.
    """
    from .configs import make_generic_framework

    # Frameworks: the Unreal one only exists for Unreal projects.
    unreal_fw = make_unreal_framework() if project.framework_name == "unreal" else None
    generic_fw = make_generic_framework()

    # One LanguageConfig per distinct language name, looked up in the registry.
    languages: dict[str, LanguageConfig] = {
        name: get_language(name)
        for name in set(project.extension_to_language.values())
    }

    by_extension: dict[str, tuple[LanguageConfig, FrameworkConfig]] = {}
    for extension, lang_name in project.extension_to_language.items():
        chosen_fw = unreal_fw if (unreal_fw and lang_name == "cpp") else generic_fw
        by_extension[extension] = (languages[lang_name], chosen_fw)
    return by_extension
105
+
106
+
107
def _process_file(
    file_id: int,
    content: str,
    lines: list[str],
    lang: LanguageConfig,
    fw: FrameworkConfig,
) -> tuple[list, list, list]:
    """Run symbol analysis and edge extraction for one file.

    Args:
        file_id: Identifier of the file, forwarded to ``analyze_file``.
        content: Full file text.  Accepted for interface symmetry but not
            read here; only ``lines`` is analysed.
        lines: The file text split into lines.
        lang: Language configuration for the file.
        fw: Framework configuration for the file.

    Returns:
        ``(symbols, extras, edges)`` -- the two results of ``analyze_file``
        followed by the result of ``extract_edges``.
    """
    from .edge_extractor import extract_edges
    from .symbol_analyzer import analyze_file

    found_symbols, extra_symbols = analyze_file(lines, file_id, lang, fw)
    return (
        found_symbols,
        extra_symbols,
        extract_edges(found_symbols, extra_symbols, lines, fw, lang),
    )
122
+
123
+
124
def _remove_cross_language_edges(conn) -> tuple[int, int]:
    """Delete edges whose source and target symbols have different languages.

    Returns the ``strict_edges`` row count before and after the delete.
    """
    before = conn.execute("SELECT COUNT(*) AS c FROM strict_edges").fetchone()["c"]
    conn.execute("""
        DELETE FROM strict_edges WHERE id IN (
            SELECT e.id FROM strict_edges e
            JOIN symbol_index si_src ON e.source_qn = si_src.qualified_name
            JOIN symbol_index si_tgt ON e.target_qn = si_tgt.qualified_name
            WHERE si_src.language != si_tgt.language
        )
    """)
    conn.commit()
    after = conn.execute("SELECT COUNT(*) AS c FROM strict_edges").fetchone()["c"]
    return before, after


def _print_db_stats(conn) -> None:
    """Print table row counts plus per-edge-type and per-language breakdowns."""
    fc = conn.execute("SELECT COUNT(*) AS c FROM file_content").fetchone()["c"]
    sc = conn.execute("SELECT COUNT(*) AS c FROM symbol_index").fetchone()["c"]
    ec = conn.execute("SELECT COUNT(*) AS c FROM strict_edges").fetchone()["c"]
    print(f"DB stats: {fc:,} files, {sc:,} symbols, {ec:,} edges", flush=True)

    # Edge type breakdown
    for row in conn.execute(
        "SELECT edge_type, COUNT(*) AS c FROM strict_edges GROUP BY edge_type ORDER BY c DESC"
    ).fetchall():
        print(f" {row['edge_type']}: {row['c']:,}", flush=True)

    # Language breakdown
    for row in conn.execute(
        "SELECT language, COUNT(*) AS c FROM symbol_index GROUP BY language ORDER BY c DESC"
    ).fetchall():
        print(f" symbols[{row['language']}]: {row['c']:,}", flush=True)


def build_index(
    root: Path,
    db_path: Path,
    limit: int | None = None,
    project: ProjectConfig | None = None,
) -> int:
    """Rebuild the index database at *db_path* from the sources under *root*.

    All rows of ``strict_edges``, ``symbol_index`` and ``file_content`` are
    deleted first, then every matching file is read, stored, analysed and
    its symbols and edges inserted.  Afterwards edges that join symbols of
    different languages are removed and summary statistics are printed.

    Args:
        root: Source root directory to scan.
        db_path: Path of the database passed to ``connect``.
        limit: Optional cap on the number of files (see ``iter_source_files``).
        project: Project configuration; the generic project is used when
            ``None``.

    Returns:
        The number of files indexed.  Files that cannot be read (``OSError``)
        are skipped and not counted.
    """
    from .db import commit, connect, initialize_schema, insert_edges, insert_extra_symbols, insert_symbols, upsert_file
    from .configs import make_generic_framework, make_generic_project

    if project is None:
        # Default to generic project config
        project = make_generic_project()

    conn = connect(db_path)
    try:
        initialize_schema(conn)

        # Clear existing data for clean rebuild
        conn.execute("DELETE FROM strict_edges")
        conn.execute("DELETE FROM symbol_index")
        conn.execute("DELETE FROM file_content")
        conn.commit()

        extensions = set(project.extension_to_language.keys())
        ext_configs = _get_configs(project)

        files = iter_source_files(root, extensions, project.exclude_parts, limit)
        total = len(files)
        print(f"Indexing {total} files from {root}...", flush=True)
        t_start = time.time()

        count = 0
        sym_count = 0
        edge_count = 0
        batch_size = 200
        # Built at most once, and only if an extension has no configured
        # pair (the file list is filtered by the same extension set, so
        # this is a defensive fallback).
        fallback = None

        for path in files:
            try:
                content = path.read_text(encoding="utf-8", errors="replace")
            except OSError:
                # Best effort: an unreadable file must not abort the build.
                continue

            digest = hashlib.sha256(content.encode("utf-8", errors="replace")).hexdigest()
            rel = str(path.relative_to(root)).replace("\\", "/")
            module = infer_module_name(path, project.source_marker, project.categories)

            # Dispatch by file extension
            pair = ext_configs.get(path.suffix.lower())
            if pair is None:
                if fallback is None:
                    fallback = (get_language("cpp"), make_generic_framework())
                pair = fallback
            lang, fw_for_file = pair

            file_id = upsert_file(conn, rel, module, content, digest, language=lang.name)
            lines = content.split("\n")

            sym_rows, extra_rows, edge_rows = _process_file(file_id, content, lines, lang, fw_for_file)

            insert_symbols(conn, sym_rows)
            insert_extra_symbols(conn, extra_rows)
            insert_edges(conn, edge_rows)

            sym_count += len(sym_rows) + len(extra_rows)
            edge_count += len(edge_rows)
            count += 1

            # Commit and report progress once per batch.
            if count % batch_size == 0:
                commit(conn)
                elapsed = time.time() - t_start
                rate = count / elapsed if elapsed > 0 else 0
                print(f" {count}/{total} ({count/total*100:.0f}%) "
                      f"{rate:.0f} files/s, {sym_count:,} symbols, {edge_count:,} edges", flush=True)

        commit(conn)
        elapsed = time.time() - t_start
        print(f"Done: {count} files, {sym_count:,} symbols, {edge_count:,} edges in {elapsed:.1f}s", flush=True)

        # Cross-language edge cleanup: remove edges where source and target
        # symbols are defined in different languages
        cross_before, cross_after = _remove_cross_language_edges(conn)
        removed = cross_before - cross_after
        if removed > 0:
            print(f"Cross-language edges removed: {removed:,} ({cross_before:,} -> {cross_after:,})", flush=True)

        _print_db_stats(conn)
    finally:
        # Release the database handle even when indexing fails part-way.
        conn.close()
    return count
234
+
235
+
236
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse arguments and build the index.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Exits with status 2 (via ``parser.error``) when the source root is not
    an existing directory.  Without this check, ``build_index`` would clear
    the existing database and then index zero files.
    """
    parser = argparse.ArgumentParser(
        description="Build SQLite index for source code (three-table architecture)."
    )
    parser.add_argument("root", type=Path, help="Source root directory")
    parser.add_argument("db", type=Path, help="SQLite database path")
    parser.add_argument("--limit", type=int, default=None, help="File limit for testing")
    parser.add_argument(
        "--source-marker", default="",
        help='Path component marking source root (default: "Source" with '
             "--framework unreal, none otherwise)"
    )
    parser.add_argument(
        "--categories", default="",
        help="Comma-separated category dirs to skip after source marker"
    )
    parser.add_argument(
        "--framework", default="generic",
        choices=["unreal", "generic"],
        help="Framework rules to apply (default: generic)"
    )
    args = parser.parse_args(argv)

    if not args.root.is_dir():
        parser.error(f"source root is not a directory: {args.root}")

    # Empty entries (e.g. from a trailing comma) are dropped.
    categories = frozenset(c.strip() for c in args.categories.split(",") if c.strip())

    if args.framework == "unreal":
        fw = make_unreal_framework()
        from .configs import make_unreal_project
        base_project = make_unreal_project(framework=fw)
        source_marker = args.source_marker or "Source"
    else:
        from .configs import make_generic_framework, make_generic_project
        fw = make_generic_framework()
        base_project = make_generic_project()
        # An empty marker never matches a path component, so no module
        # names are inferred for generic projects unless one is given.
        source_marker = args.source_marker

    project = ProjectConfig(
        extension_to_language=base_project.extension_to_language,
        exclude_parts=base_project.exclude_parts,
        source_marker=source_marker,
        categories=categories,
        invalid_module_names=base_project.invalid_module_names,
        framework_name=fw.name,
    )

    build_index(args.root, args.db, args.limit, project)


if __name__ == "__main__":
    main()