code-explore-by-sql 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_explore_by_sql-0.1.0.dist-info/METADATA +205 -0
- code_explore_by_sql-0.1.0.dist-info/RECORD +29 -0
- code_explore_by_sql-0.1.0.dist-info/WHEEL +4 -0
- code_explore_by_sql-0.1.0.dist-info/entry_points.txt +3 -0
- code_explore_by_sql-0.1.0.dist-info/licenses/LICENSE +21 -0
- code_source_sql/__init__.py +9 -0
- code_source_sql/__main__.py +5 -0
- code_source_sql/bracket_scanner.py +385 -0
- code_source_sql/build_db.py +284 -0
- code_source_sql/code_block_summary.py +522 -0
- code_source_sql/configs.py +402 -0
- code_source_sql/db.py +625 -0
- code_source_sql/edge_extractor.py +183 -0
- code_source_sql/languages/__init__.py +31 -0
- code_source_sql/languages/c.py +118 -0
- code_source_sql/languages/cpp.py +106 -0
- code_source_sql/languages/csharp.py +103 -0
- code_source_sql/languages/glsl.py +162 -0
- code_source_sql/languages/go.py +91 -0
- code_source_sql/languages/hlsl.py +155 -0
- code_source_sql/languages/java.py +98 -0
- code_source_sql/languages/javascript.py +215 -0
- code_source_sql/languages/kotlin.py +108 -0
- code_source_sql/languages/python.py +105 -0
- code_source_sql/languages/rust.py +91 -0
- code_source_sql/languages/swift.py +116 -0
- code_source_sql/server.py +264 -0
- code_source_sql/symbol_analyzer.py +487 -0
- code_source_sql/unreal_rules.py +163 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Build the three-table index for source code.
|
|
2
|
+
|
|
3
|
+
Pipeline per file:
|
|
4
|
+
1. Dispatch by file extension -> LanguageConfig
|
|
5
|
+
2. Read file content -> upsert into file_content (FTS5)
|
|
6
|
+
3. bracket_scanner -> bracket blocks
|
|
7
|
+
4. symbol_analyzer -> SymbolDef[] with QN normalization + decoration metadata
|
|
8
|
+
5. edge_extractor -> StrictEdge[] (4 types only)
|
|
9
|
+
6. Write symbol_index + strict_edges
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python -m code_source_sql.build_db <source_root> <db_path> [options]
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import hashlib
|
|
19
|
+
import time
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from .configs import FrameworkConfig, LanguageConfig, ProjectConfig, get_language
|
|
23
|
+
from .unreal_rules import make_unreal_framework
|
|
24
|
+
|
|
25
|
+
# Directory names for VCS/IDE metadata, build output and vendored code.
# NOTE(review): nothing in the visible part of this module reads this set;
# build_index() takes its exclusions from ProjectConfig.exclude_parts instead.
EXCLUDE_PARTS = {
    ".git",
    ".vs",
    "Binaries",
    "Build",
    "DerivedDataCache",
    "Intermediate",
    "Saved",
    "ThirdParty",
}

# Path components that infer_module_name() skips over: they are never
# returned as a module name.
_INVALID_MODULE_NAMES = frozenset((
    "Private",
    "Public",
    "Classes",
    "Inc",
    "Src",
    "Source",
    "Include",
    "Internal",
    "Tests",
    "Test",
))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def iter_source_files(
    root: Path,
    extensions: set[str],
    exclude_parts: frozenset[str],
    limit: int | None = None,
) -> list[Path]:
    """Collect the source files under *root* that should be indexed.

    Args:
        root: Directory that is walked recursively.
        extensions: Lower-case file suffixes (including the dot) to accept.
        exclude_parts: Directory names that disqualify a file when they
            appear anywhere between *root* and the file.
        limit: Maximum number of files to return. ``None`` or ``0`` means
            "no limit".

    Returns:
        Matching paths in the order ``root.rglob`` produced them.
    """
    # Only a positive limit lets us stop walking early; any other value keeps
    # the original "collect everything, then slice" semantics.
    stop_at = limit if limit is not None and limit > 0 else None

    files: list[Path] = []
    for path in root.rglob("*"):
        # Cheap string test first, so non-matching entries never cost a stat().
        if path.suffix.lower() not in extensions or not path.is_file():
            continue
        # Judge exclusions relative to root: a root that itself sits in (or is)
        # a directory named e.g. "Build" must not exclude every file below it.
        if not exclude_parts.isdisjoint(path.relative_to(root).parts):
            continue
        files.append(path)
        if stop_at is not None and len(files) >= stop_at:
            break
    if limit:
        files = files[:limit]
    return files
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def infer_module_name(path: Path, source_marker: str = "Source", categories: set[str] | None = None) -> str | None:
    """Derive a module name from the directories that follow *source_marker*.

    The first path component after the first occurrence of *source_marker*
    that is neither listed in ``_INVALID_MODULE_NAMES`` nor in *categories*
    is returned. The final component (the file name) is never considered.

    Returns:
        The module name, or ``None`` when the marker is absent or no
        component after it qualifies.
    """
    components = path.parts
    try:
        marker_pos = components.index(source_marker)
    except ValueError:
        # Marker is not part of the path at all.
        return None

    skipped_categories = categories if categories else ()
    # Slice stops before the last component so a file name is never returned.
    for name in components[marker_pos + 1:-1]:
        if name not in _INVALID_MODULE_NAMES and name not in skipped_categories:
            return name
    return None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _get_configs(
    project: ProjectConfig,
) -> dict[str, tuple[LanguageConfig, FrameworkConfig]]:
    """Map every configured file extension to its (language, framework) pair.

    The Unreal framework is only created when ``project.framework_name`` is
    ``"unreal"``, and it is only attached to extensions whose language is
    ``"cpp"``. Every other extension, and every extension in a non-Unreal
    project, is paired with the generic framework.
    """
    from .configs import make_generic_framework

    # Frameworks: the Unreal one is optional, the generic one always exists.
    unreal_fw = make_unreal_framework() if project.framework_name == "unreal" else None
    generic_fw = make_generic_framework()

    # One LanguageConfig per distinct language name, resolved via the registry.
    ext_map = project.extension_to_language
    language_configs: dict[str, LanguageConfig] = {
        name: get_language(name) for name in set(ext_map.values())
    }

    def pick_framework(language_name: str) -> FrameworkConfig:
        # Unreal rules are applied to C++ only; all other languages stay generic.
        if unreal_fw and language_name == "cpp":
            return unreal_fw
        return generic_fw

    return {
        ext: (language_configs[name], pick_framework(name))
        for ext, name in ext_map.items()
    }
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _process_file(
    file_id: int,
    content: str,
    lines: list[str],
    lang: LanguageConfig,
    fw: FrameworkConfig,
) -> tuple[list, list, list]:
    """Run symbol analysis and edge extraction for one file.

    *content* is accepted for interface compatibility but is not read here;
    both analysis steps work on *lines*.

    Returns:
        ``(symbols, extras, edges)`` where the first two come from
        ``analyze_file`` and the last from ``extract_edges``.
    """
    from .edge_extractor import extract_edges
    from .symbol_analyzer import analyze_file

    analysis = analyze_file(lines, file_id, lang, fw)
    found_symbols, extra_symbols = analysis
    found_edges = extract_edges(found_symbols, extra_symbols, lines, fw, lang)
    return found_symbols, extra_symbols, found_edges
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def build_index(
    root: Path,
    db_path: Path,
    limit: int | None = None,
    project: ProjectConfig | None = None,
) -> int:
    """Rebuild the index database at *db_path* from the sources under *root*.

    The three tables (``strict_edges``, ``symbol_index``, ``file_content``)
    are emptied first, then every matching file is read, stored, analysed
    and its symbols and edges inserted. Work is committed every 200 files.
    Afterwards edges that join symbols of different languages are deleted
    and summary statistics are printed.

    Args:
        root: Source root directory to scan.
        db_path: Path of the SQLite database to (re)populate.
        limit: Optional cap on the number of files, passed to
            ``iter_source_files``.
        project: Project configuration; the generic project config is used
            when omitted.

    Returns:
        The number of files that were read and indexed.
    """
    from .configs import make_generic_framework, make_generic_project
    from .db import commit, connect, initialize_schema, insert_edges, insert_extra_symbols, insert_symbols, upsert_file

    if project is None:
        # Default to generic project config
        project = make_generic_project()

    conn = connect(db_path)
    try:
        initialize_schema(conn)

        # Clear existing data for clean rebuild
        conn.execute("DELETE FROM strict_edges")
        conn.execute("DELETE FROM symbol_index")
        conn.execute("DELETE FROM file_content")
        conn.commit()

        extensions = set(project.extension_to_language.keys())
        ext_configs = _get_configs(project)
        # Built lazily and at most once: every file returned below has an
        # extension taken from ext_configs' own key set, so this is normally
        # never needed.
        fallback_configs = None

        files = iter_source_files(root, extensions, project.exclude_parts, limit)
        total = len(files)
        print(f"Indexing {total} files from {root}...", flush=True)
        t_start = time.time()

        count = 0
        sym_count = 0
        edge_count = 0
        batch_size = 200

        for path in files:
            try:
                content = path.read_text(encoding="utf-8", errors="replace")
            except OSError:
                # Best effort: an unreadable file is skipped, not fatal.
                continue

            digest = hashlib.sha256(content.encode("utf-8", errors="replace")).hexdigest()
            rel = str(path.relative_to(root)).replace("\\", "/")
            module = infer_module_name(path, project.source_marker, project.categories)

            # Dispatch by file extension
            configs = ext_configs.get(path.suffix.lower())
            if configs is None:
                if fallback_configs is None:
                    fallback_configs = (get_language("cpp"), make_generic_framework())
                configs = fallback_configs
            lang, fw_for_file = configs

            file_id = upsert_file(conn, rel, module, content, digest, language=lang.name)
            lines = content.split("\n")

            sym_rows, extra_rows, edge_rows = _process_file(file_id, content, lines, lang, fw_for_file)

            insert_symbols(conn, sym_rows)
            insert_extra_symbols(conn, extra_rows)
            insert_edges(conn, edge_rows)

            sym_count += len(sym_rows) + len(extra_rows)
            edge_count += len(edge_rows)
            count += 1

            if count % batch_size == 0:
                commit(conn)
                elapsed = time.time() - t_start
                rate = count / elapsed if elapsed > 0 else 0
                print(f" {count}/{total} ({count/total*100:.0f}%) "
                      f"{rate:.0f} files/s, {sym_count:,} symbols, {edge_count:,} edges", flush=True)

        commit(conn)
        elapsed = time.time() - t_start
        print(f"Done: {count} files, {sym_count:,} symbols, {edge_count:,} edges in {elapsed:.1f}s", flush=True)

        _remove_cross_language_edges(conn)
        _print_db_stats(conn)
        return count
    finally:
        # Release the database handle even when indexing fails part-way.
        conn.close()


def _remove_cross_language_edges(conn) -> None:
    """Delete edges whose source and target symbols differ in language, and report how many went."""
    # Cross-language edge cleanup: remove edges where source and target
    # symbols are defined in different languages
    cross_before = conn.execute("SELECT COUNT(*) AS c FROM strict_edges").fetchone()["c"]
    conn.execute("""
        DELETE FROM strict_edges WHERE id IN (
            SELECT e.id FROM strict_edges e
            JOIN symbol_index si_src ON e.source_qn = si_src.qualified_name
            JOIN symbol_index si_tgt ON e.target_qn = si_tgt.qualified_name
            WHERE si_src.language != si_tgt.language
        )
    """)
    conn.commit()
    cross_after = conn.execute("SELECT COUNT(*) AS c FROM strict_edges").fetchone()["c"]
    removed = cross_before - cross_after
    if removed > 0:
        print(f"Cross-language edges removed: {removed:,} ({cross_before:,} -> {cross_after:,})", flush=True)


def _print_db_stats(conn) -> None:
    """Print table row counts plus per-edge-type and per-language breakdowns."""
    # Print summary stats
    fc = conn.execute("SELECT COUNT(*) AS c FROM file_content").fetchone()["c"]
    sc = conn.execute("SELECT COUNT(*) AS c FROM symbol_index").fetchone()["c"]
    ec = conn.execute("SELECT COUNT(*) AS c FROM strict_edges").fetchone()["c"]
    print(f"DB stats: {fc:,} files, {sc:,} symbols, {ec:,} edges", flush=True)

    # Print edge type breakdown
    for row in conn.execute(
        "SELECT edge_type, COUNT(*) AS c FROM strict_edges GROUP BY edge_type ORDER BY c DESC"
    ).fetchall():
        print(f" {row['edge_type']}: {row['c']:,}", flush=True)

    # Print language breakdown
    for row in conn.execute(
        "SELECT language, COUNT(*) AS c FROM symbol_index GROUP BY language ORDER BY c DESC"
    ).fetchall():
        print(f" symbols[{row['language']}]: {row['c']:,}", flush=True)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def main() -> None:
    """Command-line entry point: parse arguments, assemble a ProjectConfig, build the index."""
    cli = argparse.ArgumentParser(
        description="Build SQLite index for source code (three-table architecture)."
    )
    cli.add_argument("root", type=Path, help="Source root directory")
    cli.add_argument("db", type=Path, help="SQLite database path")
    cli.add_argument("--limit", type=int, default=None, help="File limit for testing")
    cli.add_argument(
        "--source-marker",
        default="",
        help="Path component marking source root (default: auto-detect)",
    )
    cli.add_argument(
        "--categories",
        default="",
        help="Comma-separated category dirs to skip after source marker",
    )
    cli.add_argument(
        "--framework",
        default="generic",
        choices=["unreal", "generic"],
        help="Framework rules to apply (default: generic)",
    )
    opts = cli.parse_args()

    # Blank entries (e.g. from "a,,b" or an empty option) are dropped.
    category_names = frozenset(
        piece.strip() for piece in opts.categories.split(",") if piece.strip()
    )

    if opts.framework == "unreal":
        framework = make_unreal_framework()
        from .configs import make_unreal_project
        template = make_unreal_project(framework=framework)
        # Unreal projects fall back to the "Source" directory as marker.
        marker = opts.source_marker or "Source"
    else:
        from .configs import make_generic_framework, make_generic_project
        framework = make_generic_framework()
        template = make_generic_project()
        marker = opts.source_marker

    # Start from the framework's template project and override the CLI-driven fields.
    project = ProjectConfig(
        extension_to_language=template.extension_to_language,
        exclude_parts=template.exclude_parts,
        source_marker=marker,
        categories=category_names,
        invalid_module_names=template.invalid_module_names,
        framework_name=framework.name,
    )

    build_index(opts.root, opts.db, opts.limit, project)


if __name__ == "__main__":
    main()
|