code-explore-by-sql 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_explore_by_sql-0.1.0.dist-info/METADATA +205 -0
- code_explore_by_sql-0.1.0.dist-info/RECORD +29 -0
- code_explore_by_sql-0.1.0.dist-info/WHEEL +4 -0
- code_explore_by_sql-0.1.0.dist-info/entry_points.txt +3 -0
- code_explore_by_sql-0.1.0.dist-info/licenses/LICENSE +21 -0
- code_source_sql/__init__.py +9 -0
- code_source_sql/__main__.py +5 -0
- code_source_sql/bracket_scanner.py +385 -0
- code_source_sql/build_db.py +284 -0
- code_source_sql/code_block_summary.py +522 -0
- code_source_sql/configs.py +402 -0
- code_source_sql/db.py +625 -0
- code_source_sql/edge_extractor.py +183 -0
- code_source_sql/languages/__init__.py +31 -0
- code_source_sql/languages/c.py +118 -0
- code_source_sql/languages/cpp.py +106 -0
- code_source_sql/languages/csharp.py +103 -0
- code_source_sql/languages/glsl.py +162 -0
- code_source_sql/languages/go.py +91 -0
- code_source_sql/languages/hlsl.py +155 -0
- code_source_sql/languages/java.py +98 -0
- code_source_sql/languages/javascript.py +215 -0
- code_source_sql/languages/kotlin.py +108 -0
- code_source_sql/languages/python.py +105 -0
- code_source_sql/languages/rust.py +91 -0
- code_source_sql/languages/swift.py +116 -0
- code_source_sql/server.py +264 -0
- code_source_sql/symbol_analyzer.py +487 -0
- code_source_sql/unreal_rules.py +163 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: code-explore-by-sql
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: SQLite FTS5 (trigram) MCP server for code source search.
|
|
5
|
+
Project-URL: Repository, https://github.com/didi514354875/code-explore-by-sql
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Software Development :: Code Generators
|
|
17
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: mcp[cli]>=1.2.0
|
|
20
|
+
Requires-Dist: pydantic>=2.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# code-explore-by-sql
|
|
24
|
+
|
|
25
|
+
Local stdio MCP server for fast source code navigation using **SQLite FTS5** (trigram tokenizer) + **bracket skeleton indexing**.
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
- **Full-text search**: FTS5 with trigram tokenizer for code-symbol-precise search (`GetGBuffer`, `FMaterial`, `UE_LOG`)
|
|
30
|
+
- **Symbol lookup**: read code by qualified name with fuzzy matching (`Jump` → `ACharacter::Jump`)
|
|
31
|
+
- **Bracket skeleton index**: lightweight structural indexing via FSM brace matching (no AST parser needed)
|
|
32
|
+
- **Support for 12 languages**: C, C++, C#, Go, HLSL, GLSL, Java, JavaScript, Kotlin, Python, Rust, Swift
|
|
33
|
+
- **Multi-database**: query multiple codebases simultaneously via `CODE_SOURCE_DBS`
|
|
34
|
+
- **Token-efficient responses**: compact snippets (~2,600 tokens/20 results, 95% reduction vs full file reads)
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
### From PyPI (recommended)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Run the MCP server directly (no clone needed)
|
|
42
|
+
uvx code-explore-by-sql
|
|
43
|
+
|
|
44
|
+
# Or install persistently
|
|
45
|
+
pip install code-explore-by-sql
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Build a database
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Build index for your codebase
|
|
52
|
+
uvx code-source-sql-build-db /path/to/source /path/to/output.db
|
|
53
|
+
|
|
54
|
+
# Smoke test with limited files
|
|
55
|
+
uvx code-source-sql-build-db /path/to/source /path/to/output.db --limit 1000
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Performance: ~84,700 files indexed in ~3.3 minutes on a 2-core machine.
|
|
59
|
+
|
|
60
|
+
### Configure in MCP clients
|
|
61
|
+
|
|
62
|
+
**Claude Code** (`.claude/mcp.json`):
|
|
63
|
+
```json
|
|
64
|
+
{
|
|
65
|
+
"mcpServers": {
|
|
66
|
+
"code-source-sql": {
|
|
67
|
+
"command": "uvx",
|
|
68
|
+
"args": ["code-explore-by-sql"],
|
|
69
|
+
"env": {
|
|
70
|
+
"CODE_SOURCE_DB": "/path/to/your/code.db",
|
|
71
|
+
"CODE_SOURCE_DBS": "/path/to/your/code.db:/path/to/another.db"
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**VS Code** (`.vscode/mcp.json`):
|
|
79
|
+
```json
|
|
80
|
+
{
|
|
81
|
+
"servers": {
|
|
82
|
+
"code-source-sql": {
|
|
83
|
+
"type": "stdio",
|
|
84
|
+
"command": "uvx",
|
|
85
|
+
"args": ["code-explore-by-sql"],
|
|
86
|
+
"env": {
|
|
87
|
+
"CODE_SOURCE_DB": "/path/to/your/code.db"
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**OpenAI Codex** (`~/.codex/config.toml`):
|
|
95
|
+
```toml
|
|
96
|
+
[mcp_servers.code-source-sql]
|
|
97
|
+
command = "uvx"
|
|
98
|
+
args = ["code-explore-by-sql"]
|
|
99
|
+
|
|
100
|
+
[mcp_servers.code-source-sql.env]
|
|
101
|
+
CODE_SOURCE_DB = "/path/to/your/code.db"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Hermes Agent** (`~/.hermes/config.yaml`):
|
|
105
|
+
```yaml
|
|
106
|
+
mcp_servers:
|
|
107
|
+
code-source-sql:
|
|
108
|
+
command: uvx
|
|
109
|
+
args:
|
|
110
|
+
- code-explore-by-sql
|
|
111
|
+
env:
|
|
112
|
+
CODE_SOURCE_DB: /path/to/your/code.db
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Tools (5)
|
|
116
|
+
|
|
117
|
+
| Tool | Purpose |
|
|
118
|
+
|------|---------|
|
|
119
|
+
| `list_databases` | Discover available databases with stats |
|
|
120
|
+
| `search_fts_tool` | FTS5 search — locate code blocks by keyword or raw FTS5 query |
|
|
121
|
+
| `read_symbol` | Read symbol code by qualified name (exact or fuzzy) |
|
|
122
|
+
| `read_file_range` | Read source code by file path and line range |
|
|
123
|
+
| `get_directory_structure` | Module/file counts overview |
|
|
124
|
+
|
|
125
|
+
### Multi-database
|
|
126
|
+
|
|
127
|
+
Each tool accepts an optional `db` parameter to select a database by alias. Aliases are derived from database filenames (`unreal.db` → `"unreal"`). Use `list_databases` to discover available aliases. Default (`db=""`) uses the primary database (`CODE_SOURCE_DB`).
|
|
128
|
+
|
|
129
|
+
### Search query modes
|
|
130
|
+
|
|
131
|
+
**Simple mode** (`keyword`):
|
|
132
|
+
```
|
|
133
|
+
keyword="GetGBuffer"
|
|
134
|
+
keyword="FMaterial Render"
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**Advanced mode** (`raw_query`) — full FTS5 boolean:
|
|
138
|
+
```
|
|
139
|
+
raw_query='"GetGBuffer" AND "Emissive"'
|
|
140
|
+
raw_query='"Material" NOT "hlsl"'
|
|
141
|
+
raw_query='(file_path : "BasePass") AND "roughness"'
|
|
142
|
+
raw_query='(module_name : "Renderer") AND "VirtualTexture"'
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Three-level funnel
|
|
146
|
+
|
|
147
|
+
1. **`search_fts_tool(keyword)`** → file candidates + block QNs
|
|
148
|
+
2. **`search_fts_tool(raw_query, file_path filter)`** → precise block in target file
|
|
149
|
+
3. **`read_symbol(block QN)`** or **`read_file_range(file, line)`** → full code
|
|
150
|
+
|
|
151
|
+
## Architecture
|
|
152
|
+
|
|
153
|
+
```
|
|
154
|
+
┌──────────────────────────────────────────────────────────────┐
|
|
155
|
+
│ MCP Server (FastMCP) │
|
|
156
|
+
├──────────┬──────────┬──────────┬──────────┬──────────────────┤
|
|
157
|
+
│ search │ read │ read │ get_dir │ list │
|
|
158
|
+
│ fts_tool │ _symbol │ _file │ _struct │ _databases │
|
|
159
|
+
│ │ │ _range │ │ │
|
|
160
|
+
├──────────┴──────────┴──────────┴──────────┴──────────────────┤
|
|
161
|
+
│ Query Pipeline │
|
|
162
|
+
│ FTS5 trigram → Symbol match → Edge extraction │
|
|
163
|
+
├──────────────────────────────────────────────────────────────┤
|
|
164
|
+
│ SQLite Database │
|
|
165
|
+
│ file_content + FTS5 │ symbol_index │ strict_edges │
|
|
166
|
+
└──────────────────────────────────────────────────────────────┘
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Bracket skeleton index
|
|
170
|
+
|
|
171
|
+
A 9-state finite state machine (CODE, LINE_COMMENT, BLOCK_COMMENT, STRING, CHAR_LITERAL, RAW_STRING, VERBATIM_STRING, TRIPLE_STRING, TEMPLATE_LITERAL) scans source code, tracking brace pairs while correctly ignoring braces in comments and string literals. Each matched pair records `open_line`, `close_line`, `depth`, and `is_complete`.
|
|
172
|
+
|
|
173
|
+
Top-level blocks are classified by a **symbol analyzer** producing `block_type` (namespace/class/enum/function/macro) and `block_name` (qualified name).
|
|
174
|
+
|
|
175
|
+
### Multi-database registry
|
|
176
|
+
|
|
177
|
+
Databases are registered via environment variables at server startup:
|
|
178
|
+
- `CODE_SOURCE_DB` — primary database (default when `db` is omitted)
|
|
179
|
+
- `CODE_SOURCE_DBS` — colon-separated list of additional databases
|
|
180
|
+
|
|
181
|
+
Aliases are auto-derived from filename stems. Connections are cached with health checks.
|
|
182
|
+
|
|
183
|
+
## Environment Variables
|
|
184
|
+
|
|
185
|
+
| Variable | Required | Description |
|
|
186
|
+
|----------|----------|-------------|
|
|
187
|
+
| `CODE_SOURCE_DB` | Yes | Path to primary SQLite database |
|
|
188
|
+
| `CODE_SOURCE_DBS` | No | Colon-separated paths to additional databases |
|
|
189
|
+
|
|
190
|
+
## Development
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# Clone and setup
|
|
194
|
+
git clone https://github.com/didi514354875/code-explore-by-sql.git
|
|
195
|
+
cd code-explore-by-sql
|
|
196
|
+
uv sync --dev
|
|
197
|
+
|
|
198
|
+
# Run tests
|
|
199
|
+
uv run pytest
|
|
200
|
+
uv run ruff check .
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## License
|
|
204
|
+
|
|
205
|
+
MIT
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
code_source_sql/__init__.py,sha256=jsHrPvnls-PZakFA6Smi4LmIn5Mqwh8IgYMovpduwww,206
|
|
2
|
+
code_source_sql/__main__.py,sha256=BbeiAT_uKtmwhiW_6UjrUTp6uwZPBXozuvw0igKJT6Q,114
|
|
3
|
+
code_source_sql/bracket_scanner.py,sha256=A9x0ABbiE7mOr7DQmUDWq2y8En9OAK2IZqqxW-_BdoA,13720
|
|
4
|
+
code_source_sql/build_db.py,sha256=7cRI6LTF8QSNlE0kdszMBlkdEDYuUKL5WmkAPLb4FDs,9957
|
|
5
|
+
code_source_sql/code_block_summary.py,sha256=VxMjyIVdryB2o7jYMwLgjBvR63-nBCCB-g_P9RlfYRo,16519
|
|
6
|
+
code_source_sql/configs.py,sha256=7WvjFR7Fyi4lPLTmbWPT5AoVIxxRqq2399fRYmLAjbk,15907
|
|
7
|
+
code_source_sql/db.py,sha256=4ScaahbeEFtUE1VU4GgLSKz0lYF91d_HNP67A2XbkJA,22254
|
|
8
|
+
code_source_sql/edge_extractor.py,sha256=bWGedGYFWeNCxtYXsPlI79vqoUZG8fKVpjl7UFp4-UI,7157
|
|
9
|
+
code_source_sql/server.py,sha256=4KUTvQP4UbZ-SWP3BE8nVTCI32pUlhTV1u4PfNaSREU,8587
|
|
10
|
+
code_source_sql/symbol_analyzer.py,sha256=lbuKCim5Pg7sJhSeq7l86NaGPWxWkCHRKYwy2tNXvUQ,17225
|
|
11
|
+
code_source_sql/unreal_rules.py,sha256=brv_Ktg5BoOKwaZhqWSa0POrJjzBcFF4eyQUb8oNOxo,6452
|
|
12
|
+
code_source_sql/languages/__init__.py,sha256=vsN0FMT2Zw2uN8ngwrPSbCzx99IUre8SXNZszc0VKdM,1273
|
|
13
|
+
code_source_sql/languages/c.py,sha256=2p4oPjiWk64-35A7pkHDGIeaJh2YePi_LR55lF8To_I,4632
|
|
14
|
+
code_source_sql/languages/cpp.py,sha256=q8Q4Md2wDBunpOoPkLkWW4qbQkhVxc3G-7d0bR_-TGc,4723
|
|
15
|
+
code_source_sql/languages/csharp.py,sha256=KuE3L_kSVzARIE5mo6FtzDMYFtt5A7bzsyOLe7yU_14,4405
|
|
16
|
+
code_source_sql/languages/glsl.py,sha256=ODyNfLDIOFkffIeDj5QTunQDIwz27g0TeHd573pywjM,6121
|
|
17
|
+
code_source_sql/languages/go.py,sha256=1nJoIc1oFw-zyKC4Ak-IsTykyfv7HluJu3BXWIbJdQQ,3896
|
|
18
|
+
code_source_sql/languages/hlsl.py,sha256=Rzahkljdww3PXcjPhc3hR3_UbF_KGi3cKTd2wXpBx_E,6158
|
|
19
|
+
code_source_sql/languages/java.py,sha256=Bx-G8MXWJ4bAyLWI-Hp7zZMGek-yYzMREh_9Mz4_cmo,4109
|
|
20
|
+
code_source_sql/languages/javascript.py,sha256=al1OhJ1OmECrNOftOg92FUHXlwQ19dGA0r6BOrcYzgc,8247
|
|
21
|
+
code_source_sql/languages/kotlin.py,sha256=AsuDQHFG4Z6QWLNaPXS3L4xXP70hfpmcif2qrEMNRi8,4093
|
|
22
|
+
code_source_sql/languages/python.py,sha256=CLEVm3UcXKVi-5Ue1FkgNaixvsP_jB7QDfpot39aA-0,3987
|
|
23
|
+
code_source_sql/languages/rust.py,sha256=vBqMsyeFMptk2Hl9Tp5rbj3F8QHTvqWi8whlhwJDYAc,4073
|
|
24
|
+
code_source_sql/languages/swift.py,sha256=UtFiQki14A-QXwJlg2JK61PoMI_e9ZUtlLb3SUqtIe0,4325
|
|
25
|
+
code_explore_by_sql-0.1.0.dist-info/METADATA,sha256=MKx9wA1szklj1C7QJ63O3u0r7VTIOX0lCp32Kchmhko,7332
|
|
26
|
+
code_explore_by_sql-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
27
|
+
code_explore_by_sql-0.1.0.dist-info/entry_points.txt,sha256=6WGaA6HTPD_ZLScWXGUpK81hl9MRM_McDqPu0OWD6Tc,114
|
|
28
|
+
code_explore_by_sql-0.1.0.dist-info/licenses/LICENSE,sha256=SC5Fwf9HNtdRRBSyPXU9NWn3it1kbpEpXGOQbhxa7KQ,1063
|
|
29
|
+
code_explore_by_sql-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 yanwei
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
"""Bracket skeleton scanner — lightweight structural indexing for source code.
|
|
2
|
+
|
|
3
|
+
FSM tracks brace depth while correctly ignoring braces inside comments,
|
|
4
|
+
string literals, character literals, raw strings, verbatim strings,
|
|
5
|
+
triple-quoted strings, and template literals. Supports multiple languages
|
|
6
|
+
via optional LanguageConfig parameter. When `lang is None`, defaults to
|
|
7
|
+
C/C++ syntax (backward compatible).
|
|
8
|
+
|
|
9
|
+
For indent-based languages (Python), delegates to _scan_indent_blocks().
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import TYPE_CHECKING
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from .configs import LanguageConfig
|
|
19
|
+
|
|
20
|
+
# FSM states
|
|
21
|
+
_CODE = 0
|
|
22
|
+
_LINE_COMMENT = 1
|
|
23
|
+
_BLOCK_COMMENT = 2
|
|
24
|
+
_STRING = 3
|
|
25
|
+
_CHAR_LITERAL = 4
|
|
26
|
+
_RAW_STRING = 5
|
|
27
|
+
_VERBATIM_STRING = 6
|
|
28
|
+
_TRIPLE_STRING = 7
|
|
29
|
+
_TEMPLATE_LITERAL = 8
|
|
30
|
+
|
|
31
|
+
_TRIGGER_CHARS_BASE = frozenset('{}"/\'R@')
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True)
class BracketBlock:
    """One brace pair (or indent pseudo-block) found by the scanner.

    Instances are immutable and compare/hash by field values.
    """

    # Line on which the block opens (1-based).
    open_line: int
    # Line on which the block closes (1-based).
    close_line: int
    # Nesting depth assigned by the scanner; outermost blocks have depth 1.
    depth: int
    # False when the scanner reached end of input before the block closed.
    is_complete: bool
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def scan_brackets(
    content: str,
    verbatim_string_prefix: str | None = None,
    raw_string_char: str | None = None,
    lang: LanguageConfig | None = None,
) -> list[BracketBlock]:
    """Scan content and return matched brace pairs with depth info.

    The scanner walks the text line by line with a small state machine so
    that braces inside comments and string-like literals are not counted.

    Args:
        content: Source code text to scan.
        verbatim_string_prefix: If set (e.g. '@' for C#), enables verbatim
            string handling. Braces inside verbatim strings are ignored.
            Ignored when `lang` is provided (resolved from lang instead).
        raw_string_char: If set (e.g. 'R' for C++), enables raw string
            literal handling. Braces inside raw strings are ignored.
            Ignored when `lang` is provided (resolved from lang instead).
        lang: Optional LanguageConfig for multi-language support. When
            provided, all syntax parameters are resolved from it. When
            None, defaults to C/C++ syntax (backward compatible).

    Returns:
        One BracketBlock per opening brace. Pairs that closed are appended
        in closing order with ``is_complete=True``; braces still open at
        end of input follow with ``is_complete=False`` and ``close_line``
        set to the number of lines produced by splitting on "\\n".
        Indent-based languages are delegated to ``_scan_indent_blocks``.
    """
    # Indent-based languages have no braces to match.
    if lang and lang.uses_indent_blocks:
        return _scan_indent_blocks(content, lang)

    # Resolve syntax parameters from lang or legacy params
    if lang:
        _line_comment = lang.line_comment
        _block_comment_open, _block_comment_close = lang.block_comment_pair or (None, None)
        _string_delims = lang.string_delimiters
        _escape_char = lang.string_escape_char
        _triple_quotes = lang.triple_quote_strings
        _has_template_strings = lang.has_template_strings
        _raw_style = lang.raw_string_style
        # Override legacy params from lang
        raw_string_char = lang.raw_string_char
        verbatim_string_prefix = lang.verbatim_string_prefix
    else:
        _line_comment = "//"
        _block_comment_open, _block_comment_close = "/*", "*/"
        _string_delims = frozenset({'"', "'"})
        _escape_char = "\\"
        _triple_quotes = ()
        _has_template_strings = False
        _raw_style = "cpp"

    depth = 0
    stack: list[tuple[int, int]] = []  # (open_line, open_depth)
    blocks: list[BracketBlock] = []
    state = _CODE
    raw_end_marker = ""  # closing token of the raw string being scanned
    _current_string_delim = '"'
    _current_triple_end = '"""'

    # Characters that can change state or depth.  A line containing none of
    # them is skipped while in _CODE.  Every token that opens a multi-line
    # construct must contribute its first character, otherwise the skip
    # would miss the opener and later braces would be miscounted.
    trigger_chars = set('{}"/\'')
    trigger_chars.update(_string_delims or ())
    if _block_comment_open:
        trigger_chars.add(_block_comment_open[0])
    for tq in _triple_quotes or ():
        if tq:
            trigger_chars.add(tq[0])
    if raw_string_char:
        trigger_chars.add(raw_string_char)
    if verbatim_string_prefix:
        trigger_chars.add(verbatim_string_prefix)
    if _has_template_strings:
        trigger_chars.add('`')

    lines = content.split("\n")
    last_line = len(lines)

    for line_idx, line in enumerate(lines, start=1):
        if state == _CODE and not any(c in line for c in trigger_chars):
            continue

        i = 0
        n = len(line)
        while i < n:
            ch = line[i]
            next_ch = line[i + 1] if i + 1 < n else ""

            if state == _CODE:
                # Line comment
                if _line_comment and line.startswith(_line_comment, i):
                    state = _LINE_COMMENT
                    i += len(_line_comment)
                    continue
                # Block comment open
                if _block_comment_open and line.startswith(_block_comment_open, i):
                    state = _BLOCK_COMMENT
                    i += len(_block_comment_open)
                    continue
                # Raw string (C++ R"delim(...)delim" or Rust r#"..."#)
                if raw_string_char and ch == raw_string_char:
                    if _raw_style == "rust":
                        # Rust: r"...", r#"..."#, r##"..."## -- the hashes
                        # sit between the prefix and the opening quote, and
                        # the literal ends at a quote followed by the same
                        # number of hashes.
                        j = i + 1
                        while j < n and line[j] == '#':
                            j += 1
                        if j < n and line[j] == '"':
                            raw_end_marker = '"' + "#" * (j - i - 1)
                            state = _RAW_STRING
                            i = j + 1
                            continue
                    elif next_ch == '"':
                        # C++ style: R"delim(...)delim".  The delimiter is
                        # at most 16 characters and may not contain spaces,
                        # parentheses, backslashes or control characters.
                        # Rejecting anything else keeps e.g. PRIxPTR"\n"
                        # followed by a later "(" from being taken as a raw
                        # string that never terminates.
                        delim_end = line.find("(", i + 2)
                        if delim_end != -1:
                            delim = line[i + 2:delim_end]
                            if len(delim) <= 16 and not any(
                                c in " )\\" or c < " " for c in delim
                            ):
                                raw_end_marker = ")" + delim + '"'
                                state = _RAW_STRING
                                i = delim_end + 1
                                continue
                # Verbatim string: @"..." (C#) — doubled "" is an escaped quote
                if verbatim_string_prefix and ch == verbatim_string_prefix and next_ch == '"':
                    state = _VERBATIM_STRING
                    i += 2
                    continue
                # Triple-quoted strings (checked before plain delimiters so
                # the longer token wins)
                if _triple_quotes:
                    matched_tq = None
                    for tq in _triple_quotes:
                        if tq and line.startswith(tq, i):
                            matched_tq = tq
                            break
                    if matched_tq:
                        _current_triple_end = matched_tq
                        state = _TRIPLE_STRING
                        i += len(matched_tq)
                        continue
                # Template literals (JavaScript/TypeScript backtick)
                if _has_template_strings and ch == '`':
                    state = _TEMPLATE_LITERAL
                    i += 1
                    continue
                # String delimiters
                if ch in _string_delims:
                    state = _STRING
                    _current_string_delim = ch
                    i += 1
                    continue
                # Char literal (only when ' is NOT a string delimiter).
                # Only enter the state when the literal visibly closes:
                # either an escape follows, or the quote two characters on
                # closes a one-character literal.  A lone quote (Rust
                # lifetime 'a, loop label 'outer:) is ordinary code.
                if ch == "'" and "'" not in _string_delims:
                    if (_escape_char and next_ch == _escape_char) or line[i + 2:i + 3] == "'":
                        state = _CHAR_LITERAL
                    i += 1
                    continue

                if ch == "{":
                    depth += 1
                    stack.append((line_idx, depth))
                elif ch == "}":
                    # An unmatched closing brace is ignored.
                    if stack:
                        open_line, open_depth = stack.pop()
                        blocks.append(BracketBlock(
                            open_line=open_line,
                            close_line=line_idx,
                            depth=open_depth,
                            is_complete=True,
                        ))
                    depth = max(0, depth - 1)

                i += 1

            elif state == _LINE_COMMENT:
                # Rest of the line is comment; state is reset below.
                break

            elif state == _BLOCK_COMMENT:
                pos = line.find(_block_comment_close, i) if _block_comment_close else -1
                if pos == -1:
                    break  # comment continues on the next line
                state = _CODE
                i = pos + len(_block_comment_close)

            elif state == _STRING:
                if _escape_char and ch == _escape_char:
                    i += 2  # skip the escaped character
                    continue
                if ch == _current_string_delim:
                    state = _CODE
                i += 1

            elif state == _CHAR_LITERAL:
                if _escape_char and ch == _escape_char:
                    i += 2
                    continue
                if ch == "'":
                    state = _CODE
                i += 1

            elif state == _RAW_STRING:
                pos = line.find(raw_end_marker, i)
                if pos == -1:
                    break  # raw string continues on the next line
                state = _CODE
                i = pos + len(raw_end_marker)

            elif state == _VERBATIM_STRING:
                # In verbatim strings, "" is an escaped quote, " alone ends the string
                if ch == '"':
                    if next_ch == '"':
                        i += 2  # skip escaped quote
                        continue
                    state = _CODE
                i += 1

            elif state == _TRIPLE_STRING:
                pos = line.find(_current_triple_end, i)
                if pos == -1:
                    break  # multi-line triple string
                state = _CODE
                i = pos + len(_current_triple_end)

            elif state == _TEMPLATE_LITERAL:
                if _escape_char and ch == _escape_char:
                    i += 2
                    continue
                if ch == '`':
                    state = _CODE
                i += 1

        # Line comments end with the line.  Character literals cannot span
        # lines either, so an unterminated one must not leak into the next
        # line and hide its braces.
        if state in (_LINE_COMMENT, _CHAR_LITERAL):
            state = _CODE

    # Braces that never closed are reported as incomplete blocks.
    for open_line, open_depth in stack:
        blocks.append(BracketBlock(
            open_line=open_line,
            close_line=last_line,
            depth=open_depth,
            is_complete=False,
        ))

    return blocks
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _scan_indent_blocks(content: str, lang: LanguageConfig) -> list[BracketBlock]:
    """Scan indent-based source and return pseudo-brace blocks.

    A block opens when a significant line is indented deeper than the
    previous significant line; the block's ``open_line`` is that previous
    line. A block closes on the line before the first significant line
    indented less than the block body. Blank lines, comment-only lines and
    the continuation lines of a multi-line triple-quoted string are not
    significant. The line that *opens* such a string is significant: its
    indentation is real code indentation.

    Args:
        content: Source text to scan.
        lang: Language configuration; ``line_comment`` and
            ``triple_quote_strings`` are read from it.

    Returns:
        Blocks closed by a dedent (``is_complete=True``) in closing order,
        followed by the blocks still open at end of input
        (``is_complete=False``, ``close_line`` = number of lines produced
        by splitting on "\\n").
    """
    lines = content.split("\n")
    blocks: list[BracketBlock] = []
    stack: list[tuple[int, int, int]] = []  # (open_line_1based, depth, indent_level)
    depth = 0
    triple_end = ""  # closing token while inside a multi-line triple string
    prev_indent = 0  # indent of previous significant line
    prev_line_idx = 0  # 1-based line number of previous significant line

    line_comment = lang.line_comment
    triple_quotes = lang.triple_quote_strings or ()

    for line_idx, line in enumerate(lines, start=1):
        # Continuation lines of a triple-quoted string carry no structure.
        if triple_end:
            if triple_end in line:
                triple_end = ""
            continue

        stripped = line.strip()

        # Skip blank and comment-only lines for indent comparison.  This is
        # checked before looking for triple quotes so that a quote token
        # inside a comment cannot start a string.
        if not stripped or (line_comment and stripped.startswith(line_comment)):
            continue

        indent = len(line) - len(line.lstrip())

        # Dedent: pop stack until matching indent
        while stack and indent < stack[-1][2]:
            open_line, block_depth, _ = stack.pop()
            blocks.append(BracketBlock(
                open_line=open_line,
                close_line=line_idx - 1,
                depth=block_depth,
                is_complete=True,
            ))
            depth -= 1

        # Indent increase: open a new block (open_line is previous significant line)
        if indent > prev_indent and prev_line_idx > 0:
            depth += 1
            stack.append((prev_line_idx, depth, indent))

        prev_indent = indent
        prev_line_idx = line_idx

        # Does this line open a triple-quoted string that is still open at
        # the end of the line?  If so, skip the following lines until the
        # matching token appears.
        for tq in triple_quotes:
            idx = stripped.find(tq)
            if idx != -1 and tq not in stripped[idx + len(tq):]:
                triple_end = tq
                break

    # Close remaining open blocks
    total_lines = len(lines)
    for open_line, block_depth, _ in stack:
        blocks.append(BracketBlock(
            open_line=open_line,
            close_line=total_lines,
            depth=block_depth,
            is_complete=False,
        ))

    return blocks
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def compute_parent_map(blocks: list[BracketBlock]) -> dict[tuple[int, int], tuple[int, int] | None]:
    """Compute parent for each block based on nesting.

    Blocks are visited in order of ``(open_line, depth)``. The depth is
    part of the sort key because the scanner emits blocks in closing order
    (inner before outer): when a parent and its child open on the same
    line, the parent must still be visited first.

    Args:
        blocks: Blocks as returned by the scanner; only ``open_line``,
            ``close_line`` and ``depth`` are read.

    Returns:
        Dict mapping (open_line, depth) -> parent's (open_line, depth), or
        None for depth-1 blocks and blocks with no enclosing block one
        level up. Blocks sharing the same (open_line, depth) share one
        entry; the last one visited wins.
    """
    ordered = sorted(blocks, key=lambda b: (b.open_line, b.depth))
    active: dict[int, BracketBlock] = {}  # depth -> most recent block still open
    parent_map: dict[tuple[int, int], tuple[int, int] | None] = {}

    for block in ordered:
        # Forget blocks that closed before this one opens.
        active = {
            level: candidate
            for level, candidate in active.items()
            if candidate.close_line >= block.open_line
        }

        key = (block.open_line, block.depth)
        parent = active.get(block.depth - 1) if block.depth > 1 else None
        parent_map[key] = (parent.open_line, parent.depth) if parent is not None else None

        active[block.depth] = block

    return parent_map
|