coderay 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderay-1.0.0/PKG-INFO +145 -0
- coderay-1.0.0/README.md +107 -0
- coderay-1.0.0/pyproject.toml +76 -0
- coderay-1.0.0/setup.cfg +4 -0
- coderay-1.0.0/src/coderay/__init__.py +1 -0
- coderay-1.0.0/src/coderay/chunking/__init__.py +0 -0
- coderay-1.0.0/src/coderay/chunking/chunker.py +127 -0
- coderay-1.0.0/src/coderay/chunking/registry.py +190 -0
- coderay-1.0.0/src/coderay/cli/__init__.py +3 -0
- coderay-1.0.0/src/coderay/cli/commands.py +475 -0
- coderay-1.0.0/src/coderay/core/__init__.py +0 -0
- coderay-1.0.0/src/coderay/core/config.py +73 -0
- coderay-1.0.0/src/coderay/core/lock.py +36 -0
- coderay-1.0.0/src/coderay/core/models.py +71 -0
- coderay-1.0.0/src/coderay/core/timing.py +45 -0
- coderay-1.0.0/src/coderay/core/utils.py +35 -0
- coderay-1.0.0/src/coderay/embedding/__init__.py +0 -0
- coderay-1.0.0/src/coderay/embedding/base.py +60 -0
- coderay-1.0.0/src/coderay/embedding/local.py +68 -0
- coderay-1.0.0/src/coderay/embedding/openai.py +87 -0
- coderay-1.0.0/src/coderay/graph/__init__.py +19 -0
- coderay-1.0.0/src/coderay/graph/builder.py +128 -0
- coderay-1.0.0/src/coderay/graph/code_graph.py +311 -0
- coderay-1.0.0/src/coderay/graph/extractor.py +315 -0
- coderay-1.0.0/src/coderay/mcp_server/__init__.py +0 -0
- coderay-1.0.0/src/coderay/mcp_server/server.py +178 -0
- coderay-1.0.0/src/coderay/pipeline/__init__.py +0 -0
- coderay-1.0.0/src/coderay/pipeline/indexer.py +417 -0
- coderay-1.0.0/src/coderay/pipeline/watcher.py +318 -0
- coderay-1.0.0/src/coderay/retrieval/__init__.py +3 -0
- coderay-1.0.0/src/coderay/retrieval/boosting.py +80 -0
- coderay-1.0.0/src/coderay/retrieval/search.py +121 -0
- coderay-1.0.0/src/coderay/skeleton/__init__.py +0 -0
- coderay-1.0.0/src/coderay/skeleton/extractor.py +140 -0
- coderay-1.0.0/src/coderay/state/__init__.py +8 -0
- coderay-1.0.0/src/coderay/state/machine.py +242 -0
- coderay-1.0.0/src/coderay/state/version.py +47 -0
- coderay-1.0.0/src/coderay/storage/__init__.py +0 -0
- coderay-1.0.0/src/coderay/storage/lancedb.py +268 -0
- coderay-1.0.0/src/coderay/vcs/__init__.py +0 -0
- coderay-1.0.0/src/coderay/vcs/git.py +193 -0
- coderay-1.0.0/src/coderay.egg-info/PKG-INFO +145 -0
- coderay-1.0.0/src/coderay.egg-info/SOURCES.txt +45 -0
- coderay-1.0.0/src/coderay.egg-info/dependency_links.txt +1 -0
- coderay-1.0.0/src/coderay.egg-info/entry_points.txt +3 -0
- coderay-1.0.0/src/coderay.egg-info/requires.txt +37 -0
- coderay-1.0.0/src/coderay.egg-info/top_level.txt +1 -0
coderay-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: coderay
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: X-ray your codebase — semantic search, code graphs, file skeletons, and MCP server
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
8
|
+
Requires-Dist: networkx>=3.0
|
|
9
|
+
Requires-Dist: tree-sitter>=0.24.0
|
|
10
|
+
Requires-Dist: tree-sitter-python>=0.25.0
|
|
11
|
+
Requires-Dist: lancedb>=0.5.0
|
|
12
|
+
Requires-Dist: pyyaml>=6.0
|
|
13
|
+
Requires-Dist: click>=8.0
|
|
14
|
+
Requires-Dist: filelock>=3.0
|
|
15
|
+
Requires-Dist: fastembed>=0.4.0
|
|
16
|
+
Requires-Dist: watchdog>=4.0.0
|
|
17
|
+
Requires-Dist: pathspec>=0.12.0
|
|
18
|
+
Provides-Extra: openai
|
|
19
|
+
Requires-Dist: openai>=1.0.0; extra == "openai"
|
|
20
|
+
Provides-Extra: languages
|
|
21
|
+
Requires-Dist: tree-sitter-javascript>=0.23.0; extra == "languages"
|
|
22
|
+
Requires-Dist: tree-sitter-typescript>=0.23.0; extra == "languages"
|
|
23
|
+
Requires-Dist: tree-sitter-go>=0.23.0; extra == "languages"
|
|
24
|
+
Provides-Extra: mcp
|
|
25
|
+
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff>=0.8.0; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: openai>=1.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: httpx>=0.27.0; extra == "dev"
|
|
33
|
+
Requires-Dist: mcp>=1.0.0; extra == "dev"
|
|
34
|
+
Provides-Extra: maintain
|
|
35
|
+
Requires-Dist: pylance>=0.15.0; extra == "maintain"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: coderay[dev,languages,maintain,mcp,openai]; extra == "all"
|
|
38
|
+
|
|
39
|
+
# CodeRay
|
|
40
|
+
|
|
41
|
+
A local, offline-first semantic code indexer. Builds a vector index,
|
|
42
|
+
call/import graph, and file skeletons — exposed as an MCP server for
|
|
43
|
+
AI coding assistants and a standalone CLI.
|
|
44
|
+
|
|
45
|
+
## What you get
|
|
46
|
+
|
|
47
|
+
| Capability | What it does | Why it matters | AI assistant benefit |
|
|
48
|
+
|---|---|---|---|
|
|
49
|
+
| **Semantic search** | Find code by meaning, not keywords. "where do we handle auth errors" returns results even if the code never uses that phrase. | Grep finds text. This finds *intent*. | Better context retrieval for plan and edit modes |
|
|
50
|
+
| **Blast radius** (`get_impact_radius`) | Given a function or module, show every node reachable within N hops via calls, imports, and inheritance. | Before changing `UserService.save()`, see exactly what breaks. | Safer refactors — agent sees downstream impact before editing |
|
|
51
|
+
| **File skeleton** (`get_file_skeleton`) | Signatures, docstrings, imports — no function bodies. The API surface of a file at a glance. | Understand a 500-line file in 30 lines without reading the implementation. | Drastically fewer tokens than reading the full file |
|
|
52
|
+
| **Index status** | Chunk count, schema version, branch, last commit, store health. | Confirm the index is fresh before relying on results. | Agent self-checks before trusting search results |
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install "coderay[all] @ git+https://github.com/bogdan-copocean/coderay.git"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
For development:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git clone https://github.com/bogdan-copocean/coderay.git
|
|
64
|
+
cd coderay
|
|
65
|
+
pip install -e ".[all]"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Quick start
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
cd /path/to/your/project
|
|
72
|
+
coderay build --repo .
|
|
73
|
+
coderay search "how does authentication work"
|
|
74
|
+
coderay watch --repo .
|
|
75
|
+
coderay graph --kind calls
|
|
76
|
+
coderay skeleton src/app/main.py
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## MCP server (Claude Code / Cursor)
|
|
80
|
+
|
|
81
|
+
Add to `~/.claude/claude_code_config.json` or Cursor MCP settings:
|
|
82
|
+
|
|
83
|
+
```json
|
|
84
|
+
{
|
|
85
|
+
"mcpServers": {
|
|
86
|
+
"coderay": {
|
|
87
|
+
"command": "/path/to/your/.venv/bin/coderay-mcp",
|
|
88
|
+
"args": []
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## CLI reference
|
|
95
|
+
|
|
96
|
+
| Command | Description |
|
|
97
|
+
|---|---|
|
|
98
|
+
| `coderay build [--full] --repo .` | Build index (incremental or full rebuild) |
|
|
99
|
+
| `coderay update --repo .` | Incremental update (changed files only) |
|
|
100
|
+
| `coderay watch --repo . [--debounce N]` | Watch for file changes, re-index automatically |
|
|
101
|
+
| `coderay search "query" [--top-k N]` | Semantic search |
|
|
102
|
+
| `coderay list [--by-file]` | List indexed chunks |
|
|
103
|
+
| `coderay status` | Index state, branch, commit, chunk count |
|
|
104
|
+
| `coderay maintain --repo .` | Compact index, reclaim space |
|
|
105
|
+
| `coderay skeleton FILE` | Print file skeleton |
|
|
106
|
+
| `coderay graph --kind calls\|imports` | List graph edges |
|
|
107
|
+
|
|
108
|
+
## Configuration
|
|
109
|
+
|
|
110
|
+
Optional `config.yaml` in the index directory:
|
|
111
|
+
|
|
112
|
+
```yaml
|
|
113
|
+
embedder:
|
|
114
|
+
provider: local # local | openai
|
|
115
|
+
model: all-MiniLM-L6-v2
|
|
116
|
+
dimensions: 384
|
|
117
|
+
|
|
118
|
+
search:
|
|
119
|
+
boost_rules:
|
|
120
|
+
"tests/": 0.5
|
|
121
|
+
"src/core/": 1.2
|
|
122
|
+
|
|
123
|
+
graph:
|
|
124
|
+
exclude_callees:
|
|
125
|
+
- "our_sdk_helper"
|
|
126
|
+
include_callees:
|
|
127
|
+
- "isinstance"
|
|
128
|
+
|
|
129
|
+
watch:
|
|
130
|
+
debounce_seconds: 2
|
|
131
|
+
branch_switch_threshold: 50
|
|
132
|
+
exclude_patterns:
|
|
133
|
+
- "*.log"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Development
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
pip install -e ".[dev]"
|
|
140
|
+
make test
|
|
141
|
+
make lint
|
|
142
|
+
make format
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Requires Python >= 3.10 and Git.
|
coderay-1.0.0/README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# CodeRay
|
|
2
|
+
|
|
3
|
+
A local, offline-first semantic code indexer. Builds a vector index,
|
|
4
|
+
call/import graph, and file skeletons — exposed as an MCP server for
|
|
5
|
+
AI coding assistants and a standalone CLI.
|
|
6
|
+
|
|
7
|
+
## What you get
|
|
8
|
+
|
|
9
|
+
| Capability | What it does | Why it matters | AI assistant benefit |
|
|
10
|
+
|---|---|---|---|
|
|
11
|
+
| **Semantic search** | Find code by meaning, not keywords. "where do we handle auth errors" returns results even if the code never uses that phrase. | Grep finds text. This finds *intent*. | Better context retrieval for plan and edit modes |
|
|
12
|
+
| **Blast radius** (`get_impact_radius`) | Given a function or module, show every node reachable within N hops via calls, imports, and inheritance. | Before changing `UserService.save()`, see exactly what breaks. | Safer refactors — agent sees downstream impact before editing |
|
|
13
|
+
| **File skeleton** (`get_file_skeleton`) | Signatures, docstrings, imports — no function bodies. The API surface of a file at a glance. | Understand a 500-line file in 30 lines without reading the implementation. | Drastically fewer tokens than reading the full file |
|
|
14
|
+
| **Index status** | Chunk count, schema version, branch, last commit, store health. | Confirm the index is fresh before relying on results. | Agent self-checks before trusting search results |
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install "coderay[all] @ git+https://github.com/bogdan-copocean/coderay.git"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
For development:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
git clone https://github.com/bogdan-copocean/coderay.git
|
|
26
|
+
cd coderay
|
|
27
|
+
pip install -e ".[all]"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quick start
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
cd /path/to/your/project
|
|
34
|
+
coderay build --repo .
|
|
35
|
+
coderay search "how does authentication work"
|
|
36
|
+
coderay watch --repo .
|
|
37
|
+
coderay graph --kind calls
|
|
38
|
+
coderay skeleton src/app/main.py
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## MCP server (Claude Code / Cursor)
|
|
42
|
+
|
|
43
|
+
Add to `~/.claude/claude_code_config.json` or Cursor MCP settings:
|
|
44
|
+
|
|
45
|
+
```json
|
|
46
|
+
{
|
|
47
|
+
"mcpServers": {
|
|
48
|
+
"coderay": {
|
|
49
|
+
"command": "/path/to/your/.venv/bin/coderay-mcp",
|
|
50
|
+
"args": []
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## CLI reference
|
|
57
|
+
|
|
58
|
+
| Command | Description |
|
|
59
|
+
|---|---|
|
|
60
|
+
| `coderay build [--full] --repo .` | Build index (incremental or full rebuild) |
|
|
61
|
+
| `coderay update --repo .` | Incremental update (changed files only) |
|
|
62
|
+
| `coderay watch --repo . [--debounce N]` | Watch for file changes, re-index automatically |
|
|
63
|
+
| `coderay search "query" [--top-k N]` | Semantic search |
|
|
64
|
+
| `coderay list [--by-file]` | List indexed chunks |
|
|
65
|
+
| `coderay status` | Index state, branch, commit, chunk count |
|
|
66
|
+
| `coderay maintain --repo .` | Compact index, reclaim space |
|
|
67
|
+
| `coderay skeleton FILE` | Print file skeleton |
|
|
68
|
+
| `coderay graph --kind calls\|imports` | List graph edges |
|
|
69
|
+
|
|
70
|
+
## Configuration
|
|
71
|
+
|
|
72
|
+
Optional `config.yaml` in the index directory:
|
|
73
|
+
|
|
74
|
+
```yaml
|
|
75
|
+
embedder:
|
|
76
|
+
provider: local # local | openai
|
|
77
|
+
model: all-MiniLM-L6-v2
|
|
78
|
+
dimensions: 384
|
|
79
|
+
|
|
80
|
+
search:
|
|
81
|
+
boost_rules:
|
|
82
|
+
"tests/": 0.5
|
|
83
|
+
"src/core/": 1.2
|
|
84
|
+
|
|
85
|
+
graph:
|
|
86
|
+
exclude_callees:
|
|
87
|
+
- "our_sdk_helper"
|
|
88
|
+
include_callees:
|
|
89
|
+
- "isinstance"
|
|
90
|
+
|
|
91
|
+
watch:
|
|
92
|
+
debounce_seconds: 2
|
|
93
|
+
branch_switch_threshold: 50
|
|
94
|
+
exclude_patterns:
|
|
95
|
+
- "*.log"
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Development
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install -e ".[dev]"
|
|
102
|
+
make test
|
|
103
|
+
make lint
|
|
104
|
+
make format
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Requires Python >= 3.10 and Git.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "coderay"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "X-ray your codebase — semantic search, code graphs, file skeletons, and MCP server"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"python-dotenv>=1.0.0",
|
|
13
|
+
"networkx>=3.0",
|
|
14
|
+
"tree-sitter>=0.24.0",
|
|
15
|
+
"tree-sitter-python>=0.25.0",
|
|
16
|
+
"lancedb>=0.5.0",
|
|
17
|
+
"pyyaml>=6.0",
|
|
18
|
+
"click>=8.0",
|
|
19
|
+
"filelock>=3.0",
|
|
20
|
+
"fastembed>=0.4.0",
|
|
21
|
+
"watchdog>=4.0.0",
|
|
22
|
+
"pathspec>=0.12.0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
openai = ["openai>=1.0.0"]
|
|
27
|
+
languages = [
|
|
28
|
+
"tree-sitter-javascript>=0.23.0",
|
|
29
|
+
"tree-sitter-typescript>=0.23.0",
|
|
30
|
+
"tree-sitter-go>=0.23.0",
|
|
31
|
+
]
|
|
32
|
+
mcp = ["mcp>=1.0.0"]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=7.0",
|
|
35
|
+
"pytest-cov>=4.0",
|
|
36
|
+
"ruff>=0.8.0",
|
|
37
|
+
"mypy>=1.0.0",
|
|
38
|
+
"openai>=1.0.0",
|
|
39
|
+
"httpx>=0.27.0",
|
|
40
|
+
"mcp>=1.0.0",
|
|
41
|
+
]
|
|
42
|
+
maintain = [
|
|
43
|
+
"pylance>=0.15.0",
|
|
44
|
+
]
|
|
45
|
+
all = [
|
|
46
|
+
"coderay[openai,languages,mcp,dev,maintain]",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[project.scripts]
|
|
50
|
+
coderay = "coderay.cli.commands:main"
|
|
51
|
+
coderay-mcp = "coderay.mcp_server.server:main"
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.packages.find]
|
|
54
|
+
where = ["src"]
|
|
55
|
+
|
|
56
|
+
[tool.ruff]
|
|
57
|
+
target-version = "py310"
|
|
58
|
+
line-length = 88
|
|
59
|
+
src = ["src", "tests"]
|
|
60
|
+
|
|
61
|
+
[tool.ruff.lint]
|
|
62
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
63
|
+
ignore = []
|
|
64
|
+
|
|
65
|
+
[tool.ruff.lint.isort]
|
|
66
|
+
known-first-party = ["coderay"]
|
|
67
|
+
|
|
68
|
+
[tool.mypy]
|
|
69
|
+
python_version = "3.10"
|
|
70
|
+
warn_return_any = true
|
|
71
|
+
warn_unused_configs = true
|
|
72
|
+
ignore_missing_imports = true
|
|
73
|
+
|
|
74
|
+
[tool.pytest.ini_options]
|
|
75
|
+
testpaths = ["tests"]
|
|
76
|
+
pythonpath = ["src"]
|
coderay-1.0.0/setup.cfg
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string.
__version__ = "1.0.0"
|
|
File without changes
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from coderay.chunking.registry import LanguageConfig, get_language_for_file
|
|
7
|
+
from coderay.core.models import Chunk
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _get_symbol_name(node, source_bytes: bytes) -> str:
    """Return the declared name of a definition node, or "" when none is found.

    Lookup order:
      1. For a ``decorated_definition``, the name of the first non-decorator
         child that yields one.
      2. The first direct ``identifier`` child.
      3. The first direct ``property_identifier`` / ``field_identifier`` child.
      4. The node's own text when the node itself is one of those identifier
         kinds.

    Args:
        node: Syntax-tree node exposing ``type``, ``children``, ``start_byte``
            and ``end_byte``.
        source_bytes: Encoded source that the node byte offsets index into.

    Returns:
        The decoded name; undecodable bytes are replaced instead of raising.
    """

    def _text(n) -> str:
        raw = source_bytes[n.start_byte : n.end_byte]
        return raw.decode("utf-8", errors="replace")

    if node.type == "decorated_definition":
        # The name belongs to the wrapped definition. Keep scanning past
        # children that yield no name (e.g. a comment sitting between the
        # decorator and the definition) instead of giving up on the first one.
        for child in node.children:
            if child.type == "decorator":
                continue
            if name := _get_symbol_name(child, source_bytes):
                return name
        return ""

    children = node.children

    # A direct ``identifier`` child always wins, so every node that resolved
    # to a name before still resolves to the same name.
    for child in children:
        if child.type == "identifier":
            return _text(child)

    # The original tested ``node.type`` here, which never matches a definition
    # node; the member-name node is a *child* of the definition.
    member_name_types = ("property_identifier", "field_identifier")
    for child in children:
        if child.type in member_name_types:
            return _text(child)

    # Retained for callers that pass the identifier node itself.
    if node.type in member_name_types:
        return _text(node)
    return ""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _collect_preamble_lines(
    root, source_bytes: bytes, chunk_types: tuple[str, ...]
) -> list[str]:
    """Gather the text of top-level nodes that are not chunk definitions.

    Each direct child of ``root`` whose type is absent from ``chunk_types`` is
    decoded (undecodable bytes replaced), stripped of surrounding whitespace
    and kept only if something remains. Document order is preserved.
    """
    decoded = (
        source_bytes[node.start_byte : node.end_byte].decode("utf-8", errors="replace")
        for node in root.children
        if node.type not in chunk_types
    )
    return [stripped for raw in decoded if (stripped := raw.strip())]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _chunk_file_with_config(
    path: str,
    content: str,
    lang_cfg: LanguageConfig,
) -> list[Chunk]:
    """Chunk a file using the provided language configuration.

    The source is parsed with the parser from ``lang_cfg`` and walked in
    pre-order. Every node whose type is in ``lang_cfg.chunk_types`` becomes a
    ``Chunk``, unless its direct parent is itself a chunk type (see below).
    When the file has top-level text outside any chunk-type node, a single
    ``<module>`` chunk holding that text is emitted first.

    Args:
        path: File path recorded on every chunk and used in log messages.
        content: Full text of the file.
        lang_cfg: Language configuration supplying the parser and chunk types.

    Returns:
        The chunks in document (pre-order) order, preceded by the optional
        ``<module>`` chunk. An empty list when the parser cannot be created.
    """
    try:
        parser = lang_cfg.get_parser()
    except Exception as e:
        # Best-effort: a parser that cannot be loaded (for example a grammar
        # package that is not installed) skips this file instead of aborting.
        logger.warning("Could not load parser for %s (%s): %s", path, lang_cfg.name, e)
        return []

    source_bytes = content.encode("utf-8")
    root = parser.parse(source_bytes).root_node
    chunk_types = lang_cfg.chunk_types
    chunks: list[Chunk] = []

    if preamble_lines := _collect_preamble_lines(root, source_bytes, chunk_types):
        # The module chunk spans the whole file because the preamble text may
        # be scattered between definitions.
        chunks.append(
            Chunk(
                path=path,
                start_line=1,
                end_line=root.end_point[0] + 1,
                symbol="<module>",
                language=lang_cfg.name,
                content="\n".join(preamble_lines),
            ),
        )

    # Explicit stack instead of recursion: deeply nested syntax trees (long
    # operator chains, generated code) would otherwise exceed Python's
    # recursion limit. Children are pushed reversed so nodes are still visited
    # in the same pre-order as the recursive version.
    pending = [root]
    while pending:
        node = pending.pop()
        if node.type in chunk_types:
            parent = node.parent
            # [py] A decorated_definition already produced a chunk whose
            # content includes the decorators and the wrapped definition, so
            # a chunk-type node directly under another chunk-type node is not
            # emitted again. Its descendants are still visited.
            if not (parent and parent.type in chunk_types):
                chunks.append(
                    Chunk(
                        path=path,
                        start_line=node.start_point[0] + 1,
                        end_line=node.end_point[0] + 1,
                        symbol=(
                            _get_symbol_name(node, source_bytes) or f"<{node.type}>"
                        ),
                        language=lang_cfg.name,
                        content=source_bytes[node.start_byte : node.end_byte].decode(
                            "utf-8", errors="replace"
                        ),
                    )
                )
        pending.extend(reversed(node.children))

    logger.debug("Chunked %s: %d chunks", path, len(chunks))
    return chunks
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def chunk_file(path: str | Path, content: str, language: str = "python") -> list[Chunk]:
    """Split a source file into semantic chunks (definitions plus preamble).

    The language configuration is selected from the file's path via
    ``get_language_for_file``. The ``language`` argument is accepted but not
    consulted by this function.

    Returns an empty list, after logging a warning, when no language
    configuration matches the path.
    """
    if isinstance(path, Path):
        path = str(path)

    lang_cfg = get_language_for_file(path)
    if lang_cfg:
        return _chunk_file_with_config(path, content, lang_cfg)

    logger.warning("No language config for %s ", path)
    return []
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from tree_sitter import Language, Parser
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class LanguageConfig:
    """Per-language settings describing how a tree-sitter grammar is used."""

    # Language label; also stored on chunks produced for this language.
    name: str
    # File suffixes (with leading dot) that map to this language.
    extensions: tuple[str, ...]
    # Zero-argument callable returning the grammar handle given to ``Language``.
    language_fn: Callable[[], Any]
    # Node types that the chunker turns into chunks.
    chunk_types: tuple[str, ...]
    # The node-type groups below are not read in this module; presumably they
    # drive graph/skeleton extraction elsewhere — confirm against callers.
    scope_types: tuple[str, ...] = ("function_definition", "class_definition")
    import_types: tuple[str, ...] = ("import_statement", "import_from_statement")
    call_types: tuple[str, ...] = ("call", "call_expression")
    function_scope_types: tuple[str, ...] = ("function_definition",)
    class_scope_types: tuple[str, ...] = ("class_definition",)
    # Init-style file stems (e.g. ``__init__``, ``index``) without extension.
    init_filenames: tuple[str, ...] = ()

    def get_parser(self) -> Parser:
        """Build a new tree-sitter ``Parser`` bound to this language's grammar."""
        grammar = self.language_fn()
        return Parser(Language(grammar))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _python_language():
    """Return the grammar handle from ``tree_sitter_python.language()``.

    The grammar package is imported at call time rather than module import.
    """
    import tree_sitter_python

    return tree_sitter_python.language()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _javascript_language():
    """Return the grammar handle from ``tree_sitter_javascript.language()``.

    The grammar package is imported at call time rather than module import.
    """
    import tree_sitter_javascript

    return tree_sitter_javascript.language()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _typescript_language():
    """Return the TypeScript grammar handle from ``tree_sitter_typescript``.

    The package ships two grammars, exposed as ``language_typescript()`` and
    ``language_tsx()``, and has no plain ``language()`` function. Calling
    ``language()`` therefore fails, which made every TypeScript file fall
    through the parser-load warning path and produce no chunks.

    The grammar package is imported at call time rather than module import.
    """
    import tree_sitter_typescript as tsts

    loader = getattr(tsts, "language_typescript", None)
    if loader is None:
        # Keep the original call for any binding that does expose it.
        loader = tsts.language
    # NOTE(review): ``.tsx`` files share this loader; JSX syntax needs
    # ``language_tsx()`` to parse cleanly — consider a separate TSX config.
    return loader()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _go_language():
    """Return the grammar handle from ``tree_sitter_go.language()``.

    The grammar package is imported at call time rather than module import.
    """
    import tree_sitter_go

    return tree_sitter_go.language()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# --- Per-language configurations -------------------------------------------
# Node-type names are the grammar's own names for each construct.

PYTHON_CONFIG = LanguageConfig(
    name="python",
    extensions=(".py", ".pyi"),
    language_fn=_python_language,
    chunk_types=(
        "function_definition",
        "class_definition",
        "decorated_definition",
    ),
    scope_types=("function_definition", "class_definition"),
    import_types=("import_statement", "import_from_statement"),
    call_types=("call",),
    function_scope_types=("function_definition",),
    class_scope_types=("class_definition",),
    init_filenames=("__init__",),
)

JAVASCRIPT_CONFIG = LanguageConfig(
    name="javascript",
    extensions=(".js", ".jsx", ".mjs", ".cjs"),
    language_fn=_javascript_language,
    chunk_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "arrow_function",
        "export_statement",
        "lexical_declaration",
    ),
    scope_types=("function_declaration", "class_declaration", "method_definition"),
    import_types=("import_statement",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_definition"),
    class_scope_types=("class_declaration",),
    init_filenames=("index",),
)

TYPESCRIPT_CONFIG = LanguageConfig(
    name="typescript",
    extensions=(".ts", ".tsx"),
    language_fn=_typescript_language,
    chunk_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "arrow_function",
        "export_statement",
        "lexical_declaration",
        "interface_declaration",
        "type_alias_declaration",
    ),
    scope_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "interface_declaration",
    ),
    import_types=("import_statement",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_definition"),
    class_scope_types=("class_declaration", "interface_declaration"),
    init_filenames=("index",),
)

GO_CONFIG = LanguageConfig(
    name="go",
    extensions=(".go",),
    language_fn=_go_language,
    chunk_types=(
        "function_declaration",
        "method_declaration",
        "type_declaration",
    ),
    scope_types=("function_declaration", "method_declaration"),
    import_types=("import_declaration",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_declaration"),
    class_scope_types=(),
    init_filenames=(),
)

# Language name -> configuration. Insertion order is the order in which
# helpers that iterate the registry visit the languages.
LANGUAGE_REGISTRY: dict[str, LanguageConfig] = {
    "python": PYTHON_CONFIG,
    "javascript": JAVASCRIPT_CONFIG,
    "typescript": TYPESCRIPT_CONFIG,
    "go": GO_CONFIG,
}

# File suffix -> language name, derived from the registry. If two languages
# ever declare the same suffix, the one registered later wins.
_EXTENSION_MAP: dict[str, str] = {
    suffix: lang_name
    for lang_name, lang_cfg in LANGUAGE_REGISTRY.items()
    for suffix in lang_cfg.extensions
}
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def get_language_for_file(path: str | Path) -> LanguageConfig | None:
    """Look up the language configuration for ``path`` by its file suffix.

    The suffix is matched case-insensitively. ``None`` is returned when the
    suffix is not registered (including paths without a suffix).
    """
    suffix = Path(path).suffix.lower()
    if suffix not in _EXTENSION_MAP:
        return None
    return LANGUAGE_REGISTRY.get(_EXTENSION_MAP[suffix])
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def get_supported_extensions() -> set[str]:
    """Return a new set containing every registered file suffix."""
    return {*_EXTENSION_MAP}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def get_init_filenames() -> set[str]:
    """Return the union of init-style file stems (e.g. __init__, index)."""
    return {
        stem
        for lang_cfg in LANGUAGE_REGISTRY.values()
        for stem in lang_cfg.init_filenames
    }
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def get_resolution_suffixes() -> list[str]:
    """Return the path suffixes tried when resolving an import target.

    For each language, in registry order, and each of its extensions the
    result contains the bare extension followed by ``/<init><ext>`` for every
    init-style filename of that language. Duplicates are dropped, keeping the
    first occurrence.
    """

    def _candidates():
        for lang_cfg in LANGUAGE_REGISTRY.values():
            for suffix in lang_cfg.extensions:
                yield suffix
                for stem in lang_cfg.init_filenames:
                    yield f"/{stem}{suffix}"

    # dict keys keep first-insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(_candidates()))
|