coderay 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderay/__init__.py +1 -0
- coderay/chunking/__init__.py +0 -0
- coderay/chunking/chunker.py +127 -0
- coderay/chunking/registry.py +190 -0
- coderay/cli/__init__.py +3 -0
- coderay/cli/commands.py +475 -0
- coderay/core/__init__.py +0 -0
- coderay/core/config.py +73 -0
- coderay/core/lock.py +36 -0
- coderay/core/models.py +71 -0
- coderay/core/timing.py +45 -0
- coderay/core/utils.py +35 -0
- coderay/embedding/__init__.py +0 -0
- coderay/embedding/base.py +60 -0
- coderay/embedding/local.py +68 -0
- coderay/embedding/openai.py +87 -0
- coderay/graph/__init__.py +19 -0
- coderay/graph/builder.py +128 -0
- coderay/graph/code_graph.py +311 -0
- coderay/graph/extractor.py +315 -0
- coderay/mcp_server/__init__.py +0 -0
- coderay/mcp_server/server.py +178 -0
- coderay/pipeline/__init__.py +0 -0
- coderay/pipeline/indexer.py +417 -0
- coderay/pipeline/watcher.py +318 -0
- coderay/retrieval/__init__.py +3 -0
- coderay/retrieval/boosting.py +80 -0
- coderay/retrieval/search.py +121 -0
- coderay/skeleton/__init__.py +0 -0
- coderay/skeleton/extractor.py +140 -0
- coderay/state/__init__.py +8 -0
- coderay/state/machine.py +242 -0
- coderay/state/version.py +47 -0
- coderay/storage/__init__.py +0 -0
- coderay/storage/lancedb.py +268 -0
- coderay/vcs/__init__.py +0 -0
- coderay/vcs/git.py +193 -0
- coderay-1.0.0.dist-info/METADATA +145 -0
- coderay-1.0.0.dist-info/RECORD +42 -0
- coderay-1.0.0.dist-info/WHEEL +5 -0
- coderay-1.0.0.dist-info/entry_points.txt +3 -0
- coderay-1.0.0.dist-info/top_level.txt +1 -0
coderay/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the coderay package.
__version__ = "1.0.0"
|
|
File without changes
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from coderay.chunking.registry import LanguageConfig, get_language_for_file
|
|
7
|
+
from coderay.core.models import Chunk
|
|
8
|
+
|
|
9
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _get_symbol_name(node, source_bytes: bytes) -> str:
    """Extract the symbol name from a definition node.

    Args:
        node: Syntax-tree node exposing ``type``, ``children``,
            ``start_byte`` and ``end_byte``.
        source_bytes: UTF-8 encoded source the node offsets refer to.

    Returns:
        The decoded name, or ``""`` when no name can be found.
    """

    def text_of(target) -> str:
        # Undecodable bytes are replaced instead of raising.
        return source_bytes[target.start_byte : target.end_byte].decode(
            "utf-8", errors="replace"
        )

    if node.type == "decorated_definition":
        # The name belongs to the wrapped definition: delegate to the first
        # child that is not a decorator.
        for child in node.children:
            if child.type != "decorator":
                return _get_symbol_name(child, source_bytes)
        return ""

    # A direct ``identifier`` child always takes precedence.
    for child in node.children:
        if child.type == "identifier":
            return text_of(child)

    # Fall back to a ``property_identifier`` / ``field_identifier`` child.
    # Checking only the definition node's own type never matches when the
    # node is a definition, so such definitions used to yield "".
    # NOTE(review): presumably these are method names in the JavaScript,
    # TypeScript and Go grammars - confirm against the grammars.
    for child in node.children:
        if child.type in ("property_identifier", "field_identifier"):
            return text_of(child)

    # Kept for callers that pass the name node itself.
    if node.type in ("property_identifier", "field_identifier"):
        return text_of(node)
    return ""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _collect_preamble_lines(
    root, source_bytes: bytes, chunk_types: tuple[str, ...]
) -> list[str]:
    """Gather the text of top-level nodes that are not chunk definitions.

    Each direct child of ``root`` whose type is not in ``chunk_types`` is
    decoded from ``source_bytes`` and stripped of surrounding whitespace.
    Snippets that end up empty are dropped. Order follows ``root.children``.
    """
    snippets = (
        source_bytes[top.start_byte : top.end_byte]
        .decode("utf-8", errors="replace")
        .strip()
        for top in root.children
        if top.type not in chunk_types
    )
    return [snippet for snippet in snippets if snippet]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _chunk_file_with_config(
    path: str,
    content: str,
    lang_cfg: LanguageConfig,
) -> list[Chunk]:
    """Chunk a file using the provided language configuration.

    Args:
        path: File path recorded on every produced chunk and used in logs.
        content: Full source text of the file.
        lang_cfg: Language configuration supplying the parser and the node
            types that count as chunks.

    Returns:
        An optional ``<module>`` chunk holding the top-level non-chunk text,
        followed by one chunk per matching node in pre-order. Returns ``[]``
        when the parser cannot be loaded.
    """
    try:
        parser = lang_cfg.get_parser()
    except Exception as e:
        # Best effort: a missing or broken grammar skips the file instead of
        # aborting the caller.
        logger.warning("Could not load parser for %s (%s): %s", path, lang_cfg.name, e)
        return []

    source_bytes = content.encode("utf-8")
    root = parser.parse(source_bytes).root_node
    chunk_types = lang_cfg.chunk_types
    chunks: list[Chunk] = []

    # The preamble chunk is emitted first and spans the whole file's lines.
    if preamble_lines := _collect_preamble_lines(root, source_bytes, chunk_types):
        chunks.append(
            Chunk(
                path=path,
                start_line=1,
                end_line=root.end_point[0] + 1,
                symbol="<module>",
                language=lang_cfg.name,
                content="\n".join(preamble_lines),
            ),
        )

    # Pre-order walk with an explicit stack. A recursive walk can exceed
    # Python's recursion limit on deeply nested syntax trees.
    pending = [root]
    while pending:
        node = pending.pop()
        if node.type in chunk_types:
            parent = node.parent
            # A chunk node directly inside another chunk node (e.g. the
            # definition wrapped by a decorated definition) is not emitted
            # separately: the outer chunk's content already includes it.
            if not (parent and parent.type in chunk_types):
                text = source_bytes[node.start_byte : node.end_byte].decode(
                    "utf-8", errors="replace"
                )
                symbol = _get_symbol_name(node, source_bytes) or f"<{node.type}>"
                chunks.append(
                    Chunk(
                        path=path,
                        # Node points are 0-based rows; chunks use 1-based lines.
                        start_line=node.start_point[0] + 1,
                        end_line=node.end_point[0] + 1,
                        symbol=symbol,
                        language=lang_cfg.name,
                        content=text,
                    )
                )
        # Children are always visited; pushing them reversed keeps
        # left-to-right order when popping.
        pending.extend(reversed(node.children))

    logger.debug("Chunked %s: %d chunks", path, len(chunks))
    return chunks
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def chunk_file(path: str | Path, content: str, language: str = "python") -> list[Chunk]:
    """Split a source file into semantic chunks (functions, classes, preamble).

    The language configuration is resolved from ``path``. The ``language``
    argument is accepted but not read by this function. When no configuration
    matches the path, a warning is logged and ``[]`` is returned.
    """
    path_str = path if not isinstance(path, Path) else str(path)
    lang_cfg = get_language_for_file(path_str)
    if lang_cfg:
        return _chunk_file_with_config(path_str, content, lang_cfg)
    logger.warning("No language config for %s ", path_str)
    return []
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from tree_sitter import Language, Parser
|
|
10
|
+
|
|
11
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class LanguageConfig:
    """Settings describing one language's tree-sitter grammar.

    ``language_fn`` is a zero-argument callable whose result is passed to
    ``tree_sitter.Language``. The ``*_types`` fields are tuples of syntax-node
    type names. The four required fields come first; the rest default to
    Python-style node names.
    """

    # Required fields.
    name: str
    extensions: tuple[str, ...]
    language_fn: Callable[[], Any]
    chunk_types: tuple[str, ...]

    # Optional fields.
    scope_types: tuple[str, ...] = ("function_definition", "class_definition")
    import_types: tuple[str, ...] = ("import_statement", "import_from_statement")
    call_types: tuple[str, ...] = ("call", "call_expression")
    function_scope_types: tuple[str, ...] = ("function_definition",)
    class_scope_types: tuple[str, ...] = ("class_definition",)
    init_filenames: tuple[str, ...] = ()

    def get_parser(self) -> Parser:
        """Build a new tree-sitter ``Parser`` bound to this language."""
        return Parser(Language(self.language_fn()))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _python_language():
    """Return the result of ``tree_sitter_python.language()``.

    The package is imported inside the function, so it is only needed when
    this loader is actually called.
    """
    from tree_sitter_python import language as load_grammar

    return load_grammar()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _javascript_language():
    """Return the result of ``tree_sitter_javascript.language()``.

    The package is imported inside the function, so it is only needed when
    this loader is actually called.
    """
    from tree_sitter_javascript import language as load_grammar

    return load_grammar()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _typescript_language():
    """Return the TypeScript grammar from ``tree_sitter_typescript``.

    The package is imported inside the function, so it is only needed when
    this loader is actually called.

    The binding exposes ``language_typescript()`` (and ``language_tsx()``)
    rather than a plain ``language()``, so the TypeScript entry point is
    preferred. ``language()`` is still tried as a fallback for any build that
    provides it.

    Raises:
        AttributeError: If the module exposes neither entry point.
    """
    import tree_sitter_typescript as tsts

    load_grammar = getattr(tsts, "language_typescript", None)
    if load_grammar is None:
        # Fall back to the original call.
        load_grammar = tsts.language
    return load_grammar()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _go_language():
    """Return the result of ``tree_sitter_go.language()``.

    The package is imported inside the function, so it is only needed when
    this loader is actually called.
    """
    from tree_sitter_go import language as load_grammar

    return load_grammar()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Python: functions, classes and decorated definitions are chunked.
PYTHON_CONFIG = LanguageConfig(
    name="python",
    extensions=(".py", ".pyi"),
    language_fn=_python_language,
    chunk_types=(
        "function_definition",
        "class_definition",
        "decorated_definition",
    ),
    scope_types=("function_definition", "class_definition"),
    import_types=("import_statement", "import_from_statement"),
    call_types=("call",),
    function_scope_types=("function_definition",),
    class_scope_types=("class_definition",),
    init_filenames=("__init__",),
)

# JavaScript: also covers the .jsx, .mjs and .cjs extensions.
JAVASCRIPT_CONFIG = LanguageConfig(
    name="javascript",
    extensions=(".js", ".jsx", ".mjs", ".cjs"),
    language_fn=_javascript_language,
    chunk_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "arrow_function",
        "export_statement",
        "lexical_declaration",
    ),
    scope_types=("function_declaration", "class_declaration", "method_definition"),
    import_types=("import_statement",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_definition"),
    class_scope_types=("class_declaration",),
    init_filenames=("index",),
)

# TypeScript: the JavaScript chunk types plus interface and type-alias
# declarations. Both .ts and .tsx share the single language_fn.
TYPESCRIPT_CONFIG = LanguageConfig(
    name="typescript",
    extensions=(".ts", ".tsx"),
    language_fn=_typescript_language,
    chunk_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "arrow_function",
        "export_statement",
        "lexical_declaration",
        "interface_declaration",
        "type_alias_declaration",
    ),
    scope_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "interface_declaration",
    ),
    import_types=("import_statement",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_definition"),
    class_scope_types=("class_declaration", "interface_declaration"),
    init_filenames=("index",),
)

# Go: no class scopes and no init-style filenames are configured.
GO_CONFIG = LanguageConfig(
    name="go",
    extensions=(".go",),
    language_fn=_go_language,
    chunk_types=(
        "function_declaration",
        "method_declaration",
        "type_declaration",
    ),
    scope_types=("function_declaration", "method_declaration"),
    import_types=("import_declaration",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_declaration"),
    class_scope_types=(),
    init_filenames=(),
)

# Language name -> configuration.
LANGUAGE_REGISTRY: dict[str, LanguageConfig] = {
    "python": PYTHON_CONFIG,
    "javascript": JAVASCRIPT_CONFIG,
    "typescript": TYPESCRIPT_CONFIG,
    "go": GO_CONFIG,
}

# File extension (with leading dot) -> language name, built once at import
# time. If two languages declared the same extension, the later registry
# entry would win.
_EXTENSION_MAP: dict[str, str] = {}
for _lang_name, _cfg in LANGUAGE_REGISTRY.items():
    for _ext in _cfg.extensions:
        _EXTENSION_MAP[_ext] = _lang_name
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def get_language_for_file(path: str | Path) -> LanguageConfig | None:
    """Look up the LanguageConfig matching a file's extension.

    The suffix of ``path`` is lower-cased before the lookup. Returns ``None``
    when the extension is not registered.
    """
    language_key = _EXTENSION_MAP.get(Path(path).suffix.lower())
    return None if language_key is None else LANGUAGE_REGISTRY.get(language_key)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def get_supported_extensions() -> set[str]:
    """Return a new set holding every registered file extension."""
    return {extension for extension in _EXTENSION_MAP}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def get_init_filenames() -> set[str]:
    """Return the union of init-style filenames of all registered languages.

    Examples of such names are ``__init__`` and ``index``.
    """
    return {
        filename
        for config in LANGUAGE_REGISTRY.values()
        for filename in config.init_filenames
    }
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def get_resolution_suffixes() -> list[str]:
    """Return file suffixes for resolving import targets.

    For each registered language and each of its extensions, the result
    contains the extension itself followed by ``/<init><ext>`` for every
    init-style filename of that language. Duplicates are dropped and
    first-seen order is kept.
    """
    # A dict keeps insertion order, so it serves as an ordered set here.
    ordered: dict[str, None] = {}
    for config in LANGUAGE_REGISTRY.values():
        for extension in config.extensions:
            ordered.setdefault(extension)
            for init_name in config.init_filenames:
                ordered.setdefault(f"/{init_name}{extension}")
    return list(ordered)
|
coderay/cli/__init__.py
ADDED