coderay 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coderay/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "1.0.0"
File without changes
@@ -0,0 +1,127 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from coderay.chunking.registry import LanguageConfig, get_language_for_file
7
+ from coderay.core.models import Chunk
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def _get_symbol_name(node, source_bytes: bytes) -> str:
13
+ """Extract symbol name from a definition node."""
14
+ if node.type == "decorated_definition":
15
+ for child in node.children:
16
+ if child.type != "decorator":
17
+ return _get_symbol_name(child, source_bytes)
18
+ return ""
19
+
20
+ for child in node.children:
21
+ if child.type == "identifier":
22
+ return source_bytes[child.start_byte : child.end_byte].decode(
23
+ "utf-8", errors="replace"
24
+ )
25
+ if child.type in ("class", "def", "func", "function", "type"):
26
+ for sibling in node.children:
27
+ if sibling.type == "identifier":
28
+ return source_bytes[sibling.start_byte : sibling.end_byte].decode(
29
+ "utf-8", errors="replace"
30
+ )
31
+ if node.type in ("property_identifier", "field_identifier"):
32
+ return source_bytes[node.start_byte : node.end_byte].decode(
33
+ "utf-8", errors="replace"
34
+ )
35
+ return ""
36
+
37
+
38
+ def _collect_preamble_lines(
39
+ root, source_bytes: bytes, chunk_types: tuple[str, ...]
40
+ ) -> list[str]:
41
+ """Collect top-level lines that are NOT part of any chunk_type definition."""
42
+ lines: list[str] = []
43
+ for child in root.children:
44
+ if child.type in chunk_types:
45
+ continue
46
+ text = (
47
+ source_bytes[child.start_byte : child.end_byte]
48
+ .decode("utf-8", errors="replace")
49
+ .strip()
50
+ )
51
+ if text:
52
+ lines.append(text)
53
+ return lines
54
+
55
+
56
def _chunk_file_with_config(
    path: str,
    content: str,
    lang_cfg: LanguageConfig,
) -> list[Chunk]:
    """Chunk a file using the provided language configuration.

    Args:
        path: File path recorded on every produced chunk and used in logs.
        content: Full source text of the file.
        lang_cfg: Language configuration supplying the parser, the language
            name and the node types that become chunks.

    Returns:
        A list of chunks: an optional ``<module>`` chunk holding the
        top-level text outside chunk definitions, followed by one chunk per
        definition node in document (pre-order) order. An empty list is
        returned when the parser cannot be created.
    """
    try:
        parser = lang_cfg.get_parser()
    except Exception as e:
        # Best-effort: a missing or broken grammar should not abort the caller.
        logger.warning("Could not load parser for %s (%s): %s", path, lang_cfg.name, e)
        return []

    source_bytes = content.encode("utf-8")
    tree = parser.parse(source_bytes)
    root = tree.root_node
    chunk_types = lang_cfg.chunk_types
    chunks: list[Chunk] = []

    # The preamble chunk goes first and spans the whole file, since its
    # pieces may be scattered between definitions.
    if preamble_lines := _collect_preamble_lines(root, source_bytes, chunk_types):
        chunks.append(
            Chunk(
                path=path,
                start_line=1,
                end_line=root.end_point[0] + 1,
                symbol="<module>",
                language=lang_cfg.name,
                content="\n".join(preamble_lines),
            ),
        )

    # Pre-order walk with an explicit stack instead of recursion, so deeply
    # nested syntax trees cannot exhaust the interpreter's recursion limit.
    stack = [root]
    while stack:
        node = stack.pop()
        if node.type in chunk_types:
            parent = node.parent
            # [py] Avoid duplicates on decorated functions: a chunk-type node
            # sitting directly under another chunk-type node (e.g. the def
            # inside a decorated_definition) is already covered by the
            # parent's content, so only its descendants are visited.
            if not (parent and parent.type in chunk_types):
                chunks.append(
                    Chunk(
                        path=path,
                        # Points are (row, column); rows are reported 1-based.
                        start_line=node.start_point[0] + 1,
                        end_line=node.end_point[0] + 1,
                        # Fall back to a placeholder when no name is found.
                        symbol=_get_symbol_name(node, source_bytes)
                        or f"<{node.type}>",
                        language=lang_cfg.name,
                        content=source_bytes[
                            node.start_byte : node.end_byte
                        ].decode("utf-8", errors="replace"),
                    )
                )
        # Push children reversed so they are popped in source order.
        stack.extend(reversed(node.children))

    logger.debug("Chunked %s: %d chunks", path, len(chunks))
    return chunks
119
+
120
+
121
def chunk_file(path: str | Path, content: str, language: str = "python") -> list[Chunk]:
    """Split a source file into semantic chunks (functions, classes, preamble).

    The language configuration is looked up from ``path``; the ``language``
    argument is accepted but not consulted by this function. When no
    configuration matches the path, a warning is logged and an empty list is
    returned.
    """
    path_str = path if not isinstance(path, Path) else str(path)
    lang_cfg = get_language_for_file(path_str)
    if lang_cfg:
        return _chunk_file_with_config(path_str, content, lang_cfg)
    logger.warning("No language config for %s ", path_str)
    return []
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from collections.abc import Callable
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from tree_sitter import Language, Parser
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@dataclass
class LanguageConfig:
    """Per-language settings describing a tree-sitter grammar and its node types."""

    # Registry name of the language (e.g. "python").
    name: str
    # File extensions, including the leading dot, mapped to this language.
    extensions: tuple[str, ...]
    # Zero-argument callable returning the raw grammar handed to ``Language``.
    language_fn: Callable[[], Any]
    # Node types that are emitted as chunks.
    chunk_types: tuple[str, ...]
    # Node-type groups below default to Python-style names unless overridden.
    scope_types: tuple[str, ...] = ("function_definition", "class_definition")
    import_types: tuple[str, ...] = ("import_statement", "import_from_statement")
    call_types: tuple[str, ...] = ("call", "call_expression")
    function_scope_types: tuple[str, ...] = ("function_definition",)
    class_scope_types: tuple[str, ...] = ("class_definition",)
    # Init-style file names without extension (e.g. "__init__", "index").
    init_filenames: tuple[str, ...] = ()

    def get_parser(self) -> Parser:
        """Build a new tree-sitter ``Parser`` bound to this language's grammar."""
        return Parser(Language(self.language_fn()))
34
+
35
+
36
def _python_language():
    """Return the grammar object provided by ``tree_sitter_python``.

    The import happens at call time, so the package is only needed when this
    loader is actually invoked.
    """
    import tree_sitter_python

    return tree_sitter_python.language()
40
+
41
+
42
def _javascript_language():
    """Return the grammar object provided by ``tree_sitter_javascript``.

    The import happens at call time, so the package is only needed when this
    loader is actually invoked.
    """
    import tree_sitter_javascript

    return tree_sitter_javascript.language()
46
+
47
+
48
def _typescript_language():
    """Return the TypeScript grammar object from ``tree_sitter_typescript``.

    The import happens at call time, so the package is only needed when this
    loader is actually invoked.

    Raises:
        ImportError: If ``tree_sitter_typescript`` is not installed.
        AttributeError: If the package exposes neither
            ``language_typescript`` nor ``language``.
    """
    import tree_sitter_typescript as tsts

    # The TypeScript bindings ship two grammars and expose them as
    # ``language_typescript()`` / ``language_tsx()`` rather than a single
    # ``language()``; calling ``language()`` failed and left TypeScript files
    # unparsed.
    loader = getattr(tsts, "language_typescript", None)
    if loader is None:
        # Preserve the original call for any build that does provide it.
        loader = tsts.language
    # NOTE(review): TYPESCRIPT_CONFIG also routes ".tsx" files through this
    # loader; JSX syntax presumably needs ``language_tsx()`` — confirm.
    return loader()
52
+
53
+
54
def _go_language():
    """Return the grammar object provided by ``tree_sitter_go``.

    The import happens at call time, so the package is only needed when this
    loader is actually invoked.
    """
    import tree_sitter_go

    return tree_sitter_go.language()
58
+
59
+
60
# Python: registered for ".py" and ".pyi" files.
PYTHON_CONFIG = LanguageConfig(
    name="python",
    extensions=(".py", ".pyi"),
    language_fn=_python_language,
    # Node types emitted as chunks. "decorated_definition" is listed so the
    # decorators are captured together with the definition they wrap.
    chunk_types=(
        "function_definition",
        "class_definition",
        "decorated_definition",
    ),
    scope_types=("function_definition", "class_definition"),
    import_types=("import_statement", "import_from_statement"),
    call_types=("call",),
    function_scope_types=("function_definition",),
    class_scope_types=("class_definition",),
    # Init-style file name without extension; combined with each extension
    # when building import-resolution suffixes (e.g. "/__init__.py").
    init_filenames=("__init__",),
)
76
+
77
# JavaScript: registered for ".js", ".jsx", ".mjs" and ".cjs" files.
JAVASCRIPT_CONFIG = LanguageConfig(
    name="javascript",
    extensions=(".js", ".jsx", ".mjs", ".cjs"),
    language_fn=_javascript_language,
    # Node types emitted as chunks.
    chunk_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "arrow_function",
        "export_statement",
        "lexical_declaration",
    ),
    scope_types=("function_declaration", "class_declaration", "method_definition"),
    import_types=("import_statement",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_definition"),
    class_scope_types=("class_declaration",),
    # Init-style file name without extension; combined with each extension
    # when building import-resolution suffixes (e.g. "/index.js").
    init_filenames=("index",),
)
96
+
97
# TypeScript: registered for ".ts" and ".tsx" files; both extensions share
# the single grammar returned by ``_typescript_language``.
TYPESCRIPT_CONFIG = LanguageConfig(
    name="typescript",
    extensions=(".ts", ".tsx"),
    language_fn=_typescript_language,
    # Node types emitted as chunks: the JavaScript set plus interface and
    # type-alias declarations.
    chunk_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "arrow_function",
        "export_statement",
        "lexical_declaration",
        "interface_declaration",
        "type_alias_declaration",
    ),
    scope_types=(
        "function_declaration",
        "class_declaration",
        "method_definition",
        "interface_declaration",
    ),
    import_types=("import_statement",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_definition"),
    # Interfaces are grouped with classes for class-scope purposes.
    class_scope_types=("class_declaration", "interface_declaration"),
    # Init-style file name without extension; combined with each extension
    # when building import-resolution suffixes (e.g. "/index.ts").
    init_filenames=("index",),
)
123
+
124
# Go: registered for ".go" files.
GO_CONFIG = LanguageConfig(
    name="go",
    extensions=(".go",),
    language_fn=_go_language,
    # Node types emitted as chunks.
    chunk_types=(
        "function_declaration",
        "method_declaration",
        "type_declaration",
    ),
    scope_types=("function_declaration", "method_declaration"),
    import_types=("import_declaration",),
    call_types=("call_expression",),
    function_scope_types=("function_declaration", "method_declaration"),
    # No class-scope node types and no init-style file names are configured
    # for Go.
    class_scope_types=(),
    init_filenames=(),
)
140
+
141
# Every supported language, keyed by its registry name.
LANGUAGE_REGISTRY: dict[str, LanguageConfig] = dict(
    python=PYTHON_CONFIG,
    javascript=JAVASCRIPT_CONFIG,
    typescript=TYPESCRIPT_CONFIG,
    go=GO_CONFIG,
)

# Reverse lookup built once at import time: file extension -> registry name.
# If two languages declared the same extension, the later entry would win.
_EXTENSION_MAP: dict[str, str] = {}
for _lang_name, _cfg in LANGUAGE_REGISTRY.items():
    for _ext in _cfg.extensions:
        _EXTENSION_MAP.update({_ext: _lang_name})
152
+
153
+
154
def get_language_for_file(path: str | Path) -> LanguageConfig | None:
    """Look up the ``LanguageConfig`` matching a file's extension.

    The extension is compared case-insensitively. Returns ``None`` when the
    extension is not registered.
    """
    suffix = Path(path).suffix.lower()
    if suffix not in _EXTENSION_MAP:
        return None
    return LANGUAGE_REGISTRY.get(_EXTENSION_MAP[suffix])
161
+
162
+
163
def get_supported_extensions() -> set[str]:
    """Return a fresh set of every registered file extension."""
    return {ext for ext in _EXTENSION_MAP}
166
+
167
+
168
def get_init_filenames() -> set[str]:
    """Return the union of init-style filenames (e.g. __init__, index) of all languages."""
    return {
        filename
        for cfg in LANGUAGE_REGISTRY.values()
        for filename in cfg.init_filenames
    }
174
+
175
+
176
def get_resolution_suffixes() -> list[str]:
    """Return the de-duplicated file suffixes used to resolve import targets.

    For every language, in registry order, each extension is listed followed
    by one ``/<init><ext>`` entry per init-style filename of that language.
    Only the first occurrence of a suffix is kept.
    """
    # A dict keeps insertion order and ignores repeated keys, which gives
    # first-seen ordering without a separate "seen" set.
    ordered: dict[str, None] = {}
    for cfg in LANGUAGE_REGISTRY.values():
        for ext in cfg.extensions:
            ordered.setdefault(ext)
            for init in cfg.init_filenames:
                ordered.setdefault(f"/{init}{ext}")
    return list(ordered)
@@ -0,0 +1,3 @@
1
+ from coderay.cli.commands import cli, main
2
+
3
+ __all__ = ["cli", "main"]