contextinator 0.0.post81__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextinator/__init__.py +34 -0
- contextinator/__main__.py +13 -0
- contextinator/_version.py +34 -0
- contextinator/chunking/__init__.py +34 -0
- contextinator/chunking/ast_parser.py +551 -0
- contextinator/chunking/ast_visualizer.py +315 -0
- contextinator/chunking/chunk_service.py +271 -0
- contextinator/chunking/context_builder.py +120 -0
- contextinator/chunking/file_discovery.py +163 -0
- contextinator/chunking/node_collector.py +157 -0
- contextinator/chunking/splitter.py +166 -0
- contextinator/cli.py +812 -0
- contextinator/config/__init__.py +50 -0
- contextinator/config/settings.py +323 -0
- contextinator/embedding/__init__.py +23 -0
- contextinator/embedding/embedding_service.py +510 -0
- contextinator/py.typed +0 -0
- contextinator/tools/__init__.py +158 -0
- contextinator/tools/full_text_search.py +290 -0
- contextinator/tools/read_file.py +206 -0
- contextinator/tools/regex_search.py +187 -0
- contextinator/tools/semantic_search.py +209 -0
- contextinator/tools/symbol_search.py +216 -0
- contextinator/utils/__init__.py +49 -0
- contextinator/utils/exceptions.py +212 -0
- contextinator/utils/hash_utils.py +30 -0
- contextinator/utils/logger.py +62 -0
- contextinator/utils/output_formatter.py +183 -0
- contextinator/utils/progress.py +70 -0
- contextinator/utils/repo_utils.py +165 -0
- contextinator/utils/token_counter.py +75 -0
- contextinator/utils/toon_encoder.py +45 -0
- contextinator/vectorstore/__init__.py +18 -0
- contextinator/vectorstore/chroma_store.py +502 -0
- contextinator-0.0.post81.dist-info/METADATA +576 -0
- contextinator-0.0.post81.dist-info/RECORD +40 -0
- contextinator-0.0.post81.dist-info/WHEEL +5 -0
- contextinator-0.0.post81.dist-info/entry_points.txt +2 -0
- contextinator-0.0.post81.dist-info/licenses/LICENSE +201 -0
- contextinator-0.0.post81.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Contextinator: Intelligent Codebase Understanding for AI Agents.
|
|
3
|
+
|
|
4
|
+
Transform any codebase into semantically-aware, searchable knowledge
|
|
5
|
+
for AI-powered workflows using AST parsing and vector embeddings.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
# the _version.py file is auto-generated when you build the package.
|
|
10
|
+
from ._version import version as __version__
|
|
11
|
+
except ImportError:
|
|
12
|
+
# when in dev
|
|
13
|
+
__version__ = "0.0.0+unknown"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Core functionality exports
|
|
18
|
+
from .chunking import chunk_repository
|
|
19
|
+
from .embedding import embed_chunks
|
|
20
|
+
from .vectorstore import store_repository_embeddings, ChromaVectorStore
|
|
21
|
+
from .tools import semantic_search, full_text_search, symbol_search, regex_search
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"__version__",
|
|
25
|
+
"chunk_repository",
|
|
26
|
+
"embed_chunks",
|
|
27
|
+
"store_repository_embeddings",
|
|
28
|
+
"semantic_search",
|
|
29
|
+
"full_text_search",
|
|
30
|
+
"symbol_search",
|
|
31
|
+
"regex_search",
|
|
32
|
+
"read_file",
|
|
33
|
+
"ChromaVectorStore",
|
|
34
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entry point for running Contextinator as a module.
|
|
3
|
+
|
|
4
|
+
This allows the package to be executed with:
|
|
5
|
+
python -m contextinator <command> [options]
|
|
6
|
+
|
|
7
|
+
This is the recommended way to run Contextinator when installed as a package.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .cli import main
|
|
11
|
+
|
|
12
|
+
if __name__ == '__main__':
|
|
13
|
+
main()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
# don't change, don't track in version control

# Names re-exported for consumers that introspect version metadata.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# TYPE_CHECKING is hard-coded False: at runtime the typing imports are
# skipped and the alias names fall back to plain `object` placeholders;
# static type checkers follow the True branch instead.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

# Forward declarations so type checkers know the module-level attributes.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.0.post81'
__version_tuple__ = version_tuple = (0, 0, 'post81')

__commit_id__ = commit_id = None
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chunking module for Contextinator.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive functionality for parsing source code files,
|
|
5
|
+
extracting semantic chunks using AST analysis, and managing the chunking pipeline.
|
|
6
|
+
|
|
7
|
+
The main components include:
|
|
8
|
+
- File discovery with intelligent ignore patterns
|
|
9
|
+
- AST parsing using Tree-sitter for multiple languages
|
|
10
|
+
- Semantic node extraction (functions, classes, methods)
|
|
11
|
+
- Chunk splitting and deduplication
|
|
12
|
+
- AST visualization for debugging
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
# Core chunking functionality
|
|
16
|
+
from .ast_parser import parse_file
|
|
17
|
+
from .ast_visualizer import save_ast_overview
|
|
18
|
+
from .chunk_service import chunk_repository, load_chunks, save_chunks
|
|
19
|
+
from .context_builder import build_context
|
|
20
|
+
from .file_discovery import discover_files
|
|
21
|
+
from .node_collector import collect_nodes
|
|
22
|
+
from .splitter import split_chunk
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
'build_context',
|
|
26
|
+
'chunk_repository',
|
|
27
|
+
'collect_nodes',
|
|
28
|
+
'discover_files',
|
|
29
|
+
'load_chunks',
|
|
30
|
+
'parse_file',
|
|
31
|
+
'save_ast_overview',
|
|
32
|
+
'save_chunks',
|
|
33
|
+
'split_chunk',
|
|
34
|
+
]
|
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Abstract Syntax Tree (AST) parsing module for Contextinator.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to parse source code files using Tree-sitter
|
|
5
|
+
parsers and extract semantic code chunks like functions, classes, and methods.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import uuid
|
|
10
|
+
from typing import Any, Dict, List, Optional, TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from ..config import SUPPORTED_EXTENSIONS
|
|
13
|
+
from ..utils.logger import logger
|
|
14
|
+
|
|
15
|
+
# Tree-sitter imports with graceful fallback
|
|
16
|
+
try:
|
|
17
|
+
from tree_sitter import Language, Parser
|
|
18
|
+
|
|
19
|
+
# Language-specific imports
|
|
20
|
+
import tree_sitter_bash
|
|
21
|
+
import tree_sitter_c
|
|
22
|
+
import tree_sitter_c_sharp
|
|
23
|
+
import tree_sitter_cpp
|
|
24
|
+
import tree_sitter_go
|
|
25
|
+
import tree_sitter_java
|
|
26
|
+
import tree_sitter_javascript
|
|
27
|
+
import tree_sitter_json
|
|
28
|
+
import tree_sitter_kotlin
|
|
29
|
+
import tree_sitter_lua
|
|
30
|
+
import tree_sitter_markdown
|
|
31
|
+
import tree_sitter_php
|
|
32
|
+
import tree_sitter_python
|
|
33
|
+
import tree_sitter_rust
|
|
34
|
+
import tree_sitter_solidity
|
|
35
|
+
import tree_sitter_sql
|
|
36
|
+
import tree_sitter_swift
|
|
37
|
+
import tree_sitter_toml
|
|
38
|
+
import tree_sitter_typescript
|
|
39
|
+
import tree_sitter_yaml
|
|
40
|
+
|
|
41
|
+
# Optional: dockerfile (not available on Windows)
|
|
42
|
+
try:
|
|
43
|
+
import tree_sitter_dockerfile
|
|
44
|
+
HAS_DOCKERFILE = True
|
|
45
|
+
except ImportError:
|
|
46
|
+
tree_sitter_dockerfile = None
|
|
47
|
+
HAS_DOCKERFILE = False
|
|
48
|
+
logger.debug("tree-sitter-dockerfile not available (Windows platform)")
|
|
49
|
+
|
|
50
|
+
from .ast_visualizer import save_ast_visualization
|
|
51
|
+
|
|
52
|
+
# Language module mapping for parser creation
|
|
53
|
+
LANGUAGE_MODULES: Dict[str, Any] = {
|
|
54
|
+
'python': tree_sitter_python,
|
|
55
|
+
'javascript': tree_sitter_javascript,
|
|
56
|
+
'typescript': tree_sitter_typescript,
|
|
57
|
+
'tsx': tree_sitter_typescript, # TSX uses the same TypeScript module
|
|
58
|
+
'java': tree_sitter_java,
|
|
59
|
+
'go': tree_sitter_go,
|
|
60
|
+
'rust': tree_sitter_rust,
|
|
61
|
+
'cpp': tree_sitter_cpp,
|
|
62
|
+
'c': tree_sitter_c,
|
|
63
|
+
'csharp': tree_sitter_c_sharp,
|
|
64
|
+
'cs': tree_sitter_c_sharp, # Alternative C# extension
|
|
65
|
+
'php': tree_sitter_php,
|
|
66
|
+
'bash': tree_sitter_bash,
|
|
67
|
+
'sh': tree_sitter_bash, # Shell scripts
|
|
68
|
+
'sql': tree_sitter_sql,
|
|
69
|
+
'kotlin': tree_sitter_kotlin,
|
|
70
|
+
'kt': tree_sitter_kotlin, # Kotlin extension
|
|
71
|
+
'yaml': tree_sitter_yaml,
|
|
72
|
+
'yml': tree_sitter_yaml, # Alternative YAML extension
|
|
73
|
+
'markdown': tree_sitter_markdown,
|
|
74
|
+
'md': tree_sitter_markdown, # Markdown extension
|
|
75
|
+
'json': tree_sitter_json,
|
|
76
|
+
'toml': tree_sitter_toml,
|
|
77
|
+
'swift': tree_sitter_swift,
|
|
78
|
+
'solidity': tree_sitter_solidity,
|
|
79
|
+
'sol': tree_sitter_solidity, # Solidity extension
|
|
80
|
+
'lua': tree_sitter_lua,
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
# Add dockerfile support if available (platform-dependent)
|
|
84
|
+
if HAS_DOCKERFILE:
|
|
85
|
+
LANGUAGE_MODULES['dockerfile'] = tree_sitter_dockerfile
|
|
86
|
+
|
|
87
|
+
TREE_SITTER_AVAILABLE = True
|
|
88
|
+
logger.info("Tree-sitter imports successful")
|
|
89
|
+
|
|
90
|
+
except ImportError as e:
|
|
91
|
+
TREE_SITTER_AVAILABLE = False
|
|
92
|
+
LANGUAGE_MODULES = {}
|
|
93
|
+
logger.warning(f"Tree-sitter import failed: {e}")
|
|
94
|
+
logger.info("💡 Install missing modules with: pip install tree-sitter tree-sitter-python tree-sitter-javascript ...")
|
|
95
|
+
|
|
96
|
+
if TYPE_CHECKING:
|
|
97
|
+
from tree_sitter import Parser
|
|
98
|
+
|
|
99
|
+
# Node types to extract per language for semantic chunking.
# Keys are language identifiers (as mapped from file extensions); values are
# the Tree-sitter node type names that become individual chunks.
NODE_TYPES: Dict[str, List[str]] = {
    'python': ['function_definition', 'class_definition', 'decorated_definition', 'import_statement', 'import_from_statement'],
    'javascript': ['function_declaration', 'function_expression', 'arrow_function', 'class_declaration', 'method_definition', 'import_statement'],
    'typescript': ['function_declaration', 'function_expression', 'arrow_function', 'class_declaration', 'method_definition', 'interface_declaration', 'import_statement'],
    'tsx': ['function_declaration', 'function_expression', 'arrow_function', 'class_declaration', 'method_definition', 'interface_declaration', 'import_statement'],
    'java': ['class_declaration', 'method_declaration', 'constructor_declaration', 'interface_declaration', 'import_declaration'],
    'go': ['function_declaration', 'method_declaration', 'type_declaration', 'import_declaration'],
    'rust': ['function_item', 'impl_item', 'struct_item', 'enum_item', 'trait_item', 'use_declaration'],
    'cpp': ['function_definition', 'class_specifier', 'struct_specifier', 'preproc_include'],
    'c': ['function_definition', 'struct_specifier', 'preproc_include'],
    'csharp': ['class_declaration', 'method_declaration', 'constructor_declaration', 'interface_declaration', 'property_declaration', 'using_directive'],
    'cs': ['class_declaration', 'method_declaration', 'constructor_declaration', 'interface_declaration', 'property_declaration', 'using_directive'],
    'php': ['function_definition', 'class_declaration', 'method_declaration', 'namespace_use_declaration'],
    'bash': ['function_definition', 'command'],
    'sh': ['function_definition', 'command'],
    'sql': ['create_table_statement', 'create_view_statement', 'create_function_statement', 'create_procedure_statement'],
    'kotlin': ['class_declaration', 'function_declaration', 'property_declaration', 'object_declaration', 'import_header'],
    'kt': ['class_declaration', 'function_declaration', 'property_declaration', 'object_declaration', 'import_header'],
    'yaml': ['block_mapping', 'block_sequence'],
    'yml': ['block_mapping', 'block_sequence'],
    'markdown': ['section', 'heading', 'code_block'],
    'md': ['section', 'heading', 'code_block'],
    'dockerfile': ['instruction'],
    'json': ['object', 'array'],
    'toml': ['table', 'key_value'],
    'swift': ['class_declaration', 'function_declaration', 'protocol_declaration', 'struct_declaration', 'import_declaration'],
    'solidity': ['contract_declaration', 'function_definition', 'struct_definition', 'event_definition'],
    'sol': ['contract_declaration', 'function_definition', 'struct_definition', 'event_definition'],
    'lua': ['function_definition', 'local_function', 'table_constructor'],
}


# Container node types per language: nodes of these types are flagged
# `is_parent` by extract_nodes(), and their extracted descendants carry
# parent_id/parent_type/parent_name links back to them.
PARENT_NODE_TYPES: Dict[str, List[str]] = {
    'python': ['class_definition'],
    'javascript': ['class_declaration'],
    'typescript': ['class_declaration', 'interface_declaration'],
    'tsx': ['class_declaration', 'interface_declaration'],
    'java': ['class_declaration', 'interface_declaration'],
    'go': ['type_declaration'],
    'rust': ['impl_item', 'struct_item', 'enum_item', 'trait_item'],
    'cpp': ['class_specifier', 'struct_specifier'],
    'c': ['struct_specifier'],
    'csharp': ['class_declaration', 'interface_declaration'],
    'cs': ['class_declaration', 'interface_declaration'],
    'php': ['class_declaration'],
    'bash': [],
    'sh': [],
    'sql': [],
    'kotlin': ['class_declaration', 'object_declaration'],
    'kt': ['class_declaration', 'object_declaration'],
    'yaml': [],
    'yml': [],
    'markdown': [],
    'md': [],
    'dockerfile': [],
    'json': [],
    'toml': [],
    'swift': ['class_declaration', 'struct_declaration', 'protocol_declaration'],
    'solidity': ['contract_declaration', 'struct_definition'],
    'sol': ['contract_declaration', 'struct_definition'],
    'lua': ['table_constructor'],
}

# Cache for parsers to avoid recreation (populated lazily by get_parser).
_parser_cache: Dict[str, "Parser"] = {}
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def parse_file(
    file_path: Path,
    save_ast: bool = False,
    chunks_dir: Optional[Path] = None,
    repo_path: Optional[Path] = None
) -> Optional[Dict[str, Any]]:
    """
    Parse a file and return its AST representation with extracted nodes.

    Falls back to a single file-level chunk whenever Tree-sitter is
    unavailable, no parser exists for the language, AST parsing raises,
    or no semantic nodes are found.

    Args:
        file_path: Path to the file to parse (absolute path)
        save_ast: Whether to save AST visualization data
        chunks_dir: Repository-specific chunks directory for AST data (required if save_ast=True)
        repo_path: Repository root path for computing relative paths (optional)

    Returns:
        Dictionary containing AST nodes and metadata, or None if parsing fails
        or the file extension is unsupported.

    Raises:
        ValidationError: If save_ast is True but chunks_dir is None
        FileSystemError: If file cannot be read
    """
    # Local import avoids a circular dependency at module load time.
    # (Removed previously unused ParsingError from this import.)
    from ..utils.exceptions import ValidationError, FileSystemError

    if save_ast and chunks_dir is None:
        raise ValidationError("chunks_dir is required when save_ast=True", "chunks_dir", "Path object")

    # Compute repo-relative path with forward slashes for cross-platform compatibility
    if repo_path:
        try:
            relative_path = file_path.relative_to(repo_path)
            # Convert to forward slashes for consistency
            file_path_str = relative_path.as_posix()
        except ValueError:
            # If file is not relative to repo_path, use absolute path
            logger.warning(f"File {file_path} is not within repo {repo_path}, using absolute path")
            file_path_str = str(file_path)
    else:
        file_path_str = str(file_path)

    try:
        language = SUPPORTED_EXTENSIONS.get(file_path.suffix)
        if not language:
            logger.debug(f"Unsupported file extension: {file_path.suffix}")
            return None

        # Handle file reading errors gracefully
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except (OSError, IOError, PermissionError) as e:
            raise FileSystemError(f"Cannot read file: {e}", str(file_path), "read")

        if not TREE_SITTER_AVAILABLE:
            # Fallback when Tree-sitter unavailable
            logger.warning(f"Tree-sitter not available, using fallback for {file_path}")
            result = _fallback_parse(file_path, file_path_str, language, content)

            if save_ast and chunks_dir:
                _save_ast_safely(file_path, language, None, content, result['nodes'], chunks_dir, result.get('tree_info'))

            return result

        # Try AST parsing with fallback
        try:
            parser = get_parser(language)
            if not parser:
                logger.warning(f"No parser available for {language}, using fallback for {file_path}")
                return _fallback_parse(file_path, file_path_str, language, content)

            tree = parser.parse(bytes(content, 'utf-8'))
            nodes = extract_nodes(tree.root_node, content, language)

            logger.debug(f"Parsed {file_path} - Found {len(nodes)} semantic nodes")

        except Exception as e:
            # Fallback to file-level chunking on any parsing error
            logger.warning(f"AST parsing failed for {file_path}, using fallback: {e}")
            return _fallback_parse(file_path, file_path_str, language, content)

        # If no nodes extracted, fallback to file-level
        if not nodes:
            logger.warning(f"No semantic nodes found in {file_path}, using file-level chunking")
            result = _fallback_parse(file_path, file_path_str, language, content)
        else:
            result = {
                'file_path': file_path_str,
                'language': language,
                'content': content,
                'nodes': nodes,
                'tree_info': {
                    'has_ast': True,
                    'root_node_type': tree.root_node.type,
                    'total_nodes': _count_nodes(tree.root_node),
                    'tree_depth': _get_tree_depth(tree.root_node)
                }
            }

        # Save AST visualization if requested; only pass the real root node
        # when a genuine AST (not a fallback result) was produced.
        if save_ast and chunks_dir:
            root_node = tree.root_node if 'tree_info' in result and result['tree_info'].get('has_ast', False) else None
            _save_ast_safely(file_path, language, root_node, content, result['nodes'], chunks_dir, result.get('tree_info'))

        return result

    except (ValidationError, FileSystemError):
        # Re-raise our custom exceptions
        raise
    except Exception as e:
        # Pattern 1: Log unexpected errors and continue
        logger.error(f"Unexpected error parsing {file_path}: {e}")
        return None
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _fallback_parse(file_path: Path, file_path_str: str, language: str, content: str) -> Dict[str, Any]:
    """
    Produce a single file-level chunk when tree-sitter is unavailable or fails.

    Args:
        file_path: Absolute path to the file being parsed (for logging/display)
        file_path_str: Repo-relative file path string to store in metadata
        language: Programming language identifier
        content: File content

    Returns:
        Dictionary with file-level chunk information
    """
    # One pseudo-node covering the entire file stands in for semantic chunks.
    whole_file_node = {
        'type': 'file',
        'name': file_path.name,
        'content': content,
        'start_line': 1,
        'end_line': len(content.splitlines()),
        'start_byte': 0,
        'end_byte': len(content.encode('utf-8'))
    }
    fallback_tree_info = {
        'has_ast': False,
        'fallback_reason': 'tree-sitter not available or language modules missing',
        'parser_available': TREE_SITTER_AVAILABLE
    }
    return {
        'file_path': file_path_str,
        'language': language,
        'content': content,
        'nodes': [whole_file_node],
        'tree_info': fallback_tree_info
    }
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def get_parser(language: str) -> Optional["Parser"]:
    """
    Get tree-sitter parser for language with caching.

    Args:
        language: Programming language identifier (a key of LANGUAGE_MODULES)

    Returns:
        Parser instance, or None if tree-sitter/the language module is
        unavailable or parser construction fails.
    """
    # NOTE: the previous `global _parser_cache` declaration was removed —
    # the cache dict is only mutated (item assignment), never rebound, so
    # `global` is unnecessary.
    if not TREE_SITTER_AVAILABLE:
        return None

    # Return cached parser
    if language in _parser_cache:
        return _parser_cache[language]

    try:
        # Get language module
        lang_module = LANGUAGE_MODULES.get(language)
        if not lang_module:
            logger.warning(f"No language module available for {language}")
            return None

        # Handle special case for TypeScript/TSX which have different API
        if language == 'typescript':
            lang_obj = Language(lang_module.language_typescript())
        elif language == 'tsx':
            lang_obj = Language(lang_module.language_tsx())
        else:
            # Create Language object from module for other languages
            lang_obj = Language(lang_module.language())

        # Create parser with language
        parser = Parser(lang_obj)
        _parser_cache[language] = parser
        return parser

    except Exception as e:
        logger.warning(f"Error creating parser for {language}: {e}")
        return None
|
|
358
|
+
|
|
359
|
+
def extract_nodes(root_node: Any, content: str, language: str) -> List[Dict[str, Any]]:
    """
    Extract relevant nodes from AST based on language-specific node types.

    Args:
        root_node: Root node of the AST
        content: Source code content
        language: Programming language

    Returns:
        List of extracted nodes (pre-order) with metadata including hierarchy
    """
    target_types = NODE_TYPES.get(language, [])
    if not target_types:
        return []

    parent_types = set(PARENT_NODE_TYPES.get(language, []))
    nodes: List[Dict[str, Any]] = []
    content_bytes = content.encode('utf-8')

    def traverse(node: Any, parent_id: Optional[str] = None, parent_info: Optional[Dict] = None) -> None:
        """Recursively traverse AST and extract target nodes with hierarchy tracking."""
        if node.type in target_types:
            node_id = str(uuid.uuid4())
            node_content = content_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='ignore')
            node_name = get_node_name(node, content_bytes)
            is_parent = node.type in parent_types

            nodes.append({
                'id': node_id,
                'type': node.type,
                'name': node_name,
                'content': node_content,
                # Tree-sitter points are 0-based; chunks use 1-based lines.
                'start_line': node.start_point[0] + 1,
                'end_line': node.end_point[0] + 1,
                'start_byte': node.start_byte,
                'end_byte': node.end_byte,
                'is_parent': is_parent,
                'parent_id': parent_id,
                'parent_type': parent_info.get('type') if parent_info else None,
                'parent_name': parent_info.get('name') if parent_info else None,
                'children_ids': []
            })

            if is_parent:
                # Descendants of a container node link back to it.
                for child in node.children:
                    traverse(child, node_id, {'type': node.type, 'name': node_name})
            else:
                for child in node.children:
                    traverse(child, parent_id, parent_info)
        else:
            for child in node.children:
                traverse(child, parent_id, parent_info)

    traverse(root_node)

    # Populate children_ids in a single O(n) grouping pass instead of
    # rescanning the full node list for every parent (previously O(n^2)).
    children_by_parent: Dict[str, List[str]] = {}
    for entry in nodes:
        if entry['parent_id'] is not None:
            children_by_parent.setdefault(entry['parent_id'], []).append(entry['id'])
    for entry in nodes:
        if entry['is_parent']:
            entry['children_ids'] = children_by_parent.get(entry['id'], [])

    return nodes
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def get_node_name(node: Any, content_bytes: bytes) -> Optional[str]:
    """
    Extract name from a node with language-aware and node-type-aware logic.

    Args:
        node: AST node
        content_bytes: Source code as bytes

    Returns:
        Node name or generated identifier
    """
    def span_text(n: Any) -> str:
        # Decode the byte range covered by *n* from the source buffer.
        return content_bytes[n.start_byte:n.end_byte].decode('utf-8', errors='ignore')

    try:
        kind = node.type

        # Markdown sections/headings: prefer the heading child's text.
        if kind in ('section', 'heading'):
            for child in node.children:
                if child.type in ('atx_heading', 'setext_heading'):
                    return span_text(child).strip().lstrip('#').strip()[:50]  # First 50 chars
            first_line = span_text(node).split('\n')[0]
            cleaned = first_line.strip().lstrip('#').strip()[:50]
            return cleaned if cleaned else f"section_line_{node.start_point[0] + 1}"

        # Arrow functions: name comes from the enclosing variable declarator.
        if kind == 'arrow_function':
            holder = node.parent
            if holder and holder.type in ('variable_declarator', 'lexical_declaration'):
                for child in holder.children:
                    if child.type == 'identifier':
                        return span_text(child)
            return f"arrow_fn_line_{node.start_point[0] + 1}"

        # JSON objects / YAML mappings: use the key of the enclosing pair.
        if kind in ('object', 'block_mapping'):
            holder = node.parent
            if holder and holder.type == 'pair':
                for child in holder.children:
                    if child.type in ('string', 'flow_node', 'identifier'):
                        return span_text(child).strip('"\'')[:30]
            return f"{kind}_line_{node.start_point[0] + 1}"

        # JSON arrays / YAML sequences: key of the enclosing pair + suffix.
        if kind in ('array', 'block_sequence'):
            holder = node.parent
            if holder and holder.type == 'pair':
                for child in holder.children:
                    if child.type in ('string', 'flow_node', 'identifier'):
                        key_text = span_text(child).strip('"\'')[:20]
                        return f"{key_text}_array"
            return f"{kind}_line_{node.start_point[0] + 1}"

        # Generic identifier extraction: look one, then two, levels down.
        identifier_types = {'identifier', 'name', 'property_identifier', 'type_identifier', 'field_identifier'}
        for child in node.children:
            if child.type in identifier_types:
                return span_text(child)
        for child in node.children:
            for grandchild in child.children:
                if grandchild.type in identifier_types:
                    return span_text(grandchild)

        return f"anonymous_{kind}_line_{node.start_point[0] + 1}"

    except Exception:
        return f"unknown_line_{node.start_point[0] + 1}" if hasattr(node, 'start_point') else None
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _save_ast_safely(file_path: Path, language: str, root_node: Any, content: str,
                     nodes: List[Dict[str, Any]], chunks_dir: Path, tree_info: Optional[Dict[str, Any]]) -> None:
    """
    Safely save AST visualization with error handling.

    Args:
        file_path: Path to source file
        language: Programming language
        root_node: AST root node (None for fallback)
        content: Source code content
        nodes: Extracted nodes
        chunks_dir: Directory for AST files
        tree_info: Tree metadata
    """
    try:
        logger.debug(f"Saving AST for {file_path}")
        save_ast_visualization(str(file_path), language, root_node, content, nodes, chunks_dir, tree_info)
    except Exception as e:
        # Visualization is best-effort diagnostics; never let a failure
        # here abort the parsing pipeline.
        logger.warning(f"Could not save AST for {file_path}: {e}")
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def _count_nodes(node: Any) -> int:
|
|
515
|
+
"""
|
|
516
|
+
Count total number of nodes in the AST.
|
|
517
|
+
|
|
518
|
+
Args:
|
|
519
|
+
node: AST node
|
|
520
|
+
|
|
521
|
+
Returns:
|
|
522
|
+
Total node count
|
|
523
|
+
"""
|
|
524
|
+
return 1 + sum(_count_nodes(child) for child in node.children)
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _get_tree_depth(node: Any, current_depth: int = 0) -> int:
|
|
528
|
+
"""
|
|
529
|
+
Get the maximum depth of the AST.
|
|
530
|
+
|
|
531
|
+
Args:
|
|
532
|
+
node: AST node
|
|
533
|
+
current_depth: Current depth level
|
|
534
|
+
|
|
535
|
+
Returns:
|
|
536
|
+
Maximum tree depth
|
|
537
|
+
"""
|
|
538
|
+
if not node.children:
|
|
539
|
+
return current_depth
|
|
540
|
+
return max(_get_tree_depth(child, current_depth + 1) for child in node.children)
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
# Explicit public API of this module.
__all__ = [
    'parse_file',
    'get_parser',
    'extract_nodes',
    'get_node_name',
    'NODE_TYPES',
    'LANGUAGE_MODULES',
    'TREE_SITTER_AVAILABLE',
]
|