contextinator 0.0.post81__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. contextinator/__init__.py +34 -0
  2. contextinator/__main__.py +13 -0
  3. contextinator/_version.py +34 -0
  4. contextinator/chunking/__init__.py +34 -0
  5. contextinator/chunking/ast_parser.py +551 -0
  6. contextinator/chunking/ast_visualizer.py +315 -0
  7. contextinator/chunking/chunk_service.py +271 -0
  8. contextinator/chunking/context_builder.py +120 -0
  9. contextinator/chunking/file_discovery.py +163 -0
  10. contextinator/chunking/node_collector.py +157 -0
  11. contextinator/chunking/splitter.py +166 -0
  12. contextinator/cli.py +812 -0
  13. contextinator/config/__init__.py +50 -0
  14. contextinator/config/settings.py +323 -0
  15. contextinator/embedding/__init__.py +23 -0
  16. contextinator/embedding/embedding_service.py +510 -0
  17. contextinator/py.typed +0 -0
  18. contextinator/tools/__init__.py +158 -0
  19. contextinator/tools/full_text_search.py +290 -0
  20. contextinator/tools/read_file.py +206 -0
  21. contextinator/tools/regex_search.py +187 -0
  22. contextinator/tools/semantic_search.py +209 -0
  23. contextinator/tools/symbol_search.py +216 -0
  24. contextinator/utils/__init__.py +49 -0
  25. contextinator/utils/exceptions.py +212 -0
  26. contextinator/utils/hash_utils.py +30 -0
  27. contextinator/utils/logger.py +62 -0
  28. contextinator/utils/output_formatter.py +183 -0
  29. contextinator/utils/progress.py +70 -0
  30. contextinator/utils/repo_utils.py +165 -0
  31. contextinator/utils/token_counter.py +75 -0
  32. contextinator/utils/toon_encoder.py +45 -0
  33. contextinator/vectorstore/__init__.py +18 -0
  34. contextinator/vectorstore/chroma_store.py +502 -0
  35. contextinator-0.0.post81.dist-info/METADATA +576 -0
  36. contextinator-0.0.post81.dist-info/RECORD +40 -0
  37. contextinator-0.0.post81.dist-info/WHEEL +5 -0
  38. contextinator-0.0.post81.dist-info/entry_points.txt +2 -0
  39. contextinator-0.0.post81.dist-info/licenses/LICENSE +201 -0
  40. contextinator-0.0.post81.dist-info/top_level.txt +1 -0
@@ -0,0 +1,34 @@
1
"""
Contextinator: Intelligent Codebase Understanding for AI Agents.

Transform any codebase into semantically-aware, searchable knowledge
for AI-powered workflows using AST parsing and vector embeddings.
"""

try:
    # _version.py is auto-generated by setuptools-scm when the package is built.
    from ._version import version as __version__
except ImportError:
    # Development checkout without a build: fall back to a sentinel version.
    __version__ = "0.0.0+unknown"


# Core functionality exports
from .chunking import chunk_repository
from .embedding import embed_chunks
from .vectorstore import store_repository_embeddings, ChromaVectorStore
# BUG FIX: read_file was listed in __all__ but never imported, so
# `from contextinator import *` raised AttributeError on "read_file".
from .tools import semantic_search, full_text_search, symbol_search, regex_search, read_file

__all__ = [
    "__version__",
    "chunk_repository",
    "embed_chunks",
    "store_repository_embeddings",
    "semantic_search",
    "full_text_search",
    "symbol_search",
    "regex_search",
    "read_file",
    "ChromaVectorStore",
]
@@ -0,0 +1,13 @@
1
"""
Entry point for running Contextinator as a module.

This allows the package to be executed with:
    python -m contextinator <command> [options]

This is the recommended way to run Contextinator when installed as a package.
"""

from .cli import main

# Delegate all argument parsing and command dispatch to the CLI module.
if __name__ == '__main__':
    main()
@@ -0,0 +1,34 @@
1
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# TYPE_CHECKING is hard-coded False, so the typing imports below are seen
# only by static type checkers and never executed at runtime.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # Runtime placeholders; the annotations below are informational only.
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.0.post81'
__version_tuple__ = version_tuple = (0, 0, 'post81')

__commit_id__ = commit_id = None
@@ -0,0 +1,34 @@
1
"""
Chunking module for Contextinator.

This module provides comprehensive functionality for parsing source code files,
extracting semantic chunks using AST analysis, and managing the chunking pipeline.

The main components include:
- File discovery with intelligent ignore patterns
- AST parsing using Tree-sitter for multiple languages
- Semantic node extraction (functions, classes, methods)
- Chunk splitting and deduplication
- AST visualization for debugging
"""

# Core chunking functionality
from .ast_parser import parse_file
from .ast_visualizer import save_ast_overview
from .chunk_service import chunk_repository, load_chunks, save_chunks
from .context_builder import build_context
from .file_discovery import discover_files
from .node_collector import collect_nodes
from .splitter import split_chunk

# Public API of the chunking package, kept in alphabetical order.
__all__ = [
    'build_context',
    'chunk_repository',
    'collect_nodes',
    'discover_files',
    'load_chunks',
    'parse_file',
    'save_ast_overview',
    'save_chunks',
    'split_chunk',
]
@@ -0,0 +1,551 @@
1
"""
Abstract Syntax Tree (AST) parsing module for Contextinator.

This module provides functionality to parse source code files using Tree-sitter
parsers and extract semantic code chunks like functions, classes, and methods.
"""

from pathlib import Path
import uuid
from typing import Any, Dict, List, Optional, TYPE_CHECKING

from ..config import SUPPORTED_EXTENSIONS
from ..utils.logger import logger

# Tree-sitter imports with graceful fallback: if tree-sitter or any grammar
# wheel is missing, the module still imports and parse_file() degrades to
# whole-file chunking (see _fallback_parse).
try:
    from tree_sitter import Language, Parser

    # Language-specific imports (one grammar wheel per language)
    import tree_sitter_bash
    import tree_sitter_c
    import tree_sitter_c_sharp
    import tree_sitter_cpp
    import tree_sitter_go
    import tree_sitter_java
    import tree_sitter_javascript
    import tree_sitter_json
    import tree_sitter_kotlin
    import tree_sitter_lua
    import tree_sitter_markdown
    import tree_sitter_php
    import tree_sitter_python
    import tree_sitter_rust
    import tree_sitter_solidity
    import tree_sitter_sql
    import tree_sitter_swift
    import tree_sitter_toml
    import tree_sitter_typescript
    import tree_sitter_yaml

    # Optional: dockerfile (not available on Windows)
    try:
        import tree_sitter_dockerfile
        HAS_DOCKERFILE = True
    except ImportError:
        tree_sitter_dockerfile = None
        HAS_DOCKERFILE = False
        logger.debug("tree-sitter-dockerfile not available (Windows platform)")

    # NOTE(review): save_ast_visualization is only bound when this outer try
    # succeeds. On the ImportError path below it is never imported, so
    # _save_ast_safely() raises a NameError (caught and logged there) and AST
    # data is silently not saved in fallback mode. Consider importing it
    # unconditionally — first verify ast_visualizer has no hard tree-sitter
    # dependency.
    from .ast_visualizer import save_ast_visualization

    # Language module mapping for parser creation
    LANGUAGE_MODULES: Dict[str, Any] = {
        'python': tree_sitter_python,
        'javascript': tree_sitter_javascript,
        'typescript': tree_sitter_typescript,
        'tsx': tree_sitter_typescript,  # TSX uses the same TypeScript module
        'java': tree_sitter_java,
        'go': tree_sitter_go,
        'rust': tree_sitter_rust,
        'cpp': tree_sitter_cpp,
        'c': tree_sitter_c,
        'csharp': tree_sitter_c_sharp,
        'cs': tree_sitter_c_sharp,  # Alternative C# extension
        'php': tree_sitter_php,
        'bash': tree_sitter_bash,
        'sh': tree_sitter_bash,  # Shell scripts
        'sql': tree_sitter_sql,
        'kotlin': tree_sitter_kotlin,
        'kt': tree_sitter_kotlin,  # Kotlin extension
        'yaml': tree_sitter_yaml,
        'yml': tree_sitter_yaml,  # Alternative YAML extension
        'markdown': tree_sitter_markdown,
        'md': tree_sitter_markdown,  # Markdown extension
        'json': tree_sitter_json,
        'toml': tree_sitter_toml,
        'swift': tree_sitter_swift,
        'solidity': tree_sitter_solidity,
        'sol': tree_sitter_solidity,  # Solidity extension
        'lua': tree_sitter_lua,
    }

    # Add dockerfile support if available (platform-dependent)
    if HAS_DOCKERFILE:
        LANGUAGE_MODULES['dockerfile'] = tree_sitter_dockerfile

    TREE_SITTER_AVAILABLE = True
    logger.info("Tree-sitter imports successful")

except ImportError as e:
    # Any missing wheel lands here; downstream code must check this flag.
    TREE_SITTER_AVAILABLE = False
    LANGUAGE_MODULES = {}
    logger.warning(f"Tree-sitter import failed: {e}")
    logger.info("💡 Install missing modules with: pip install tree-sitter tree-sitter-python tree-sitter-javascript ...")

if TYPE_CHECKING:
    # Re-import for type checkers only, so the "Parser" annotation below
    # resolves even when the runtime import above failed.
    from tree_sitter import Parser

# Node types to extract per language for semantic chunking
NODE_TYPES: Dict[str, List[str]] = {
    'python': ['function_definition', 'class_definition', 'decorated_definition', 'import_statement', 'import_from_statement'],
    'javascript': ['function_declaration', 'function_expression', 'arrow_function', 'class_declaration', 'method_definition', 'import_statement'],
    'typescript': ['function_declaration', 'function_expression', 'arrow_function', 'class_declaration', 'method_definition', 'interface_declaration', 'import_statement'],
    'tsx': ['function_declaration', 'function_expression', 'arrow_function', 'class_declaration', 'method_definition', 'interface_declaration', 'import_statement'],
    'java': ['class_declaration', 'method_declaration', 'constructor_declaration', 'interface_declaration', 'import_declaration'],
    'go': ['function_declaration', 'method_declaration', 'type_declaration', 'import_declaration'],
    'rust': ['function_item', 'impl_item', 'struct_item', 'enum_item', 'trait_item', 'use_declaration'],
    'cpp': ['function_definition', 'class_specifier', 'struct_specifier', 'preproc_include'],
    'c': ['function_definition', 'struct_specifier', 'preproc_include'],
    'csharp': ['class_declaration', 'method_declaration', 'constructor_declaration', 'interface_declaration', 'property_declaration', 'using_directive'],
    'cs': ['class_declaration', 'method_declaration', 'constructor_declaration', 'interface_declaration', 'property_declaration', 'using_directive'],
    'php': ['function_definition', 'class_declaration', 'method_declaration', 'namespace_use_declaration'],
    'bash': ['function_definition', 'command'],
    'sh': ['function_definition', 'command'],
    'sql': ['create_table_statement', 'create_view_statement', 'create_function_statement', 'create_procedure_statement'],
    'kotlin': ['class_declaration', 'function_declaration', 'property_declaration', 'object_declaration', 'import_header'],
    'kt': ['class_declaration', 'function_declaration', 'property_declaration', 'object_declaration', 'import_header'],
    'yaml': ['block_mapping', 'block_sequence'],
    'yml': ['block_mapping', 'block_sequence'],
    'markdown': ['section', 'heading', 'code_block'],
    'md': ['section', 'heading', 'code_block'],
    'dockerfile': ['instruction'],
    'json': ['object', 'array'],
    'toml': ['table', 'key_value'],
    'swift': ['class_declaration', 'function_declaration', 'protocol_declaration', 'struct_declaration', 'import_declaration'],
    'solidity': ['contract_declaration', 'function_definition', 'struct_definition', 'event_definition'],
    'sol': ['contract_declaration', 'function_definition', 'struct_definition', 'event_definition'],
    'lua': ['function_definition', 'local_function', 'table_constructor'],
}


# Container node types per language (e.g. classes): matched descendants of
# these are recorded as their children in the extracted hierarchy.
PARENT_NODE_TYPES: Dict[str, List[str]] = {
    'python': ['class_definition'],
    'javascript': ['class_declaration'],
    'typescript': ['class_declaration', 'interface_declaration'],
    'tsx': ['class_declaration', 'interface_declaration'],
    'java': ['class_declaration', 'interface_declaration'],
    'go': ['type_declaration'],
    'rust': ['impl_item', 'struct_item', 'enum_item', 'trait_item'],
    'cpp': ['class_specifier', 'struct_specifier'],
    'c': ['struct_specifier'],
    'csharp': ['class_declaration', 'interface_declaration'],
    'cs': ['class_declaration', 'interface_declaration'],
    'php': ['class_declaration'],
    'bash': [],
    'sh': [],
    'sql': [],
    'kotlin': ['class_declaration', 'object_declaration'],
    'kt': ['class_declaration', 'object_declaration'],
    'yaml': [],
    'yml': [],
    'markdown': [],
    'md': [],
    'dockerfile': [],
    'json': [],
    'toml': [],
    'swift': ['class_declaration', 'struct_declaration', 'protocol_declaration'],
    'solidity': ['contract_declaration', 'struct_definition'],
    'sol': ['contract_declaration', 'struct_definition'],
    'lua': ['table_constructor'],
}

# Cache for parsers to avoid recreation (populated lazily by get_parser)
_parser_cache: Dict[str, "Parser"] = {}
165
+
166
+
167
def parse_file(
    file_path: Path,
    save_ast: bool = False,
    chunks_dir: Optional[Path] = None,
    repo_path: Optional[Path] = None
) -> Optional[Dict[str, Any]]:
    """
    Parse a file and return its AST representation with extracted nodes.

    Falls back to a single file-level chunk whenever Tree-sitter is
    unavailable, no parser exists for the language, parsing raises, or no
    semantic nodes are found.

    Args:
        file_path: Path to the file to parse (absolute path)
        save_ast: Whether to save AST visualization data
        chunks_dir: Repository-specific chunks directory for AST data (required if save_ast=True)
        repo_path: Repository root path for computing relative paths (optional)

    Returns:
        Dictionary containing AST nodes and metadata, or None if parsing fails

    Raises:
        ValidationError: If save_ast is True but chunks_dir is None
        FileSystemError: If file cannot be read
    """
    # Imported lazily (function scope) to avoid a circular import at load time.
    from ..utils.exceptions import ValidationError, FileSystemError

    if save_ast and chunks_dir is None:
        raise ValidationError("chunks_dir is required when save_ast=True", "chunks_dir", "Path object")

    # Compute repo-relative path with forward slashes for cross-platform compatibility
    if repo_path:
        try:
            file_path_str = file_path.relative_to(repo_path).as_posix()
        except ValueError:
            # File lives outside the repo root; fall back to the absolute path.
            logger.warning(f"File {file_path} is not within repo {repo_path}, using absolute path")
            file_path_str = str(file_path)
    else:
        file_path_str = str(file_path)

    try:
        language = SUPPORTED_EXTENSIONS.get(file_path.suffix)
        if not language:
            logger.debug(f"Unsupported file extension: {file_path.suffix}")
            return None

        # Handle file reading errors gracefully
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except (OSError, IOError, PermissionError) as e:
            raise FileSystemError(f"Cannot read file: {e}", str(file_path), "read")

        root_node = None  # Only set when a real AST was produced.

        if not TREE_SITTER_AVAILABLE:
            # Fallback when Tree-sitter unavailable
            logger.warning(f"Tree-sitter not available, using fallback for {file_path}")
            result = _fallback_parse(file_path, file_path_str, language, content)
        else:
            try:
                parser = get_parser(language)
                if not parser:
                    logger.warning(f"No parser available for {language}, using fallback for {file_path}")
                    result = _fallback_parse(file_path, file_path_str, language, content)
                else:
                    tree = parser.parse(bytes(content, 'utf-8'))
                    nodes = extract_nodes(tree.root_node, content, language)
                    logger.debug(f"Parsed {file_path} - Found {len(nodes)} semantic nodes")

                    if not nodes:
                        # If no nodes extracted, fallback to file-level
                        logger.warning(f"No semantic nodes found in {file_path}, using file-level chunking")
                        result = _fallback_parse(file_path, file_path_str, language, content)
                    else:
                        root_node = tree.root_node
                        result = {
                            'file_path': file_path_str,
                            'language': language,
                            'content': content,
                            'nodes': nodes,
                            'tree_info': {
                                'has_ast': True,
                                'root_node_type': root_node.type,
                                'total_nodes': _count_nodes(root_node),
                                'tree_depth': _get_tree_depth(root_node)
                            }
                        }
            except Exception as e:
                # Fallback to file-level chunking on any parsing error
                logger.warning(f"AST parsing failed for {file_path}, using fallback: {e}")
                result = _fallback_parse(file_path, file_path_str, language, content)
                root_node = None

        # BUG FIX: the "no parser" and "parse error" fallbacks previously
        # returned early and skipped this step, silently ignoring save_ast.
        # All outcomes now honor the save_ast request.
        if save_ast and chunks_dir:
            _save_ast_safely(file_path, language, root_node, content, result['nodes'], chunks_dir, result.get('tree_info'))

        return result

    except (ValidationError, FileSystemError):
        # Re-raise our custom exceptions
        raise
    except Exception as e:
        # Pattern 1: Log unexpected errors and continue
        logger.error(f"Unexpected error parsing {file_path}: {e}")
        return None
279
+
280
+
281
def _fallback_parse(file_path: Path, file_path_str: str, language: str, content: str) -> Dict[str, Any]:
    """
    Build a whole-file chunk when tree-sitter is unavailable or AST parsing fails.

    Args:
        file_path: Absolute path to the file being parsed (for logging/display)
        file_path_str: Repo-relative file path string to store in metadata
        language: Programming language identifier
        content: File content

    Returns:
        Dictionary with file-level chunk information
    """
    # The entire file becomes one pseudo-node of type 'file'.
    whole_file_node = {
        'type': 'file',
        'name': file_path.name,
        'content': content,
        'start_line': 1,
        'end_line': len(content.splitlines()),
        'start_byte': 0,
        'end_byte': len(content.encode('utf-8')),
    }
    return {
        'file_path': file_path_str,
        'language': language,
        'content': content,
        'nodes': [whole_file_node],
        'tree_info': {
            'has_ast': False,
            'fallback_reason': 'tree-sitter not available or language modules missing',
            'parser_available': TREE_SITTER_AVAILABLE,
        },
    }
313
+
314
+
315
def get_parser(language: str) -> Optional["Parser"]:
    """
    Return a cached tree-sitter parser for the given language.

    Args:
        language: Programming language identifier

    Returns:
        Parser instance or None if unavailable
    """
    if not TREE_SITTER_AVAILABLE:
        return None

    # Serve from the module-level cache when possible.
    cached = _parser_cache.get(language)
    if cached is not None:
        return cached

    try:
        lang_module = LANGUAGE_MODULES.get(language)
        if not lang_module:
            logger.warning(f"No language module available for {language}")
            return None

        # The TypeScript wheel ships two grammars (TS and TSX) behind distinct
        # factory functions; every other module exposes a plain language().
        if language == 'typescript':
            lang_obj = Language(lang_module.language_typescript())
        elif language == 'tsx':
            lang_obj = Language(lang_module.language_tsx())
        else:
            lang_obj = Language(lang_module.language())

        parser = Parser(lang_obj)
        _parser_cache[language] = parser
        return parser

    except Exception as e:
        logger.warning(f"Error creating parser for {language}: {e}")
        return None
358
+
359
def extract_nodes(root_node: Any, content: str, language: str) -> List[Dict[str, Any]]:
    """
    Extract relevant nodes from AST based on language-specific node types.

    Args:
        root_node: Root node of the AST
        content: Source code content
        language: Programming language

    Returns:
        List of extracted nodes with metadata including hierarchy
    """
    target_types = NODE_TYPES.get(language, [])
    if not target_types:
        return []

    parent_types = set(PARENT_NODE_TYPES.get(language, []))
    nodes: List[Dict[str, Any]] = []
    content_bytes = content.encode('utf-8')

    def traverse(node: Any, parent_id: Optional[str] = None, parent_info: Optional[Dict] = None) -> None:
        """Recursively traverse AST and extract target nodes with hierarchy tracking."""
        if node.type in target_types:
            node_id = str(uuid.uuid4())
            node_name = get_node_name(node, content_bytes)
            is_parent = node.type in parent_types

            nodes.append({
                'id': node_id,
                'type': node.type,
                'name': node_name,
                'content': content_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='ignore'),
                'start_line': node.start_point[0] + 1,  # tree-sitter rows are 0-based
                'end_line': node.end_point[0] + 1,
                'start_byte': node.start_byte,
                'end_byte': node.end_byte,
                'is_parent': is_parent,
                'parent_id': parent_id,
                'parent_type': parent_info.get('type') if parent_info else None,
                'parent_name': parent_info.get('name') if parent_info else None,
                'children_ids': []
            })

            # Container nodes (e.g. classes) become the parent context for
            # their descendants; other matches pass context through unchanged.
            if is_parent:
                parent_id, parent_info = node_id, {'type': node.type, 'name': node_name}

        for child in node.children:
            traverse(child, parent_id, parent_info)

    traverse(root_node)

    # Populate children_ids with one O(n) grouping pass instead of the
    # previous per-parent list comprehension (accidental O(n^2)).
    ids_by_parent: Dict[str, List[str]] = {}
    for entry in nodes:
        if entry['parent_id'] is not None:
            ids_by_parent.setdefault(entry['parent_id'], []).append(entry['id'])
    for entry in nodes:
        if entry['is_parent']:
            entry['children_ids'] = ids_by_parent.get(entry['id'], [])

    return nodes
423
+
424
+
425
+ def get_node_name(node: Any, content_bytes: bytes) -> Optional[str]:
426
+ """
427
+ Extract name from a node with language-aware and node-type-aware logic.
428
+
429
+ Args:
430
+ node: AST node
431
+ content_bytes: Source code as bytes
432
+
433
+ Returns:
434
+ Node name or generated identifier
435
+ """
436
+ try:
437
+ node_type = node.type
438
+
439
+ # Special handling for different node types
440
+ if node_type in ('section', 'heading'):
441
+ for child in node.children:
442
+ if child.type in ('atx_heading', 'setext_heading'):
443
+ text = content_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
444
+ return text.strip().lstrip('#').strip()[:50] # First 50 chars
445
+ first_line = content_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='ignore').split('\n')[0]
446
+ cleaned = first_line.strip().lstrip('#').strip()[:50]
447
+ return cleaned if cleaned else f"section_line_{node.start_point[0] + 1}"
448
+
449
+ if node_type == 'arrow_function':
450
+ parent = node.parent
451
+ if parent and parent.type in ('variable_declarator', 'lexical_declaration'):
452
+ for child in parent.children:
453
+ if child.type == 'identifier':
454
+ return content_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
455
+ return f"arrow_fn_line_{node.start_point[0] + 1}"
456
+
457
+ if node_type in ('object', 'block_mapping'):
458
+ parent = node.parent
459
+ if parent and parent.type == 'pair':
460
+ for child in parent.children:
461
+ if child.type in ('string', 'flow_node', 'identifier'):
462
+ key = content_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
463
+ cleaned_key = key.strip('"\'')[:30]
464
+ return cleaned_key
465
+ return f"{node_type}_line_{node.start_point[0] + 1}"
466
+
467
+ if node_type in ('array', 'block_sequence'):
468
+ parent = node.parent
469
+ if parent and parent.type == 'pair':
470
+ for child in parent.children:
471
+ if child.type in ('string', 'flow_node', 'identifier'):
472
+ key = content_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
473
+ cleaned_key = key.strip('"\'')[:20]
474
+ return f"{cleaned_key}_array"
475
+ return f"{node_type}_line_{node.start_point[0] + 1}"
476
+
477
+ # Generic identifier extraction
478
+ identifier_types = {'identifier', 'name', 'property_identifier', 'type_identifier', 'field_identifier'}
479
+ for child in node.children:
480
+ if child.type in identifier_types:
481
+ return content_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
482
+ for child in node.children:
483
+ for grandchild in child.children:
484
+ if grandchild.type in identifier_types:
485
+ return content_bytes[grandchild.start_byte:grandchild.end_byte].decode('utf-8', errors='ignore')
486
+
487
+ return f"anonymous_{node_type}_line_{node.start_point[0] + 1}"
488
+
489
+ except Exception:
490
+ return f"unknown_line_{node.start_point[0] + 1}" if hasattr(node, 'start_point') else None
491
+
492
+
493
def _save_ast_safely(file_path: Path, language: str, root_node: Any, content: str,
                     nodes: List[Dict[str, Any]], chunks_dir: Path, tree_info: Optional[Dict[str, Any]]) -> None:
    """
    Safely save AST visualization with error handling.

    Best-effort: any failure is logged as a warning and swallowed so that a
    visualization problem never aborts the parsing pipeline.

    Args:
        file_path: Path to source file
        language: Programming language
        root_node: AST root node (None for fallback)
        content: Source code content
        nodes: Extracted nodes
        chunks_dir: Directory for AST files
        tree_info: Tree metadata
    """
    try:
        logger.debug(f"Saving AST for {file_path}")
        save_ast_visualization(str(file_path), language, root_node, content, nodes, chunks_dir, tree_info)
    except Exception as e:
        # NOTE(review): when the tree-sitter import block failed,
        # save_ast_visualization is never bound and this except also absorbs
        # the resulting NameError — the AST is then silently never saved.
        logger.warning(f"Could not save AST for {file_path}: {e}")
512
+
513
+
514
def _count_nodes(node: Any) -> int:
    """
    Count total number of nodes in the AST.

    Iterative (explicit stack) rather than recursive, so very deep trees
    cannot overflow Python's recursion limit.

    Args:
        node: AST node

    Returns:
        Total node count
    """
    count = 0
    stack = [node]
    while stack:
        current = stack.pop()
        count += 1
        stack.extend(current.children)
    return count
525
+
526
+
527
def _get_tree_depth(node: Any, current_depth: int = 0) -> int:
    """
    Get the maximum depth of the AST.

    Iterative (explicit stack) rather than recursive, so very deep trees
    cannot overflow Python's recursion limit. Signature (including
    current_depth) unchanged for backward compatibility.

    Args:
        node: AST node
        current_depth: Depth assigned to `node` itself

    Returns:
        Maximum tree depth
    """
    max_depth = current_depth
    stack = [(node, current_depth)]
    while stack:
        current, depth = stack.pop()
        if depth > max_depth:
            max_depth = depth
        stack.extend((child, depth + 1) for child in current.children)
    return max_depth
541
+
542
+
543
# Public API of this module; private helpers (_fallback_parse, _save_ast_safely,
# _count_nodes, _get_tree_depth) are deliberately excluded.
__all__ = [
    'parse_file',
    'get_parser',
    'extract_nodes',
    'get_node_name',
    'NODE_TYPES',
    'LANGUAGE_MODULES',
    'TREE_SITTER_AVAILABLE',
]