roma-debug 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,598 @@
1
+ """Tree-sitter based parser for multi-language support.
2
+
3
+ Provides semantic parsing for JavaScript, TypeScript, Go, Rust, Java,
4
+ and other languages using tree-sitter grammars.
5
+ """
6
+
7
+ import os
8
+ from typing import Optional, List, Dict, Any
9
+
10
+ from roma_debug.core.models import Language, Symbol, Import
11
+ from roma_debug.parsers.base import BaseParser
12
+
13
+ # Try to import tree-sitter
14
+ try:
15
+ import tree_sitter
16
+ TREE_SITTER_AVAILABLE = True
17
+ except ImportError:
18
+ TREE_SITTER_AVAILABLE = False
19
+ tree_sitter = None
20
+
21
# Importable grammar package for each supported language. Iteration order of
# this mapping is the order in which languages are probed and reported.
_LANGUAGE_MODULES: Dict[Language, str] = dict(
    [
        (Language.PYTHON, "tree_sitter_python"),
        (Language.JAVASCRIPT, "tree_sitter_javascript"),
        (Language.TYPESCRIPT, "tree_sitter_typescript"),
        (Language.GO, "tree_sitter_go"),
        (Language.RUST, "tree_sitter_rust"),
        (Language.JAVA, "tree_sitter_java"),
    ]
)
30
+
31
+
32
def _get_tree_sitter_language(lang: Language) -> Optional[Any]:
    """Get the tree-sitter language object for a language.

    The grammar package named in ``_LANGUAGE_MODULES`` is imported lazily.
    Single-grammar packages expose a ``language()`` factory; packages that
    bundle several grammars (``tree_sitter_typescript`` ships TypeScript and
    TSX) expose ``language_<name>()`` instead, so both spellings are probed.

    Args:
        lang: The Language enum value

    Returns:
        tree-sitter Language object, or None if tree-sitter itself, the
        grammar package, or a usable factory function is not available
    """
    if not TREE_SITTER_AVAILABLE:
        return None

    module_name = _LANGUAGE_MODULES.get(lang)
    if not module_name:
        return None

    # Function-scope import keeps this helper self-contained.
    import importlib

    # "tree_sitter_typescript" -> "typescript" -> "language_typescript"
    prefix = "tree_sitter_"
    short_name = module_name[len(prefix):] if module_name.startswith(prefix) else module_name
    factory_names = ("language", "language_" + short_name)

    try:
        module = importlib.import_module(module_name)
        for factory_name in factory_names:
            factory = getattr(module, factory_name, None)
            if callable(factory):
                return tree_sitter.Language(factory())
    except Exception:
        # Deliberately best-effort: a missing package (ImportError) or a
        # grammar built against an incompatible tree-sitter version must
        # only mark this language as unsupported, never break the import
        # of this module.
        return None

    return None
58
+
59
+
60
# Node types that represent functions/methods in each language.
# Rust's ``impl_item`` is intentionally NOT listed here: an impl block is a
# container and is listed in CLASS_TYPES. Function types are checked before
# class types during symbol extraction, so listing it in both tables would
# label every impl block "function" and prevent the ``fn`` items inside it
# from being labelled "method".
FUNCTION_TYPES: Dict[Language, List[str]] = {
    Language.PYTHON: ["function_definition", "async_function_definition"],
    Language.JAVASCRIPT: ["function_declaration", "function_expression", "arrow_function", "method_definition"],
    Language.TYPESCRIPT: ["function_declaration", "function_expression", "arrow_function", "method_definition", "method_signature"],
    Language.GO: ["function_declaration", "method_declaration"],
    Language.RUST: ["function_item"],
    Language.JAVA: ["method_declaration", "constructor_declaration"],
}

# Node types that represent classes/structs (symbol kind "class") in each language
CLASS_TYPES: Dict[Language, List[str]] = {
    Language.PYTHON: ["class_definition"],
    Language.JAVASCRIPT: ["class_declaration", "class"],
    Language.TYPESCRIPT: ["class_declaration", "interface_declaration"],
    Language.GO: ["type_declaration"],  # for struct types
    Language.RUST: ["struct_item", "enum_item", "impl_item"],
    Language.JAVA: ["class_declaration", "interface_declaration", "enum_declaration"],
}

# Node types for imports
IMPORT_TYPES: Dict[Language, List[str]] = {
    Language.PYTHON: ["import_statement", "import_from_statement"],
    Language.JAVASCRIPT: ["import_statement", "import_declaration"],
    Language.TYPESCRIPT: ["import_statement", "import_declaration"],
    Language.GO: ["import_declaration", "import_spec"],
    Language.RUST: ["use_declaration"],
    Language.JAVA: ["import_declaration"],
}
89
+
90
+
91
class TreeSitterParser(BaseParser):
    """Multi-language parser using tree-sitter.

    Supports JavaScript, TypeScript, Go, Rust, Java, and more.
    Falls back gracefully when tree-sitter is not installed: ``parse``
    returns ``False`` whenever no grammar could be loaded.

    A successful ``parse`` fills two flat lists, symbols (functions, methods,
    classes) and imports, which are exposed through ``find_all_symbols``,
    ``find_enclosing_symbol`` and ``extract_imports``.
    """

    # Grammar field names probed, in order, to locate a definition's name.
    _NAME_FIELDS = ("name", "identifier", "property_name")
    # Direct-child node types accepted as a name when no field matches.
    _NAME_NODE_TYPES = ("identifier", "property_identifier", "type_identifier")
    # Rust node types that form a single segment of a ``use`` path.
    _RUST_PATH_LEAVES = ("identifier", "crate", "self", "super")

    def __init__(self, language: Language = Language.UNKNOWN):
        """Initialize the tree-sitter parser.

        Args:
            language: The language to parse (can be set later)
        """
        super().__init__()
        self._lang = language
        self._tree: Optional[Any] = None
        self._ts_language: Optional[Any] = None
        self._parser: Optional[Any] = None
        self._symbols: List[Symbol] = []
        self._imports: List[Import] = []
        # UTF-8 encoding of the source handed to tree-sitter; node byte
        # offsets index into this buffer, not into the ``str``.
        self._source_bytes: Optional[bytes] = None

    @property
    def language(self) -> Language:
        """Return the language this parser handles."""
        return self._lang

    @language.setter
    def language(self, lang: Language):
        """Set the language and initialize the parser."""
        if self._lang != lang:
            self._lang = lang
            self._init_parser()

    @classmethod
    def is_available(cls) -> bool:
        """Check if tree-sitter is available."""
        return TREE_SITTER_AVAILABLE

    @classmethod
    def supported_languages(cls) -> List[Language]:
        """Get list of languages with available tree-sitter support."""
        if not TREE_SITTER_AVAILABLE:
            return []

        return [
            lang
            for lang in _LANGUAGE_MODULES
            if _get_tree_sitter_language(lang) is not None
        ]

    def _init_parser(self):
        """Initialize the tree-sitter parser for the current language.

        Any parser built for a previously selected language is discarded
        first, so switching to a language without an installed grammar
        leaves ``self._parser`` as ``None`` instead of silently reusing the
        old grammar.
        """
        self._parser = None
        self._ts_language = None

        if not TREE_SITTER_AVAILABLE:
            return

        self._ts_language = _get_tree_sitter_language(self._lang)
        if self._ts_language is not None:
            self._parser = tree_sitter.Parser(self._ts_language)

    def parse(self, source: str, filepath: str = "") -> bool:
        """Parse source code using tree-sitter.

        Args:
            source: The source code to parse
            filepath: Optional file path for context; its extension is used
                to pick the language when none has been set

        Returns:
            True if parsing and symbol/import extraction succeeded, False if
            no parser is available for the language or any step raised
        """
        self.reset()
        self._source = source
        self._filepath = filepath
        self._lines = source.splitlines()

        # Auto-detect language from filepath if not set
        if self._lang == Language.UNKNOWN and filepath:
            self._lang = Language.from_extension(os.path.splitext(filepath)[1])
            self._init_parser()

        if self._parser is None:
            self._init_parser()

        if self._parser is None:
            return False

        try:
            encoded = source.encode('utf-8')
            self._source_bytes = encoded
            self._tree = self._parser.parse(encoded)
            self._extract_symbols()
            self._extract_imports_internal()
        except Exception:
            # Best-effort contract: report failure instead of raising, and
            # do not leave half-extracted results behind a False return.
            self._tree = None
            self._symbols = []
            self._imports = []
            self._parsed = False
            return False

        self._parsed = True
        return True

    def reset(self):
        """Reset parser state."""
        super().reset()
        self._tree = None
        self._symbols = []
        self._imports = []
        self._source_bytes = None

    def _get_node_text(self, node) -> str:
        """Get the text content of a tree-sitter node.

        ``start_byte``/``end_byte`` are offsets into the UTF-8 bytes that
        were parsed, so the slice is taken from the encoded buffer and then
        decoded; slicing the ``str`` would drift after the first non-ASCII
        character.
        """
        data = self._source_bytes
        if data is None:
            if self._source is None:
                return ""
            data = self._source.encode('utf-8')
        return data[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    def _get_name_from_node(self, node) -> Optional[str]:
        """Extract the name identifier from a definition node.

        Lookup order:
          1. a grammar field from ``_NAME_FIELDS`` (most precise; avoids
             mistaking e.g. a return type that precedes the name for it),
          2. the first direct child whose type is in ``_NAME_NODE_TYPES``,
          3. the first direct child whose type contains "identifier".

        Returns:
            The name text, or None when no candidate child exists
        """
        if hasattr(node, 'child_by_field_name'):
            for field in self._NAME_FIELDS:
                name_node = node.child_by_field_name(field)
                if name_node is not None:
                    return self._get_node_text(name_node)

        for child in node.children:
            if child.type in self._NAME_NODE_TYPES:
                return self._get_node_text(child)

        # Fallback: first identifier-like child
        for child in node.children:
            if "identifier" in child.type:
                return self._get_node_text(child)

        return None

    def _extract_symbols(self):
        """Extract all function and class symbols from the parse tree.

        The tree is walked in document (pre-order) order with an explicit
        stack, so deeply nested sources cannot exhaust Python's recursion
        limit. A function found while the nearest enclosing symbol is a
        class is recorded with kind "method".
        """
        if self._tree is None:
            return

        function_types = FUNCTION_TYPES.get(self._lang, [])
        class_types = CLASS_TYPES.get(self._lang, [])

        # Each entry pairs a node with the nearest enclosing symbol (or None).
        pending = [(self._tree.root_node, None)]
        while pending:
            node, parent_symbol = pending.pop()

            kind = None
            if node.type in function_types:
                kind = "function"
                if parent_symbol is not None and parent_symbol.kind == "class":
                    kind = "method"
            elif node.type in class_types:
                kind = "class"

            symbol = None
            if kind:
                name = self._get_name_from_node(node)
                if name:
                    symbol = Symbol(
                        name=name,
                        kind=kind,
                        # tree-sitter rows are 0-based; symbols use 1-based lines
                        start_line=node.start_point[0] + 1,
                        end_line=node.end_point[0] + 1,
                        start_col=node.start_point[1],
                        end_col=node.end_point[1],
                        parent=parent_symbol,
                    )
                    self._symbols.append(symbol)

            # Unnamed definitions do not become parents; their children keep
            # the enclosing symbol.
            child_parent = symbol if symbol is not None else parent_symbol
            # Push in reverse so children are popped left-to-right.
            for child in reversed(node.children):
                pending.append((child, child_parent))

    def _extract_imports_internal(self):
        """Extract import statements from the parse tree.

        Every node whose type is listed in ``IMPORT_TYPES`` for the current
        language is handed to ``_parse_import_node``; the walk is iterative
        and visits nodes in document order.
        """
        if self._tree is None:
            return

        import_types = IMPORT_TYPES.get(self._lang, [])

        pending = [self._tree.root_node]
        while pending:
            node = pending.pop()
            if node.type in import_types:
                imp = self._parse_import_node(node)
                if imp is not None:
                    self._imports.append(imp)
            pending.extend(reversed(node.children))

    def _parse_import_node(self, node) -> Optional[Import]:
        """Parse an import node into an Import object.

        Dispatches to the language-specific parser; languages without one
        get a generic Import whose module name is the raw statement text.
        """
        import_text = self._get_node_text(node)
        line_number = node.start_point[0] + 1

        handlers = {
            Language.PYTHON: self._parse_python_import,
            Language.JAVASCRIPT: self._parse_js_import,
            Language.TYPESCRIPT: self._parse_js_import,
            Language.GO: self._parse_go_import,
            Language.RUST: self._parse_rust_import,
            Language.JAVA: self._parse_java_import,
        }
        handler = handlers.get(self._lang)
        if handler is not None:
            return handler(node, import_text, line_number)

        # Generic fallback
        return Import(
            module_name=import_text,
            line_number=line_number,
            language=self._lang,
        )

    def _parse_python_import(self, node, text: str, line: int) -> Optional[Import]:
        """Parse Python import statement.

        Handles ``import x``, ``import x as y`` and ``from x import y``
        (including relative and wildcard forms). One Import is produced per
        statement, so for ``import a, b`` the last module listed wins.

        Args:
            node: ``import_statement`` or ``import_from_statement`` node
            text: Raw statement text (unused here)
            line: 1-based line number of the statement

        Returns:
            The Import, or None if neither a module nor names were found
        """
        module_name = ""
        imported_names: List[str] = []
        is_relative = False
        relative_level = 0

        if node.type == "import_statement":
            # import x, import x as y
            for child in node.children:
                if child.type == "dotted_name":
                    module_name = self._get_node_text(child)
                elif child.type == "aliased_import":
                    for subchild in child.children:
                        if subchild.type == "dotted_name":
                            module_name = self._get_node_text(subchild)
                            break

        elif node.type == "import_from_statement":
            # from x import y -- children before the ``import`` keyword name
            # the module, children after it name what is imported. Both
            # sides can be ``dotted_name`` nodes, so position decides.
            past_import_keyword = False
            for child in node.children:
                if child.type == "import":
                    past_import_keyword = True
                elif not past_import_keyword:
                    if child.type == "dotted_name":
                        module_name = self._get_node_text(child)
                    elif child.type == "relative_import":
                        is_relative = True
                        rel_text = self._get_node_text(child)
                        # Only the leading dots count; dots inside the module
                        # path (``..pkg.mod``) are separators.
                        relative_level = len(rel_text) - len(rel_text.lstrip('.'))
                        for subchild in child.children:
                            if subchild.type == "import_prefix":
                                relative_level = self._get_node_text(subchild).count('.')
                            elif subchild.type == "dotted_name":
                                # Module name after the dots
                                module_name = self._get_node_text(subchild)
                elif child.type in ("dotted_name", "identifier", "wildcard_import"):
                    imported_names.append(self._get_node_text(child))
                elif child.type == "aliased_import":
                    # Record the imported name, not its ``as`` alias: the
                    # name is the first name-like child.
                    for subchild in child.children:
                        if subchild.type in ("dotted_name", "identifier"):
                            imported_names.append(self._get_node_text(subchild))
                            break

        if not module_name and not imported_names:
            return None

        return Import(
            module_name=module_name,
            imported_names=imported_names,
            is_relative=is_relative,
            relative_level=relative_level,
            line_number=line,
            language=Language.PYTHON,
        )

    def _parse_js_import(self, node, text: str, line: int) -> Optional[Import]:
        """Parse JavaScript/TypeScript import statement.

        The module path is read from the statement's string child; a default
        or namespace (``* as X``) binding becomes the alias, and the first
        identifier of each ``{ a, b as c }`` specifier becomes an imported
        name. Paths starting with ``.`` or ``/`` are flagged as relative.

        Returns:
            The Import, or None if the statement has no module string
        """
        module_name = ""
        imported_names: List[str] = []
        alias = None

        for child in node.children:
            if child.type == "string":
                # The module path is in a string; drop the surrounding quotes
                module_name = self._get_node_text(child).strip("'\"")
                continue
            if child.type != "import_clause":
                continue

            for binding in child.children:
                if binding.type == "identifier":
                    # Default import
                    alias = self._get_node_text(binding)
                elif binding.type == "named_imports":
                    # Named imports: { a, b, c }
                    for spec in binding.children:
                        if spec.type != "import_specifier":
                            continue
                        for name_node in spec.children:
                            if name_node.type == "identifier":
                                imported_names.append(self._get_node_text(name_node))
                                break
                elif binding.type == "namespace_import":
                    # import * as X
                    for name_node in binding.children:
                        if name_node.type == "identifier":
                            alias = self._get_node_text(name_node)

        if not module_name:
            return None

        is_relative = module_name.startswith(('.', '/'))

        return Import(
            module_name=module_name,
            alias=alias,
            imported_names=imported_names,
            is_relative=is_relative,
            line_number=line,
            language=self._lang,
        )

    def _parse_go_import(self, node, text: str, line: int) -> Optional[Import]:
        """Parse Go import statement.

        ``import_spec`` nodes carry the actual import (path plus optional
        alias, ``_`` or ``.``). An ``import_declaration`` that contains
        specs yields nothing itself: ``IMPORT_TYPES`` lists ``import_spec``
        too, so the tree walk reaches every spec on its own, each with its
        own line number. Emitting from the declaration as well would
        duplicate the first spec.

        Returns:
            The Import, or None for a wrapper declaration or a node without
            an import path
        """
        module_name = ""
        alias = None

        if node.type == "import_spec":
            for child in node.children:
                if child.type == "interpreted_string_literal":
                    module_name = self._get_node_text(child).strip('"')
                elif child.type == "raw_string_literal":
                    # Go also allows back-quoted import paths
                    module_name = self._get_node_text(child).strip('`')
                elif child.type == "package_identifier":
                    alias = self._get_node_text(child)
                elif child.type == "blank_identifier":
                    alias = "_"
                elif child.type == "dot":
                    alias = "."
        elif node.type == "import_declaration":
            for child in node.children:
                if child.type in ("import_spec", "import_spec_list"):
                    # Specs are import nodes in their own right (see docstring)
                    return None
                if child.type == "interpreted_string_literal":
                    # Path attached directly to the declaration
                    module_name = self._get_node_text(child).strip('"')

        if not module_name:
            return None

        return Import(
            module_name=module_name,
            alias=alias,
            line_number=line,
            language=Language.GO,
        )

    def _parse_rust_import(self, node, text: str, line: int) -> Optional[Import]:
        """Parse Rust use statement.

        use statements can be complex: ``use std::io::{Read, Write};``.
        The path before any brace list becomes the module name (segments
        joined with ``::``, including ``crate``/``self``/``super``); entries
        of a brace list become imported names.

        Returns:
            The Import, or None if neither a path nor names were found
        """
        module_name = ""
        imported_names: List[str] = []
        leaves = self._RUST_PATH_LEAVES

        def extract_path(n) -> str:
            # Flatten a (possibly nested) scoped_identifier into "a::b::c".
            if n.type in leaves:
                return self._get_node_text(n)
            if n.type == "scoped_identifier":
                parts = [
                    extract_path(part)
                    for part in n.children
                    if part.type in leaves or part.type == "scoped_identifier"
                ]
                return "::".join(p for p in parts if p)
            return ""

        def collect_names(use_list) -> None:
            # ``self`` inside braces (``{self, Read}``) names the module itself.
            for item in use_list.children:
                if "identifier" in item.type or item.type == "self":
                    imported_names.append(self._get_node_text(item))

        for child in node.children:
            if child.type == "use_list":
                # Multiple imports
                collect_names(child)
            elif child.type == "scoped_identifier" or child.type == "identifier":
                module_name = extract_path(child)
            elif child.type == "scoped_use_list":
                # use std::io::{Read, Write}
                for subchild in child.children:
                    if subchild.type == "scoped_identifier" or subchild.type in leaves:
                        module_name = extract_path(subchild)
                    elif subchild.type == "use_list":
                        collect_names(subchild)

        if not module_name and not imported_names:
            return None

        return Import(
            module_name=module_name,
            imported_names=imported_names,
            line_number=line,
            language=Language.RUST,
        )

    def _parse_java_import(self, node, text: str, line: int) -> Optional[Import]:
        """Parse Java import statement.

        The dotted name (``com.example.MyClass``) becomes the module name;
        a trailing ``.*`` is recorded as the imported name ``"*"``.

        Returns:
            The Import, or None if no name was found
        """
        module_name = ""
        imported_names: List[str] = []

        def collect_parts(n, parts: List[str]) -> None:
            # Depth-first so segments come out in source order.
            for c in n.children:
                if c.type == "identifier":
                    parts.append(self._get_node_text(c))
                elif c.type == "scoped_identifier":
                    collect_parts(c, parts)

        for child in node.children:
            if child.type == "scoped_identifier":
                # Build full path: com.example.MyClass
                segments: List[str] = []
                collect_parts(child, segments)
                module_name = ".".join(segments)
            elif child.type == "identifier":
                # Single-segment name with no package qualifier
                module_name = self._get_node_text(child)
            elif child.type == "asterisk":
                imported_names.append("*")

        if not module_name:
            return None

        return Import(
            module_name=module_name,
            imported_names=imported_names,
            line_number=line,
            language=Language.JAVA,
        )

    def find_enclosing_symbol(self, line_number: int) -> Optional[Symbol]:
        """Find the innermost symbol containing the given line.

        "Innermost" means the containing symbol that spans the fewest lines;
        on a tie the one found first in document order is returned.

        Args:
            line_number: 1-based line number

        Returns:
            The innermost Symbol containing the line, or None
        """
        containing = [s for s in self._symbols if s.contains_line(line_number)]
        return min(
            containing,
            key=lambda s: s.end_line - s.start_line,
            default=None,
        )

    def extract_imports(self) -> List[Import]:
        """Return all extracted imports.

        Returns:
            List of Import objects (a copy; mutating it does not affect
            the parser)
        """
        return list(self._imports)

    def find_all_symbols(self) -> List[Symbol]:
        """Return all extracted symbols.

        Returns:
            List of all Symbol objects (a copy; mutating it does not affect
            the parser)
        """
        return list(self._symbols)
554
+
555
+
556
def create_parser_for_language(language: Language) -> Optional[TreeSitterParser]:
    """Build a ready-to-use tree-sitter parser for one language.

    Args:
        language: The language to create a parser for

    Returns:
        An initialized TreeSitterParser, or None when tree-sitter is not
        installed, the language has no entry in ``_LANGUAGE_MODULES``, or
        its grammar could not be loaded
    """
    can_attempt = TREE_SITTER_AVAILABLE and language in _LANGUAGE_MODULES
    if not can_attempt:
        return None

    candidate = TreeSitterParser(language)
    candidate._init_parser()

    # No underlying parser means the grammar failed to load.
    return None if candidate._parser is None else candidate
578
+
579
+
580
# Hook the tree-sitter parsers into the parser registry
def _register_treesitter_parsers():
    """Register a TreeSitterParser factory for every usable language.

    Does nothing when tree-sitter is not installed. Python is left out
    because it uses the AST parser by default.
    """
    if not TREE_SITTER_AVAILABLE:
        return

    # Imported here rather than at module top.
    from roma_debug.parsers.registry import register_parser

    usable = TreeSitterParser.supported_languages()
    for lang in filter(lambda candidate: candidate != Language.PYTHON, usable):
        # ``l=lang`` binds the current language at definition time; a plain
        # closure would see only the loop's final value.
        register_parser(
            lang,
            TreeSitterParser,
            factory=lambda l=lang: TreeSitterParser(l),
        )


# Runs once at import time
_register_treesitter_parsers()