code2logic 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
code2logic/parsers.py ADDED
@@ -0,0 +1,908 @@
1
+ """
2
+ Code parsers for multiple languages.
3
+
4
+ Includes:
5
+ - TreeSitterParser: High-accuracy AST parsing using Tree-sitter
6
+ - UniversalParser: Fallback regex/AST parser for environments without Tree-sitter
7
+ """
8
+
9
+ import ast
10
+ import re
11
+ from typing import Optional, List
12
+
13
+ from .models import FunctionInfo, ClassInfo, TypeInfo, ModuleInfo
14
+ from .intent import EnhancedIntentGenerator
15
+
16
+ # Optional Tree-sitter imports
17
+ TREE_SITTER_AVAILABLE = False
18
+ try:
19
+ import tree_sitter_python as tspython
20
+ import tree_sitter_javascript as tsjavascript
21
+ from tree_sitter import Language, Parser
22
+ TREE_SITTER_AVAILABLE = True
23
+ except ImportError:
24
+ pass
25
+
26
+
27
+ class TreeSitterParser:
28
+ """
29
+ Parser using Tree-sitter for high-accuracy AST parsing.
30
+
31
+ Supports Python, JavaScript, and TypeScript with 99% accuracy.
32
+ Falls back gracefully if Tree-sitter libraries are not installed.
33
+
34
+ Example:
35
+ >>> parser = TreeSitterParser()
36
+ >>> if parser.is_available('python'):
37
+ ... module = parser.parse('main.py', content, 'python')
38
+ """
39
+
40
def __init__(self):
    """Create empty registries and fill them when Tree-sitter is importable.

    ``parsers`` and ``languages`` are dictionaries keyed by language name.
    They are left empty when the optional Tree-sitter imports failed, so
    membership tests against ``parsers`` are False for every language.
    """
    self.languages: dict = {}
    self.parsers: dict = {}
    self.intent_gen = EnhancedIntentGenerator()

    # Nothing to register without the optional dependency.
    if not TREE_SITTER_AVAILABLE:
        return
    self._init_parsers()
48
+
49
def _init_parsers(self):
    """Register a Tree-sitter language and parser for each supported grammar.

    Every grammar is set up independently, so a failure in one of them does
    not prevent the remaining grammars from being registered. Failures are
    reported on stderr and never raised. TypeScript uses its dedicated
    grammar when that can be imported and initialised; otherwise it reuses
    the JavaScript language/parser objects if those were registered.
    """
    import sys

    def register(name, language_factory) -> bool:
        # Build both objects before touching the registries so a failure
        # cannot leave a language registered without a matching parser.
        try:
            language = Language(language_factory())
            parser = Parser(language)
        except Exception as e:  # best-effort: report and carry on
            print(f"Tree-sitter init warning: {e}", file=sys.stderr)
            return False
        self.languages[name] = language
        self.parsers[name] = parser
        return True

    # Lambdas defer attribute access so it happens inside register()'s try.
    register('python', lambda: tspython.language())
    register('javascript', lambda: tsjavascript.language())

    typescript_ready = False
    try:
        import tree_sitter_typescript as tstypescript
    except ImportError:
        pass
    else:
        typescript_ready = register(
            'typescript', lambda: tstypescript.language_typescript()
        )

    # Fall back to the JavaScript grammar for TypeScript sources.
    if not typescript_ready and 'javascript' in self.parsers:
        self.languages['typescript'] = self.languages['javascript']
        self.parsers['typescript'] = self.parsers['javascript']
72
+
73
def is_available(self, language: str) -> bool:
    """Return True when a parser has been registered under ``language``."""
    registered = self.parsers
    return language in registered
76
+
77
+ @classmethod
78
+ def get_supported_languages(cls) -> List[str]:
79
+ """Get list of potentially supported languages."""
80
+ return ['python', 'javascript', 'typescript']
81
+
82
def parse(self, filepath: str, content: str, language: str) -> Optional[ModuleInfo]:
    """
    Build a ModuleInfo for one source file via Tree-sitter.

    Args:
        filepath: Path stored on the resulting ModuleInfo
        content: Source text; it is encoded as UTF-8 before parsing
        language: Key used to select the registered parser

    Returns:
        The extracted ModuleInfo, or None when no parser is registered for
        ``language`` or the language has no extraction routine
    """
    try:
        ts_parser = self.parsers[language]
    except KeyError:
        return None

    tree = ts_parser.parse(bytes(content, 'utf8'))

    if language == 'python':
        return self._parse_python(filepath, content, tree)
    if language in ('javascript', 'typescript'):
        return self._parse_js_ts(filepath, content, tree, language)
    return None
106
+
107
def _parse_python(self, filepath: str, content: str, tree) -> ModuleInfo:
    """Walk the top-level nodes of a Python syntax tree into a ModuleInfo.

    Collects imports, functions, classes (plain and decorated), UPPERCASE
    constants and the module docstring. Only a string expression that is
    the first non-comment statement counts as the docstring; every other
    expression statement is checked for a constant assignment, so constants
    are found whether or not the module has a docstring.

    Args:
        filepath: Path stored on the resulting ModuleInfo
        content: Source text the tree was parsed from
        tree: Tree-sitter tree; only ``tree.root_node`` is read

    Returns:
        ModuleInfo with imports capped at 20 entries, constants at 10 and
        the docstring at 100 characters
    """
    root = tree.root_node
    imports, classes, functions, constants, exports = [], [], [], [], []
    docstring = None
    seen_statement = False

    def add_function(func):
        if func:
            functions.append(func)
            if not func.name.startswith('_'):
                exports.append(func.name)

    def add_class(cls):
        if cls:
            classes.append(cls)
            if not cls.name.startswith('_'):
                exports.append(cls.name)

    for child in root.children:
        node_type = child.type
        if node_type == 'comment':
            # Comments may precede the docstring without displacing it.
            continue
        is_first_statement = not seen_statement
        seen_statement = True

        if node_type == 'expression_statement':
            expr = child.children[0] if child.children else None
            if is_first_statement and expr and expr.type == 'string':
                docstring = self._extract_string(expr, content)
            else:
                const = self._extract_py_constant(child, content)
                if const:
                    constants.append(const)

        elif node_type == 'import_statement':
            imports.extend(self._extract_py_import(child, content))
        elif node_type == 'import_from_statement':
            imports.extend(self._extract_py_from_import(child, content))

        elif node_type == 'function_definition':
            add_function(self._extract_py_function(child, content))

        elif node_type == 'decorated_definition':
            inner = self._find_child(child, 'function_definition')
            if inner:
                add_function(self._extract_py_function(inner, content, child))
            else:
                # A decorator may also wrap a class (e.g. @dataclass).
                inner_cls = self._find_child(child, 'class_definition')
                if inner_cls:
                    add_class(self._extract_py_class(inner_cls, content))

        elif node_type == 'class_definition':
            add_class(self._extract_py_class(child, content))

    lines = content.split('\n')
    return ModuleInfo(
        path=filepath,
        language='python',
        imports=imports[:20],
        exports=exports,
        classes=classes,
        functions=functions,
        types=[],
        constants=constants[:10],
        docstring=docstring[:100] if docstring else None,
        lines_total=len(lines),
        lines_code=len([l for l in lines if l.strip() and not l.strip().startswith('#')])
    )
174
+
175
def _extract_py_function(self, node, content: str,
                         decorated_node=None) -> Optional[FunctionInfo]:
    """Build a FunctionInfo from a Python ``function_definition`` node.

    Args:
        node: The function definition node
        content: Source text the node belongs to
        decorated_node: Enclosing ``decorated_definition`` node, if any;
            its ``decorator`` children provide the decorator names

    Returns:
        FunctionInfo, or None when the node has no ``identifier`` child.
        Parameters are capped at 8 and the docstring at 100 characters.
    """
    name_node = self._find_child(node, 'identifier')
    if not name_node:
        return None
    name = self._text(name_node, content)

    # Parameters: plain names, typed names ("x:int") and defaulted names.
    params = []
    params_node = self._find_child(node, 'parameters')
    if params_node:
        for child in params_node.children:
            if child.type == 'identifier':
                params.append(self._text(child, content))
            elif child.type in ('typed_parameter', 'typed_default_parameter'):
                n = self._find_child(child, 'identifier')
                t = self._find_child(child, 'type')
                if n:
                    p = self._text(n, content)
                    if t:
                        p += ':' + self._text(t, content)
                    params.append(p)
            elif child.type == 'default_parameter' and child.children:
                params.append(self._text(child.children[0], content))

    # Return type
    ret_node = self._find_child(node, 'type')
    return_type = self._text(ret_node, content) if ret_node else None

    # Docstring: a string expression as the first statement of the body.
    docstring = None
    body = self._find_child(node, 'block')
    if body and body.children:
        first = body.children[0]
        if first.type == 'expression_statement':
            expr = first.children[0] if first.children else None
            if expr and expr.type == 'string':
                docstring = self._extract_string(expr, content)

    # Decorators: name only, without "@" or call arguments.
    decorators = []
    if decorated_node:
        for c in decorated_node.children:
            if c.type == 'decorator':
                decorators.append(self._text(c, content).lstrip('@').split('(')[0])

    # tree-sitter-python marks coroutines with an `async` keyword child of
    # function_definition; the node-type check is kept for grammars that
    # use a dedicated node type instead.
    is_async = (
        node.type == 'async_function_definition'
        or any(c.type == 'async' for c in node.children)
    )

    return FunctionInfo(
        name=name,
        params=params[:8],
        return_type=return_type,
        docstring=docstring[:100] if docstring else None,
        calls=[],
        raises=[],
        complexity=1,
        lines=node.end_point[0] - node.start_point[0] + 1,
        decorators=decorators,
        is_async=is_async,
        is_static='staticmethod' in decorators,
        is_private=name.startswith('_') and not name.startswith('__'),
        intent=self.intent_gen.generate(name, docstring),
        start_line=node.start_point[0] + 1,
        end_line=node.end_point[0] + 1
    )
243
+
244
def _extract_py_class(self, node, content: str) -> Optional[ClassInfo]:
    """Build a ClassInfo from a Python ``class_definition`` node.

    Args:
        node: The class definition node
        content: Source text the node belongs to

    Returns:
        ClassInfo, or None when the node has no ``identifier`` child.
        ``is_abstract`` is True when a base is ``ABC``/``ABCMeta`` (plain or
        dotted, e.g. ``abc.ABC``) or when ``metaclass=ABCMeta`` is passed.
    """
    name_node = self._find_child(node, 'identifier')
    if not name_node:
        return None
    name = self._text(name_node, content)

    # Base classes, plus a check for an ABCMeta metaclass keyword.
    bases = []
    has_abc_metaclass = False
    arg_list = self._find_child(node, 'argument_list')
    if arg_list:
        for c in arg_list.children:
            if c.type in ('identifier', 'attribute'):
                bases.append(self._text(c, content))
            elif c.type == 'keyword_argument':
                key, _, value = self._text(c, content).partition('=')
                if (key.strip() == 'metaclass'
                        and value.strip().rsplit('.', 1)[-1] == 'ABCMeta'):
                    has_abc_metaclass = True

    # Docstring and methods
    docstring = None
    methods = []
    body = self._find_child(node, 'block')
    if body:
        for i, child in enumerate(body.children):
            if i == 0 and child.type == 'expression_statement':
                expr = child.children[0] if child.children else None
                if expr and expr.type == 'string':
                    docstring = self._extract_string(expr, content)

            if child.type == 'function_definition':
                m = self._extract_py_function(child, content)
                if m:
                    methods.append(m)
            elif child.type == 'decorated_definition':
                inner = self._find_child(child, 'function_definition')
                if inner:
                    m = self._extract_py_function(inner, content, child)
                    if m:
                        methods.append(m)

    # Compare on the last dotted segment so "abc.ABC" counts like "ABC".
    abstract_markers = ('ABC', 'ABCMeta')
    is_abstract = has_abc_metaclass or any(
        b.rsplit('.', 1)[-1] in abstract_markers for b in bases
    )

    return ClassInfo(
        name=name,
        bases=bases,
        docstring=docstring[:100] if docstring else None,
        methods=methods,
        properties=[],
        is_interface=False,
        is_abstract=is_abstract,
        generic_params=[]
    )
291
+
292
+ def _extract_py_import(self, node, content: str) -> List[str]:
293
+ """Extract import statement."""
294
+ imports = []
295
+ for c in node.children:
296
+ if c.type == 'dotted_name':
297
+ imports.append(self._text(c, content))
298
+ elif c.type == 'aliased_import':
299
+ n = self._find_child(c, 'dotted_name')
300
+ if n:
301
+ imports.append(self._text(n, content))
302
+ return imports
303
+
304
+ def _extract_py_from_import(self, node, content: str) -> List[str]:
305
+ """Extract from ... import ... statement."""
306
+ imports = []
307
+ module = None
308
+ for c in node.children:
309
+ if c.type in ('dotted_name', 'import_prefix'):
310
+ module = self._text(c, content)
311
+ if module:
312
+ for c in node.children:
313
+ if c.type == 'identifier':
314
+ imports.append(f"{module}.{self._text(c, content)}")
315
+ elif c.type == 'aliased_import':
316
+ n = self._find_child(c, 'identifier')
317
+ if n:
318
+ imports.append(f"{module}.{self._text(n, content)}")
319
+ return imports
320
+
321
+ def _extract_py_constant(self, node, content: str) -> Optional[str]:
322
+ """Extract constant (UPPERCASE assignment)."""
323
+ if node.children:
324
+ expr = node.children[0]
325
+ if expr.type == 'assignment':
326
+ left = expr.children[0] if expr.children else None
327
+ if left and left.type == 'identifier':
328
+ name = self._text(left, content)
329
+ if name.isupper():
330
+ return name
331
+ return None
332
+
333
def _parse_js_ts(self, filepath: str, content: str, tree, language: str) -> ModuleInfo:
    """Walk the top-level nodes of a JS/TS syntax tree into a ModuleInfo.

    Declarations are handled the same way whether they appear bare or
    inside an ``export`` statement: classes (including TypeScript abstract
    classes), functions, arrow functions bound in lexical declarations,
    UPPERCASE constants, interfaces, type aliases and enums. The first
    top-level comment that yields text becomes the module docstring.

    Names of classes, functions declared with ``function``, and types are
    always added to ``exports``; arrow functions are added only when they
    are explicitly exported. ``exports`` is de-duplicated in first-seen
    order so the result does not depend on the hash seed.

    Args:
        filepath: Path stored on the resulting ModuleInfo
        content: Source text the tree was parsed from
        tree: Tree-sitter tree; only ``tree.root_node`` is read
        language: Language label stored on the resulting ModuleInfo

    Returns:
        ModuleInfo with imports capped at 20 entries, constants at 10 and
        the docstring at 100 characters
    """
    root = tree.root_node
    imports, classes, functions, types, constants, exports = [], [], [], [], [], []
    docstring = None

    def collect(decl, exported: bool) -> None:
        """Record one declaration node in the accumulators above."""
        kind = decl.type
        if kind in ('class_declaration', 'abstract_class_declaration'):
            cls = self._extract_js_class(decl, content)
            if cls:
                classes.append(cls)
                exports.append(cls.name)
        elif kind == 'function_declaration':
            func = self._extract_js_function(decl, content)
            if func:
                functions.append(func)
                exports.append(func.name)
        elif kind == 'lexical_declaration':
            func = self._extract_js_arrow_fn(decl, content)
            if func:
                functions.append(func)
                if exported:
                    exports.append(func.name)
            const = self._extract_js_constant(decl, content)
            if const:
                constants.append(const)
        elif kind in ('interface_declaration', 'type_alias_declaration'):
            t = self._extract_ts_type(decl, content)
            if t:
                types.append(t)
                exports.append(t.name)
        elif kind == 'enum_declaration':
            t = self._extract_ts_enum(decl, content)
            if t:
                types.append(t)
                exports.append(t.name)

    for child in root.children:
        node_type = child.type

        if node_type == 'import_statement':
            # The string child is the module specifier.
            for c in child.children:
                if c.type == 'string':
                    imports.append(self._text(c, content).strip('"\''))
        elif node_type == 'export_statement':
            for c in child.children:
                collect(c, exported=True)
        elif node_type == 'comment':
            if not docstring:
                docstring = self._extract_js_comment(child, content)
        else:
            collect(child, exported=False)

    lines = content.split('\n')
    return ModuleInfo(
        path=filepath,
        language=language,
        imports=imports[:20],
        exports=list(dict.fromkeys(exports)),
        classes=classes,
        functions=functions,
        types=types,
        constants=constants[:10],
        docstring=docstring[:100] if docstring else None,
        lines_total=len(lines),
        lines_code=len([l for l in lines if l.strip() and not l.strip().startswith('//')])
    )
419
+
420
def _extract_js_class(self, node, content: str) -> Optional[ClassInfo]:
    """Build a ClassInfo from a JS/TS class declaration node.

    Args:
        node: The class declaration node
        content: Source text the node belongs to

    Returns:
        ClassInfo, or None when no ``type_identifier``/``identifier`` child
        names the class. ``bases`` holds identifiers found directly in the
        heritage clause or inside a nested ``extends_clause``.
    """
    name_node = self._find_child(node, 'type_identifier') or self._find_child(node, 'identifier')
    if not name_node:
        return None
    name = self._text(name_node, content)

    # Base classes. The JavaScript grammar places the identifier directly
    # under class_heritage; the TypeScript grammar is expected to wrap it
    # in an extends_clause, so both shapes are accepted.
    bases = []
    heritage = self._find_child(node, 'class_heritage')
    if heritage:
        for c in heritage.children:
            if c.type == 'identifier':
                bases.append(self._text(c, content))
            elif c.type == 'extends_clause':
                bases.extend(
                    self._text(g, content)
                    for g in c.children if g.type == 'identifier'
                )

    # Methods
    methods = []
    body = self._find_child(node, 'class_body')
    if body:
        for c in body.children:
            if c.type == 'method_definition':
                m = self._extract_js_method(c, content)
                if m:
                    methods.append(m)

    # Decide from syntax rather than a substring search, which also fired
    # on names or bases that merely contain the word "abstract".
    is_abstract = (
        node.type == 'abstract_class_declaration'
        or any(c.type == 'abstract' for c in node.children)
    )

    return ClassInfo(
        name=name,
        bases=bases,
        docstring=None,
        methods=methods,
        properties=[],
        is_interface=False,
        is_abstract=is_abstract,
        generic_params=[]
    )
455
+
456
def _extract_js_method(self, node, content: str) -> Optional[FunctionInfo]:
    """Build a FunctionInfo from a JS/TS ``method_definition`` node.

    Args:
        node: The method definition node
        content: Source text the node belongs to

    Returns:
        FunctionInfo, or None when the method has no
        ``property_identifier``/``private_property_identifier`` child.
        ``is_async``/``is_static`` reflect modifier tokens that appear
        before the method name.
    """
    name_node = (self._find_child(node, 'property_identifier')
                 or self._find_child(node, 'private_property_identifier'))
    if not name_node:
        return None
    name = self._text(name_node, content)

    # Collect the token types preceding the name. Splitting the source
    # text on the name is unreliable when the name is a substring of a
    # modifier (e.g. a method called "at" after "static").
    modifiers = set()
    for c in node.children:
        if c.start_byte >= name_node.start_byte:
            break
        modifiers.add(c.type)
    is_async = 'async' in modifiers
    is_static = 'static' in modifiers or 'static get' in modifiers

    # Parameters
    params = []
    params_node = self._find_child(node, 'formal_parameters')
    if params_node:
        params = self._extract_js_params(params_node, content)

    # Return type (TypeScript): annotation text without the leading colon.
    return_type = None
    type_ann = self._find_child(node, 'type_annotation')
    if type_ann:
        return_type = self._text(type_ann, content).lstrip(':').strip()

    return FunctionInfo(
        name=name,
        params=params[:8],
        return_type=return_type,
        docstring=None,
        calls=[],
        raises=[],
        complexity=1,
        lines=node.end_point[0] - node.start_point[0] + 1,
        decorators=[],
        is_async=is_async,
        is_static=is_static,
        is_private=name.startswith('_') or name.startswith('#'),
        intent=self.intent_gen.generate(name),
        start_line=node.start_point[0] + 1,
        end_line=node.end_point[0] + 1
    )
496
+
497
def _extract_js_function(self, node, content: str) -> Optional[FunctionInfo]:
    """Build a FunctionInfo from a JS/TS ``function_declaration`` node.

    Returns None when the node has no ``identifier`` child. The function
    counts as async when the first 50 characters of its source text, once
    stripped, start with ``async``.
    """
    ident = self._find_child(node, 'identifier')
    if not ident:
        return None
    name = self._text(ident, content)

    head = self._text(node, content)[:50]
    is_async = head.strip().startswith('async')

    formal = self._find_child(node, 'formal_parameters')
    params = self._extract_js_params(formal, content) if formal else []

    annotation = self._find_child(node, 'type_annotation')
    if annotation:
        # Annotation text includes the leading colon; drop it.
        return_type = self._text(annotation, content).lstrip(':').strip()
    else:
        return_type = None

    first_row = node.start_point[0]
    last_row = node.end_point[0]
    return FunctionInfo(
        name=name,
        params=params[:8],
        return_type=return_type,
        docstring=None,
        calls=[],
        raises=[],
        complexity=1,
        lines=last_row - first_row + 1,
        decorators=[],
        is_async=is_async,
        is_static=False,
        is_private=name.startswith('_'),
        intent=self.intent_gen.generate(name),
        start_line=first_row + 1,
        end_line=last_row + 1
    )
532
+
533
def _extract_js_arrow_fn(self, node, content: str) -> Optional[FunctionInfo]:
    """Build a FunctionInfo for the first arrow function bound in a declaration.

    Args:
        node: A lexical declaration node (e.g. ``const f = () => ...``)
        content: Source text the node belongs to

    Returns:
        FunctionInfo for the first ``variable_declarator`` that has both an
        ``identifier`` and an ``arrow_function`` child, or None when there
        is no such declarator. Line numbers span the whole declaration.
    """
    for c in node.children:
        if c.type != 'variable_declarator':
            continue
        name_node = self._find_child(c, 'identifier')
        arrow = self._find_child(c, 'arrow_function')
        if not (name_node and arrow):
            continue

        name = self._text(name_node, content)
        # Use the keyword token; a substring search over the source also
        # matched parameters such as "asyncFlag".
        is_async = any(part.type == 'async' for part in arrow.children)

        params = []
        pn = self._find_child(arrow, 'formal_parameters')
        if pn:
            params = self._extract_js_params(pn, content)
        else:
            # `x => ...`: a single bare parameter has no parameter list.
            single = self._find_child(arrow, 'identifier')
            if single:
                params = [self._text(single, content)]

        return FunctionInfo(
            name=name,
            params=params[:8],
            return_type=None,
            docstring=None,
            calls=[],
            raises=[],
            complexity=1,
            lines=node.end_point[0] - node.start_point[0] + 1,
            decorators=[],
            is_async=is_async,
            is_static=False,
            is_private=name.startswith('_'),
            intent=self.intent_gen.generate(name),
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1
        )
    return None
564
+
565
+ def _extract_js_params(self, params_node, content: str) -> List[str]:
566
+ """Extract JS/TS function parameters."""
567
+ params = []
568
+ for c in params_node.children:
569
+ if c.type == 'identifier':
570
+ params.append(self._text(c, content))
571
+ elif c.type == 'required_parameter':
572
+ n = self._find_child(c, 'identifier')
573
+ t = self._find_child(c, 'type_annotation')
574
+ if n:
575
+ p = self._text(n, content)
576
+ if t:
577
+ p += self._text(t, content)
578
+ params.append(p)
579
+ elif c.type == 'optional_parameter':
580
+ n = self._find_child(c, 'identifier')
581
+ if n:
582
+ params.append(self._text(n, content) + '?')
583
+ return params
584
+
585
def _extract_ts_type(self, node, content: str) -> Optional[TypeInfo]:
    """Build a TypeInfo for a TypeScript interface or type-alias node.

    Returns None when the node has neither a ``type_identifier`` nor an
    ``identifier`` child. ``definition`` is the first 100 characters of the
    node's source text.
    """
    ident = self._find_child(node, 'type_identifier')
    if not ident:
        ident = self._find_child(node, 'identifier')
    if not ident:
        return None

    if node.type == 'interface_declaration':
        kind = 'interface'
    else:
        kind = 'type'

    source = self._text(node, content)
    return TypeInfo(
        name=self._text(ident, content),
        kind=kind,
        definition=source[:100],
    )
593
+
594
def _extract_ts_enum(self, node, content: str) -> Optional[TypeInfo]:
    """Build a TypeInfo of kind ``'enum'`` for a TypeScript enum node.

    Returns None when the node has no ``identifier`` child. The enum body
    is not captured; ``definition`` is always the empty string.
    """
    ident = self._find_child(node, 'identifier')
    if ident:
        return TypeInfo(name=self._text(ident, content), kind='enum', definition='')
    return None
601
+
602
+ def _extract_js_constant(self, node, content: str) -> Optional[str]:
603
+ """Extract constant (UPPERCASE const)."""
604
+ for c in node.children:
605
+ if c.type == 'variable_declarator':
606
+ n = self._find_child(c, 'identifier')
607
+ if n:
608
+ name = self._text(n, content)
609
+ if name.isupper():
610
+ return name
611
+ return None
612
+
613
+ def _extract_js_comment(self, node, content: str) -> Optional[str]:
614
+ """Extract JS comment content."""
615
+ text = self._text(node, content)
616
+ if text.startswith('/**'):
617
+ lines = text[3:-2].split('\n')
618
+ clean = [l.strip().lstrip('*').strip() for l in lines
619
+ if l.strip().lstrip('*').strip() and not l.strip().startswith('@')]
620
+ return ' '.join(clean)[:100] if clean else None
621
+ elif text.startswith('//'):
622
+ return text[2:].strip()[:100]
623
+ return None
624
+
625
+ # Helper methods
626
+ def _find_child(self, node, type_name: str):
627
+ """Find first child of given type."""
628
+ for c in node.children:
629
+ if c.type == type_name:
630
+ return c
631
+ return None
632
+
633
+ def _text(self, node, content: str) -> str:
634
+ """Get text content of node."""
635
+ return content[node.start_byte:node.end_byte]
636
+
637
+ def _extract_string(self, node, content: str) -> str:
638
+ """Extract string content without quotes."""
639
+ text = self._text(node, content)
640
+ if text.startswith('"""') or text.startswith("'''"):
641
+ return text[3:-3].strip()
642
+ elif text.startswith('"') or text.startswith("'"):
643
+ return text[1:-1].strip()
644
+ return text
645
+
646
+
647
+ class UniversalParser:
648
+ """
649
+ Fallback parser using Python AST and regex.
650
+
651
+ Used when Tree-sitter is not available. Provides reasonable
652
+ accuracy for Python (using built-in AST) and basic support
653
+ for JavaScript/TypeScript using regex patterns.
654
+
655
+ Example:
656
+ >>> parser = UniversalParser()
657
+ >>> module = parser.parse('main.py', content, 'python')
658
+ """
659
+
660
def __init__(self):
    """Create the intent generator used to describe extracted functions."""
    intent_generator = EnhancedIntentGenerator()
    self.intent_gen = intent_generator
663
+
664
def parse(self, filepath: str, content: str, language: str) -> Optional[ModuleInfo]:
    """
    Build a ModuleInfo for one source file without Tree-sitter.

    Args:
        filepath: Path stored on the resulting ModuleInfo
        content: Source text of the file
        language: 'python' selects the ``ast``-based parser;
            'javascript' and 'typescript' select the regex-based parser

    Returns:
        The extracted ModuleInfo, or None for any other language
    """
    if language == 'python':
        return self._parse_python(filepath, content)
    if language not in ('javascript', 'typescript'):
        return None
    return self._parse_js_ts(filepath, content, language)
681
+
682
def _parse_python(self, filepath: str, content: str) -> Optional[ModuleInfo]:
    """Parse Python source with the built-in ``ast`` module.

    Only top-level statements are inspected. UPPERCASE names bound by plain
    or annotated assignments (``MAX = 1``, ``MAX: int = 1``) are recorded
    as constants. Public classes and functions (names not starting with an
    underscore) are listed as exports, classes first.

    Args:
        filepath: Path stored on the resulting ModuleInfo
        content: Source text of the file

    Returns:
        ModuleInfo. When the source cannot be parsed, an otherwise empty
        ModuleInfo carrying only the line counts is returned.
    """
    lines = content.split('\n')
    try:
        tree = ast.parse(content)
    except (SyntaxError, ValueError):
        # ValueError: some Python versions reject null bytes this way.
        return ModuleInfo(
            path=filepath, language='python', imports=[], exports=[],
            classes=[], functions=[], types=[], constants=[], docstring=None,
            lines_total=len(lines), lines_code=len([l for l in lines if l.strip()])
        )

    imports, classes, functions, constants = [], [], [], []

    for node in ast.iter_child_nodes(tree):
        if isinstance(node, ast.Import):
            imports.extend(a.name for a in node.names)
        elif isinstance(node, ast.ImportFrom):
            module = node.module or ''
            imports.extend(f"{module}.{a.name}" for a in node.names if a.name != '*')
        elif isinstance(node, ast.ClassDef):
            cls = self._extract_ast_class(node)
            if cls:
                classes.append(cls)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func = self._extract_ast_function(node)
            if func:
                functions.append(func)
        elif isinstance(node, ast.Assign):
            for t in node.targets:
                if isinstance(t, ast.Name) and t.id.isupper():
                    constants.append(t.id)
        elif isinstance(node, ast.AnnAssign):
            # A bare annotation without a value binds nothing.
            t = node.target
            if node.value is not None and isinstance(t, ast.Name) and t.id.isupper():
                constants.append(t.id)

    exports = [c.name for c in classes if not c.name.startswith('_')]
    exports += [f.name for f in functions if not f.name.startswith('_')]
    module_doc = ast.get_docstring(tree)

    return ModuleInfo(
        path=filepath,
        language='python',
        imports=imports[:20],
        exports=exports,
        classes=classes,
        functions=functions,
        types=[],
        constants=constants[:10],
        docstring=module_doc[:100] if module_doc else None,
        lines_total=len(lines),
        lines_code=len([l for l in lines if l.strip() and not l.strip().startswith('#')])
    )
732
+
733
def _extract_ast_function(self, node) -> FunctionInfo:
    """Build a FunctionInfo from an ``ast.FunctionDef``/``AsyncFunctionDef``.

    Parameters come from ``node.args.args`` only (capped at 8) and are
    rendered as ``name`` or ``name:annotation``. Decorators are recorded
    by name, without call arguments; dotted decorators such as
    ``functools.wraps`` keep their full dotted path. Decorator expressions
    that are not a name or attribute chain are skipped.
    """
    def dotted_name(expr):
        # Unwind Attribute(Attribute(Name)) chains into "a.b.c".
        parts = []
        while isinstance(expr, ast.Attribute):
            parts.append(expr.attr)
            expr = expr.value
        if not isinstance(expr, ast.Name):
            return None
        parts.append(expr.id)
        return '.'.join(reversed(parts))

    is_async = isinstance(node, ast.AsyncFunctionDef)

    params = []
    for arg in node.args.args:
        p = arg.arg
        if arg.annotation:
            p += ':' + self._ann_str(arg.annotation)
        params.append(p)

    decorators = []
    for dec in node.decorator_list:
        target = dec.func if isinstance(dec, ast.Call) else dec
        dec_name = dotted_name(target)
        if dec_name:
            decorators.append(dec_name)

    # end_lineno may be missing or None; fall back to a one-line span.
    end_line = getattr(node, 'end_lineno', None) or node.lineno

    docstring = ast.get_docstring(node)
    return FunctionInfo(
        name=node.name,
        params=params[:8],
        return_type=self._ann_str(node.returns) if node.returns else None,
        docstring=docstring[:100] if docstring else None,
        calls=[],
        raises=[],
        complexity=1,
        lines=end_line - node.lineno + 1,
        decorators=decorators,
        is_async=is_async,
        is_static='staticmethod' in decorators,
        is_private=node.name.startswith('_') and not node.name.startswith('__'),
        intent=self.intent_gen.generate(node.name, docstring),
        start_line=node.lineno,
        end_line=end_line
    )
768
+
769
def _extract_ast_class(self, node: ast.ClassDef) -> ClassInfo:
    """Build a ClassInfo from an ``ast.ClassDef`` node.

    Bases are recorded by their final name (``abc.ABC`` becomes ``ABC``);
    base expressions that are neither a name nor an attribute are skipped.
    Only functions defined directly in the class body become methods.
    ``is_abstract`` is True when a base is ``ABC`` or ``ABCMeta`` or when
    the class is declared with ``metaclass=ABCMeta``.
    """
    def simple_name(expr):
        if isinstance(expr, ast.Name):
            return expr.id
        if isinstance(expr, ast.Attribute):
            return expr.attr
        return None

    bases = [
        base_name
        for base_name in (simple_name(b) for b in node.bases)
        if base_name is not None
    ]

    methods = [
        self._extract_ast_function(item)
        for item in node.body
        if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]

    uses_abc_metaclass = any(
        kw.arg == 'metaclass' and simple_name(kw.value) == 'ABCMeta'
        for kw in node.keywords
    )
    class_doc = ast.get_docstring(node)

    return ClassInfo(
        name=node.name,
        bases=bases,
        docstring=class_doc[:100] if class_doc else None,
        methods=methods,
        properties=[],
        is_interface=False,
        is_abstract='ABC' in bases or 'ABCMeta' in bases or uses_abc_metaclass,
        generic_params=[]
    )
793
+
794
+ def _ann_str(self, node) -> str:
795
+ """Convert AST annotation to string."""
796
+ if isinstance(node, ast.Name):
797
+ return node.id
798
+ elif isinstance(node, ast.Constant):
799
+ return str(node.value)
800
+ elif isinstance(node, ast.Subscript):
801
+ base = self._ann_str(node.value)
802
+ if isinstance(node.slice, ast.Tuple):
803
+ args = ','.join(self._ann_str(e) for e in node.slice.elts)
804
+ else:
805
+ args = self._ann_str(node.slice)
806
+ return f"{base}[{args}]"
807
+ return "Any"
808
+
809
def _parse_js_ts(self, filepath: str, content: str, language: str) -> ModuleInfo:
    """Parse JS/TS source with regular expressions.

    This is a heuristic scan of the raw text: it does not skip comments or
    string literals and does not look inside class bodies. Every class,
    function, arrow function and type name found is listed in ``exports``
    (de-duplicated in first-seen order). Declaration keywords are only
    recognised when they are not the tail of a longer identifier.

    Args:
        filepath: Path stored on the resulting ModuleInfo
        content: Source text of the file
        language: Language label stored on the resulting ModuleInfo

    Returns:
        ModuleInfo with imports capped at 20 entries and constants at 10;
        functions carry at most 8 parameters and no line information
    """
    imports, classes, functions, types, constants, exports = [], [], [], [], [], []

    # Keeps keywords from matching inside identifiers ("subclass Foo").
    boundary = r'(?<![\w$])'

    def split_params(raw):
        return [p.strip() for p in (raw or '').split(',') if p.strip()][:8]

    def add_function(name, params, return_type, is_async):
        functions.append(FunctionInfo(
            name=name,
            params=params,
            return_type=return_type,
            docstring=None,
            calls=[],
            raises=[],
            complexity=1,
            lines=1,
            decorators=[],
            is_async=is_async,
            is_static=False,
            is_private=name.startswith('_'),
            intent=self.intent_gen.generate(name)
        ))
        exports.append(name)

    # Import patterns
    for m in re.finditer(r"import\s+.*?from\s+['\"]([^'\"]+)['\"]", content):
        imports.append(m.group(1))

    # Class patterns
    for m in re.finditer(
        boundary + r'(?:export\s+)?(?P<abstract>abstract\s+)?class\s+(?P<name>\w+)'
        r'(?:\s+extends\s+(?P<base>\w+))?',
        content
    ):
        classes.append(ClassInfo(
            name=m.group('name'),
            bases=[m.group('base')] if m.group('base') else [],
            docstring=None,
            methods=[],
            properties=[],
            is_interface=False,
            # Use the captured keyword, not a substring of the whole match.
            is_abstract=m.group('abstract') is not None,
            generic_params=[]
        ))
        exports.append(m.group('name'))

    # Function patterns
    for m in re.finditer(
        boundary + r'(?:export\s+)?(?P<async>async\s+)?function\s+(?P<name>\w+)'
        r'\s*\((?P<params>[^)]*)\)(?:\s*:\s*(?P<ret>[^{]+))?',
        content
    ):
        add_function(
            m.group('name'),
            split_params(m.group('params')),
            m.group('ret').strip() if m.group('ret') else None,
            m.group('async') is not None,
        )

    # Arrow function patterns
    for m in re.finditer(
        boundary + r'(?:export\s+)?const\s+(?P<name>\w+)\s*=\s*(?P<async>async\s+)?'
        r'\((?P<params>[^)]*)\)\s*(?::\s*[^=]+)?\s*=>',
        content
    ):
        add_function(
            m.group('name'),
            split_params(m.group('params')),
            None,
            m.group('async') is not None,
        )

    # Interface/Type patterns
    for m in re.finditer(
        boundary + r'(?:export\s+)?(?P<kind>interface|type)\s+(?P<name>\w+)',
        content
    ):
        types.append(TypeInfo(name=m.group('name'), kind=m.group('kind'), definition=''))
        exports.append(m.group('name'))

    # Constant patterns
    for m in re.finditer(boundary + r'const\s+([A-Z][A-Z0-9_]+)\s*=', content):
        constants.append(m.group(1))

    lines = content.split('\n')
    return ModuleInfo(
        path=filepath,
        language=language,
        imports=imports[:20],
        # dict.fromkeys de-duplicates while keeping a stable order.
        exports=list(dict.fromkeys(exports)),
        classes=classes,
        functions=functions,
        types=types,
        constants=constants[:10],
        docstring=None,
        lines_total=len(lines),
        lines_code=len([l for l in lines if l.strip() and not l.strip().startswith('//')])
    )
904
+
905
+
906
def is_tree_sitter_available() -> bool:
    """Report whether the optional Tree-sitter imports succeeded.

    Returns the module-level ``TREE_SITTER_AVAILABLE`` flag, which is set
    once at import time.
    """
    available = TREE_SITTER_AVAILABLE
    return available