codebrain 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. codebrain/__init__.py +3 -0
  2. codebrain/__main__.py +6 -0
  3. codebrain/agent_bridge.py +162 -0
  4. codebrain/analyzer.py +943 -0
  5. codebrain/api.py +578 -0
  6. codebrain/api_models.py +102 -0
  7. codebrain/cli.py +1927 -0
  8. codebrain/comprehension.py +1939 -0
  9. codebrain/config.py +46 -0
  10. codebrain/context.py +276 -0
  11. codebrain/export.py +334 -0
  12. codebrain/graph/__init__.py +0 -0
  13. codebrain/graph/query.py +656 -0
  14. codebrain/graph/schema.py +113 -0
  15. codebrain/graph/store.py +295 -0
  16. codebrain/hook_runner.py +71 -0
  17. codebrain/hooks.py +107 -0
  18. codebrain/indexer.py +450 -0
  19. codebrain/llm.py +676 -0
  20. codebrain/logging.py +42 -0
  21. codebrain/mcp_server.py +1635 -0
  22. codebrain/memory/__init__.py +5 -0
  23. codebrain/memory/store.py +270 -0
  24. codebrain/parser/__init__.py +0 -0
  25. codebrain/parser/base.py +27 -0
  26. codebrain/parser/config_parser.py +228 -0
  27. codebrain/parser/models.py +44 -0
  28. codebrain/parser/python_parser.py +658 -0
  29. codebrain/parser/registry.py +144 -0
  30. codebrain/parser/typescript_parser.py +1189 -0
  31. codebrain/parser/typescript_treesitter.py +535 -0
  32. codebrain/py.typed +0 -0
  33. codebrain/resolver.py +171 -0
  34. codebrain/settings.py +88 -0
  35. codebrain/utils.py +59 -0
  36. codebrain/validator.py +563 -0
  37. codebrain/watcher/__init__.py +0 -0
  38. codebrain/watcher/file_watcher.py +173 -0
  39. codebrain-0.1.0.dist-info/METADATA +360 -0
  40. codebrain-0.1.0.dist-info/RECORD +44 -0
  41. codebrain-0.1.0.dist-info/WHEEL +5 -0
  42. codebrain-0.1.0.dist-info/entry_points.txt +6 -0
  43. codebrain-0.1.0.dist-info/licenses/LICENSE +21 -0
  44. codebrain-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,658 @@
1
+ """Python AST visitor that extracts structural nodes and edges."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ import json
7
+ from pathlib import Path
8
+
9
+ from codebrain.parser.base import BaseParser
10
+ from codebrain.parser.models import ParsedEdge, ParsedFile, ParsedNode
11
+ from codebrain.utils import content_hash
12
+
13
+
14
def _get_docstring(node: ast.AST) -> str:
    """Return the summary line of *node*'s docstring, or ``''`` if it has none.

    Only the first line of the docstring is kept; it is stripped of
    surrounding whitespace and capped at 200 characters.
    """
    doc = ast.get_docstring(node)
    if not doc:
        return ""
    # Everything before the first newline is the summary line.
    summary, _, _ = doc.partition("\n")
    return summary.strip()[:200]
21
+
22
+
23
def _get_decorator_names(node: ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef) -> list[str]:
    """Return the source-text name of every decorator applied to *node*.

    For a decorator that is a call (``@app.route("/x")``) only the callee is
    reported (``"app.route"``); arguments are dropped. Any other decorator
    expression is rendered with ``ast.unparse``.
    """

    def _name_of(decorator: ast.expr) -> str:
        # A called decorator is identified by what is being called.
        target = decorator.func if isinstance(decorator, ast.Call) else decorator
        return ast.unparse(target)

    return [_name_of(decorator) for decorator in node.decorator_list]
41
+
42
+
43
def _build_signature(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
    """Build a human-readable signature string for a function definition.

    The result has the form ``(params) -> return`` (the return part is
    omitted when the function has no return annotation). Parameters are
    rendered as ``name``, ``name: annotation`` and/or ``name = default``,
    with ``/`` and ``*`` separators inserted where the definition has
    positional-only or keyword-only parameters.

    Args:
        node: The function (or async function) definition to describe.

    Returns:
        The signature text, e.g. ``"(a, b = 1, /, *, flag: bool = False) -> int"``.
    """
    args = node.args

    def _render(arg: ast.arg, default: ast.expr | None = None, prefix: str = "") -> str:
        text = f"{prefix}{arg.arg}"
        if arg.annotation is not None:
            text += f": {ast.unparse(arg.annotation)}"
        if default is not None:
            text += f" = {ast.unparse(default)}"
        return text

    params: list[str] = []

    # ``args.defaults`` is right-aligned over positional-only AND regular
    # positional parameters together, so both groups are walked as one list;
    # otherwise defaults of positional-only parameters would be lost.
    positional = [*args.posonlyargs, *args.args]
    first_default = len(positional) - len(args.defaults)
    last_posonly = len(args.posonlyargs) - 1
    for idx, arg in enumerate(positional):
        default = args.defaults[idx - first_default] if idx >= first_default else None
        params.append(_render(arg, default))
        if idx == last_posonly:
            params.append("/")

    # *args, or a bare "*" when keyword-only parameters follow without it.
    if args.vararg:
        params.append(_render(args.vararg, prefix="*"))
    elif args.kwonlyargs:
        params.append("*")

    # Keyword-only parameters: kw_defaults is index-aligned, None = no default.
    for arg, default in zip(args.kwonlyargs, args.kw_defaults):
        params.append(_render(arg, default))

    # **kwargs
    if args.kwarg:
        params.append(_render(args.kwarg, prefix="**"))

    sig = f"({', '.join(params)})"
    if node.returns:
        sig += f" -> {ast.unparse(node.returns)}"
    return sig
98
+
99
+
100
def _call_target_name(node: ast.Call) -> str | None:
    """Return the dotted name being called, or ``None`` if it has no simple name.

    Plain names (``foo()``) and attribute chains (``a.b.c()``) are supported;
    any other callee expression (a subscript, another call, a lambda, ...)
    yields ``None``.
    """
    callee = node.func
    if isinstance(callee, (ast.Name, ast.Attribute)):
        return ast.unparse(callee)
    return None
108
+
109
+
110
# Decorator attribute names treated as API route registrations.
_ROUTE_METHODS = frozenset({"get", "post", "put", "delete", "patch", "head", "options", "route"})


def _extract_route_info(decorators: list[ast.expr]) -> str | None:
    """Return ``"<METHOD> <path>"`` for the first route-style decorator found.

    A decorator qualifies when it is a call on an attribute whose name is in
    ``_ROUTE_METHODS`` (e.g. ``@app.get('/path')`` or ``@router.post('/path')``)
    and its first positional argument is a string literal. ``None`` is
    returned when no decorator qualifies.
    """
    for decorator in decorators:
        if not (isinstance(decorator, ast.Call) and isinstance(decorator.func, ast.Attribute)):
            continue
        verb = decorator.func.attr
        if verb not in _ROUTE_METHODS or not decorator.args:
            continue
        path_arg = decorator.args[0]
        if isinstance(path_arg, ast.Constant) and isinstance(path_arg.value, str):
            return f"{verb.upper()} {path_arg.value}"
    return None
124
+
125
+
126
# Regex sources used to spot MongoDB collection access in raw source text.
# Each pattern captures the collection name in group 1.
_MONGO_PATTERNS = (
    # Subscript access: db["collection_name"] or db['collection_name']
    r"""(?:db|database|mongo_db|self\.db|self\.database)\s*\[\s*['"](\w+)['"]\s*\]""",
    # Attribute access not immediately followed by a call: db.collection_name
    r"""(?:db|database|mongo_db|self\.db|self\.database)\.(\w+)\b(?!\s*\()""",
    # Accessor call: get_collection("name") or collection("name")
    r"""(?:get_collection|collection)\s*\(\s*['"](\w+)['"]\s*\)""",
)

import re as _re

# Compiled once at import time, in the same order as _MONGO_PATTERNS.
_MONGO_RE = list(map(_re.compile, _MONGO_PATTERNS))
138
+
139
# Significant comment patterns: "# TAG: text" with tag and text on ONE line.
# Only spaces/tabs are allowed as padding — ``\s`` would also match newlines
# and let a bare "# TODO" swallow the following source line as its text.
_SIGNIFICANT_COMMENT = _re.compile(
    r"""#[ \t]*(TODO|FIXME|HACK|NOTE|WARNING|BUG|XXX|IMPORTANT|REFACTOR)\b[: \t]*(.+)""",
    _re.IGNORECASE,
)


def _extract_significant_comments(source: str, max_comments: int = 20) -> list[str]:
    """Extract TODO, FIXME, HACK, NOTE and other significant comments.

    The scan is purely textual, so a matching ``# TAG`` sequence inside a
    string literal is reported as well.

    Args:
        source: Full text of the file to scan.
        max_comments: Upper bound on the number of entries returned; a value
            of zero or less yields an empty list.

    Returns:
        Entries of the form ``"L<line> <TAG>: <text>"`` in file order, where
        the tag is upper-cased and the text is stripped and capped at 100
        characters. Tags with no text on the same line are skipped.
    """
    comments: list[str] = []
    if max_comments <= 0:
        return comments

    line = 1
    scanned_to = 0
    for m in _SIGNIFICANT_COMMENT.finditer(source):
        text = m.group(2).strip()[:100]
        if not text:
            continue
        # Count newlines incrementally instead of rescanning from offset 0.
        line += source.count("\n", scanned_to, m.start())
        scanned_to = m.start()
        comments.append(f"L{line} {m.group(1).upper()}: {text}")
        if len(comments) >= max_comments:
            break
    return comments
157
+
158
# Names that denote MongoDB operations/accessors rather than collections;
# a regex capture equal to one of these is discarded.
_MONGO_METHODS = frozenset({
    "find", "find_one", "insert_one", "insert_many", "update_one", "update_many",
    "delete_one", "delete_many", "aggregate", "count_documents", "create_index",
    "drop", "distinct", "bulk_write", "watch", "list_collection_names",
    "get_database", "get_collection", "client", "close", "command",
})


def _extract_mongo_collections(source: str) -> list[str]:
    """Return the sorted, de-duplicated collection names referenced in *source*.

    Every pattern in ``_MONGO_RE`` is applied to the raw text. A captured
    name is kept unless it is a known method name (``_MONGO_METHODS``) or
    starts with an underscore.
    """
    captured = (
        match.group(1)
        for regex in _MONGO_RE
        for match in regex.finditer(source)
    )
    names = {
        name
        for name in captured
        if name and name not in _MONGO_METHODS and not name.startswith("_")
    }
    return sorted(names)
176
+
177
+
178
class PythonVisitor(ast.NodeVisitor):
    """Walk a Python AST and collect structural nodes and edges.

    After ``visit(tree)`` the results are available in ``nodes`` (functions,
    methods, classes and module-level variables) and ``edges`` (CONTAINS,
    EXTENDS, IMPORTS, CALLS and DATAFLOW). Node ids have the form
    ``"<file_path>::<qualified_name>"``.
    """

    def __init__(self, file_path: str, module_name: str) -> None:
        self.file_path = file_path
        self.module_name = module_name
        self.nodes: list[ParsedNode] = []
        self.edges: list[ParsedEdge] = []
        self._scope_stack: list[str] = []  # names of enclosing classes/functions
        self._all_names: set[str] | None = None  # set once a literal __all__ is found
        self._current_class: str | None = None  # node id of the directly enclosing class

    # ------------------------------------------------------------------
    # Naming / bookkeeping helpers
    # ------------------------------------------------------------------
    def _make_id(self, qualified_name: str) -> str:
        """Return the node id for *qualified_name* within this file."""
        return f"{self.file_path}::{qualified_name}"

    def _resolve_call_target(self, name: str, class_node_id: str | None) -> str:
        """Resolve ``self.X`` to ``ClassName.X`` using the enclosing class context."""
        if class_node_id and name.startswith("self."):
            # Drop the "<file>::" prefix of the class node id, if present.
            class_qname = class_node_id.split("::", 1)[1] if "::" in class_node_id else class_node_id
            return f"{class_qname}.{name[5:]}"
        return name

    def _qualified_name(self, name: str) -> str:
        """Prefix *name* with the dotted path of the current scope, if any."""
        if self._scope_stack:
            return ".".join(self._scope_stack) + "." + name
        return name

    def _is_exported(self, name: str) -> bool:
        """Report whether a definition called *name* in the current scope is exported.

        Only top-level names can be exported. ``__all__`` lists module-level
        names, so a method that merely shares its name with an ``__all__``
        entry is not exported. Without ``__all__``, top-level names that do
        not start with an underscore are considered exported.
        """
        if self._scope_stack:
            return False
        if self._all_names is not None:
            return name in self._all_names
        return not name.startswith("_")

    def _container_id(self) -> str:
        """Return the node id of the current scope (the module when at top level)."""
        scope = ".".join(self._scope_stack) if self._scope_stack else self.module_name
        return self._make_id(scope)

    def _add_edge(self, source: str, target: str, edge_type: str, line: int) -> None:
        """Append one edge located in this file."""
        self.edges.append(ParsedEdge(
            source=source,
            target=target,
            type=edge_type,
            file_path=self.file_path,
            line=line,
        ))

    def _record_module_variable(self, name: str, node: ast.Assign | ast.AnnAssign) -> None:
        """Emit the variable node for a module-level assignment plus its CONTAINS edge."""
        qname = self._qualified_name(name)
        node_id = self._make_id(qname)
        self.nodes.append(ParsedNode(
            id=node_id,
            name=name,
            qualified_name=qname,
            type="variable",
            file_path=self.file_path,
            line_start=node.lineno,
            line_end=node.end_lineno or node.lineno,
            is_exported=self._is_exported(name),
        ))
        self._add_edge(self._make_id(self.module_name), node_id, "CONTAINS", node.lineno)

    # ------------------------------------------------------------------
    # Pre-scan for __all__
    # ------------------------------------------------------------------
    def _scan_all(self, tree: ast.Module) -> None:
        """Populate ``_all_names`` from literal ``__all__`` definitions at module level.

        Handles ``__all__ = [...]``, ``__all__: list[str] = [...]`` and
        ``__all__ += [...]`` where the right-hand side is a list or tuple
        display; only string-literal elements are collected. A ``+=`` is
        honoured only when an earlier literal ``__all__`` was seen, because
        extending an unknown base would under-report exports.
        """
        for stmt in ast.iter_child_nodes(tree):
            if isinstance(stmt, ast.Assign):
                targets, value, extends = stmt.targets, stmt.value, False
            elif isinstance(stmt, ast.AnnAssign):
                targets, value, extends = [stmt.target], stmt.value, False
            elif isinstance(stmt, ast.AugAssign) and isinstance(stmt.op, ast.Add):
                targets, value, extends = [stmt.target], stmt.value, True
            else:
                continue

            if not any(isinstance(t, ast.Name) and t.id == "__all__" for t in targets):
                continue
            if not isinstance(value, (ast.List, ast.Tuple)):
                continue

            names = {
                elt.value
                for elt in value.elts
                if isinstance(elt, ast.Constant) and isinstance(elt.value, str)
            }
            if not extends:
                self._all_names = names
            elif self._all_names is not None:
                self._all_names |= names

    # ------------------------------------------------------------------
    # Visitors
    # ------------------------------------------------------------------
    def visit_Module(self, node: ast.Module) -> None:
        # __all__ must be known before any definition is classified.
        self._scan_all(node)
        self.generic_visit(node)

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        self._handle_function(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        self._handle_function(node)

    def _handle_function(self, node: ast.FunctionDef | ast.AsyncFunctionDef) -> None:
        """Record a function or method node, its CONTAINS edge and its call edges."""
        qname = self._qualified_name(node.name)
        node_id = self._make_id(qname)
        enclosing_class = self._current_class

        decorators = _get_decorator_names(node)
        # API route decorators are surfaced as an extra pseudo-decorator entry.
        route_info = _extract_route_info(node.decorator_list)
        if route_info:
            decorators.append(f"endpoint:{route_info}")

        self.nodes.append(ParsedNode(
            id=node_id,
            name=node.name,
            qualified_name=qname,
            type="method" if enclosing_class else "function",
            file_path=self.file_path,
            line_start=node.lineno,
            line_end=node.end_lineno or node.lineno,
            signature=_build_signature(node),
            decorators=decorators,
            docstring=_get_docstring(node),
            is_exported=self._is_exported(node.name),
        ))

        # The parent is the directly enclosing class, otherwise the file node.
        parent_id = enclosing_class or self._make_id(self.module_name)
        self._add_edge(parent_id, node_id, "CONTAINS", node.lineno)

        # Walk the body for calls and nested definitions. Functions nested in
        # this one are not methods, so the class context is cleared while
        # descending and restored afterwards.
        self._current_class = None
        self._scope_stack.append(node.name)
        self._visit_body_for_calls(node, class_node_id=enclosing_class)
        self.generic_visit(node)
        self._scope_stack.pop()
        self._current_class = enclosing_class

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        qname = self._qualified_name(node.name)
        node_id = self._make_id(qname)

        self.nodes.append(ParsedNode(
            id=node_id,
            name=node.name,
            qualified_name=qname,
            type="class",
            file_path=self.file_path,
            line_start=node.lineno,
            line_end=node.end_lineno or node.lineno,
            decorators=_get_decorator_names(node),
            docstring=_get_docstring(node),
            is_exported=self._is_exported(node.name),
        ))

        # Classes are always attached to the file node, even when nested.
        self._add_edge(self._make_id(self.module_name), node_id, "CONTAINS", node.lineno)

        # One EXTENDS edge per base class; the target is the base's source text.
        for base in node.bases:
            self._add_edge(node_id, ast.unparse(base), "EXTENDS", node.lineno)

        # Visit children with this class as the current context.
        outer_class = self._current_class
        self._current_class = node_id
        self._scope_stack.append(node.name)
        self.generic_visit(node)
        self._scope_stack.pop()
        self._current_class = outer_class

    def visit_Import(self, node: ast.Import) -> None:
        container_id = self._container_id()
        for alias in node.names:
            self._add_edge(container_id, alias.name, "IMPORTS", node.lineno)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        container_id = self._container_id()
        # NOTE: the relative-import level (leading dots) is not encoded in the target.
        module = node.module or ""
        for alias in node.names:
            if alias.name == "*":
                # Star import: depend on the module itself so dependency
                # tracking knows this file relies on the entire module.
                if module:
                    self._add_edge(container_id, module, "IMPORTS", node.lineno)
                continue
            target = f"{module}.{alias.name}" if module else alias.name
            self._add_edge(container_id, target, "IMPORTS", node.lineno)

    def _visit_body_for_calls(self, parent: ast.AST, class_node_id: str | None = None) -> None:
        """Walk every node inside *parent* and emit CALLS and DATAFLOW edges.

        The container is the current scope (the module when the scope stack
        is empty). For each call with a resolvable name a CALLS edge
        ``container -> callee`` is emitted. DATAFLOW edges are emitted for:

        1. **Assignment from call** ``x = foo()``: ``container -> foo``, one
           edge per plain-name assignment target.
        2. **Return / yield / await of call** ``return foo()``:
           ``container -> foo``.
        3. **Nested calls** ``outer(inner())`` and ``outer(key=inner())``:
           ``inner -> outer``, because ``inner``'s result feeds ``outer``.

        ``self.X`` call targets are rewritten to ``ClassName.X`` when
        *class_node_id* is given.

        ``ast.walk`` descends into nested function and class bodies, so calls
        made there are attributed to this container as well as to the nested
        definition when it is visited itself.
        """
        container_id = self._container_id()

        def callee_of(expr: ast.AST | None) -> str | None:
            # Resolved name of the call *expr*, or None if it is not a named call.
            if not isinstance(expr, ast.Call):
                return None
            raw = _call_target_name(expr)
            return self._resolve_call_target(raw, class_node_id) if raw else None

        for child in ast.walk(parent):
            line = getattr(child, "lineno", 0)

            if isinstance(child, ast.Call):
                outer = callee_of(child)
                if outer:
                    self._add_edge(container_id, outer, "CALLS", line)
                # Positional arguments first, then keyword arguments.
                fed_values = [*child.args, *(kw.value for kw in child.keywords)]
                for value in fed_values:
                    inner = callee_of(value)
                    if inner and outer:
                        self._add_edge(inner, outer, "DATAFLOW", line)

            elif isinstance(child, ast.Assign):
                producer = callee_of(child.value)
                if producer:
                    for target in child.targets:
                        if isinstance(target, ast.Name):
                            self._add_edge(container_id, producer, "DATAFLOW", line)

            elif isinstance(child, (ast.Return, ast.Yield, ast.Await)):
                producer = callee_of(child.value)
                if producer:
                    self._add_edge(container_id, producer, "DATAFLOW", line)

    def visit_Assign(self, node: ast.Assign) -> None:
        # Only module-level assignments to plain names become variable nodes;
        # __all__ is export metadata, not a variable of interest.
        if not self._scope_stack:
            for target in node.targets:
                if isinstance(target, ast.Name) and target.id != "__all__":
                    self._record_module_variable(target.id, node)
        self.generic_visit(node)

    def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
        # Same rules as visit_Assign, including skipping an annotated __all__.
        target = node.target
        if not self._scope_stack and isinstance(target, ast.Name) and target.id != "__all__":
            self._record_module_variable(target.id, node)
        self.generic_visit(node)
568
+
569
+
570
def parse_file(path: Path, repo_root: Path) -> ParsedFile:
    """Parse a Python file and return a ParsedFile with nodes and edges.

    Args:
        path: Location of the file to parse; must lie under *repo_root*.
        repo_root: Repository root; node ids and paths are made relative to it.

    Returns:
        A ``ParsedFile`` holding the file node, every node and edge collected
        by ``PythonVisitor``, and one DATAFLOW edge per detected MongoDB
        collection. When the source cannot be parsed, or is nested too deeply
        to be walked, the result carries only path, content hash and line
        count.

    Raises:
        ValueError: If *path* is not located under *repo_root*.
        OSError: If the file cannot be read.
    """
    rel_path = path.relative_to(repo_root).as_posix()
    source = path.read_bytes()
    hash_val = content_hash(source)
    line_count = source.count(b"\n") + 1
    # Decoded once for the parse fallback and the textual scans below;
    # errors="replace" substitutes undecodable bytes instead of raising.
    source_text = source.decode("utf-8", errors="replace")

    def _unparsed() -> ParsedFile:
        # Minimal record for files that cannot be analysed structurally.
        return ParsedFile(path=rel_path, content_hash=hash_val, line_count=line_count)

    try:
        tree = ast.parse(source, filename=str(path))
    except (SyntaxError, ValueError):
        # ValueError: null bytes in source. SyntaxError: bad syntax.
        # Retry with explicit UTF-8 decoding and null-byte stripping.
        try:
            tree = ast.parse(source_text.replace("\x00", ""), filename=str(path))
        except (SyntaxError, ValueError, RecursionError):
            return _unparsed()
    except RecursionError:
        # Extremely nested source can exhaust the stack while the AST is built.
        return _unparsed()

    # Derive a module-like name from the relative path
    module_name = rel_path.replace("/", ".").removesuffix(".py")
    if module_name.endswith(".__init__"):
        module_name = module_name.removesuffix(".__init__")
    file_node_id = f"{rel_path}::{module_name}"

    # The visitor recurses once per nesting level, so deeply nested
    # expressions can exceed the interpreter's recursion limit; degrade to
    # the unparsed result rather than letting one file abort the caller.
    visitor = PythonVisitor(file_path=rel_path, module_name=module_name)
    try:
        visitor.visit(tree)
    except RecursionError:
        return _unparsed()

    # File-level docstring: module summary line plus any significant comments.
    file_docstring = _get_docstring(tree)
    significant_comments = _extract_significant_comments(source_text)
    if significant_comments:
        comment_block = " | ".join(significant_comments)
        if file_docstring:
            file_docstring += f" [Comments: {comment_block}]"
        else:
            file_docstring = f"[Comments: {comment_block}]"

    file_node = ParsedNode(
        id=file_node_id,
        name=module_name.rsplit(".", 1)[-1],
        qualified_name=module_name,
        type="file",
        file_path=rel_path,
        line_start=1,
        line_end=line_count,
        docstring=file_docstring,
        is_exported=True,
    )

    nodes = [file_node] + visitor.nodes
    edges = visitor.edges

    # MongoDB collection references are found textually, so they carry no
    # line number (line=0) and are attributed to the file as a whole.
    for coll_name in _extract_mongo_collections(source_text):
        edges.append(ParsedEdge(
            source=file_node_id,
            target=f"mongodb:{coll_name}",
            type="DATAFLOW",
            file_path=rel_path,
            line=0,
        ))

    return ParsedFile(
        path=rel_path,
        content_hash=hash_val,
        nodes=nodes,
        edges=edges,
        line_count=line_count,
    )
649
+
650
+
651
class PythonParser(BaseParser):
    """Adapter that exposes the module-level ``parse_file`` as a ``BaseParser``."""

    def extensions(self) -> frozenset[str]:
        """Return the file suffixes this parser handles (``.py`` only)."""
        handled = (".py",)
        return frozenset(handled)

    def parse(self, path: Path, repo_root: Path) -> ParsedFile:
        """Parse *path*, relative to *repo_root*, by delegating to ``parse_file``."""
        parsed = parse_file(path, repo_root)
        return parsed