codegraph-ai 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/PKG-INFO +1 -1
- codegraph_ai-0.2.2/codegraph/adapters/python_adapter.py +692 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/core.py +97 -1
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/models.py +17 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph_ai.egg-info/PKG-INFO +1 -1
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/pyproject.toml +1 -1
- codegraph_ai-0.2.1/codegraph/adapters/python_adapter.py +0 -337
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/README.md +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/__init__.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/__main__.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/adapters/__init__.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/adapters/base.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/adapters/c_adapter.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/adapters/java_adapter.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/adapters/js_adapter.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/analyzer.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/bug_locator.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/bug_parser.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/cli.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/github_client.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/issue_cache.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/issue_fetcher.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/mcp_server.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph/qa.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph_ai.egg-info/SOURCES.txt +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph_ai.egg-info/dependency_links.txt +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph_ai.egg-info/entry_points.txt +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph_ai.egg-info/requires.txt +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/codegraph_ai.egg-info/top_level.txt +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/setup.cfg +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_adapters.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_advanced.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_bug_locator.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_bug_parser.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_core_schema.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_cross_locate.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_impact.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_incremental.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_indexing.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_integration.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_issue_cache.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_java_adapter.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_js_adapter.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_models.py +0 -0
- {codegraph_ai-0.2.1 → codegraph_ai-0.2.2}/tests/test_similar.py +0 -0
|
@@ -0,0 +1,692 @@
|
|
|
1
|
+
"""Python source code adapter using tree-sitter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from tree_sitter_language_pack import get_parser
|
|
6
|
+
|
|
7
|
+
from codegraph.adapters.base import BaseAdapter
|
|
8
|
+
from codegraph.models import (
|
|
9
|
+
CallInfo,
|
|
10
|
+
ParsedClass,
|
|
11
|
+
ParsedField,
|
|
12
|
+
ParsedFunction,
|
|
13
|
+
ParsedImport,
|
|
14
|
+
ParseResult,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _node_text(node) -> str:
|
|
19
|
+
"""Return the UTF-8 text of a tree-sitter node."""
|
|
20
|
+
return node.text.decode("utf-8") if node.text else ""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _extract_docstring(body_node) -> str:
|
|
24
|
+
"""Return the leading docstring of a function/class body, if any."""
|
|
25
|
+
if body_node is None or body_node.child_count == 0:
|
|
26
|
+
return ""
|
|
27
|
+
first = body_node.children[0]
|
|
28
|
+
# tree-sitter-python may represent the docstring as:
|
|
29
|
+
# 1. expression_statement > string
|
|
30
|
+
# 2. string (directly under block)
|
|
31
|
+
string_node = None
|
|
32
|
+
if first.type == "expression_statement" and first.child_count > 0:
|
|
33
|
+
candidate = first.children[0]
|
|
34
|
+
if candidate.type == "string":
|
|
35
|
+
string_node = candidate
|
|
36
|
+
elif first.type == "string":
|
|
37
|
+
string_node = first
|
|
38
|
+
|
|
39
|
+
if string_node is not None:
|
|
40
|
+
# Try extracting from string_content child first (newer grammar)
|
|
41
|
+
for child in string_node.children:
|
|
42
|
+
if child.type == "string_content":
|
|
43
|
+
return _node_text(child).strip()
|
|
44
|
+
# Fallback: strip surrounding quotes manually
|
|
45
|
+
raw = _node_text(string_node)
|
|
46
|
+
for q in ('"""', "'''", '"', "'"):
|
|
47
|
+
if raw.startswith(q) and raw.endswith(q):
|
|
48
|
+
return raw[len(q) : -len(q)].strip()
|
|
49
|
+
return raw
|
|
50
|
+
return ""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _build_signature(func_node, source_lines: list[str]) -> str:
|
|
54
|
+
"""Build a human-readable signature from a function_definition node."""
|
|
55
|
+
name_node = func_node.child_by_field_name("name")
|
|
56
|
+
params_node = func_node.child_by_field_name("parameters")
|
|
57
|
+
ret_node = func_node.child_by_field_name("return_type")
|
|
58
|
+
|
|
59
|
+
name = _node_text(name_node) if name_node else "?"
|
|
60
|
+
params = _node_text(params_node) if params_node else "()"
|
|
61
|
+
ret = ""
|
|
62
|
+
if ret_node:
|
|
63
|
+
ret = f" -> {_node_text(ret_node)}"
|
|
64
|
+
return f"def {name}{params}{ret}"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _collect_calls(node, calls: list[CallInfo]) -> None:
|
|
68
|
+
"""Recursively collect function calls with receiver context."""
|
|
69
|
+
if node.type == "call":
|
|
70
|
+
func = node.child_by_field_name("function")
|
|
71
|
+
if func:
|
|
72
|
+
if func.type == "attribute":
|
|
73
|
+
obj_node = func.child_by_field_name("object")
|
|
74
|
+
attr_node = func.child_by_field_name("attribute")
|
|
75
|
+
receiver = _node_text(obj_node) if obj_node else None
|
|
76
|
+
callee = _node_text(attr_node) if attr_node else _node_text(func)
|
|
77
|
+
if receiver and "." in receiver:
|
|
78
|
+
receiver = receiver.rsplit(".", 1)[-1]
|
|
79
|
+
calls.append(CallInfo(
|
|
80
|
+
callee_name=callee,
|
|
81
|
+
receiver=receiver,
|
|
82
|
+
raw_expression=_node_text(func),
|
|
83
|
+
))
|
|
84
|
+
else:
|
|
85
|
+
callee = _node_text(func)
|
|
86
|
+
calls.append(CallInfo(
|
|
87
|
+
callee_name=callee,
|
|
88
|
+
receiver=None,
|
|
89
|
+
raw_expression=callee,
|
|
90
|
+
))
|
|
91
|
+
for child in node.children:
|
|
92
|
+
_collect_calls(child, calls)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _extract_type_name(type_node) -> tuple[str | None, bool, bool]:
|
|
96
|
+
"""Extract the base type name from a type annotation node.
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
(type_name, is_optional, is_list)
|
|
100
|
+
"""
|
|
101
|
+
if type_node is None:
|
|
102
|
+
return None, False, False
|
|
103
|
+
|
|
104
|
+
type_text = _node_text(type_node).strip()
|
|
105
|
+
if not type_text:
|
|
106
|
+
return None, False, False
|
|
107
|
+
|
|
108
|
+
is_optional = False
|
|
109
|
+
is_list = False
|
|
110
|
+
|
|
111
|
+
# Handle Optional[X] or X | None patterns
|
|
112
|
+
if type_text.startswith("Optional[") and type_text.endswith("]"):
|
|
113
|
+
is_optional = True
|
|
114
|
+
type_text = type_text[9:-1].strip()
|
|
115
|
+
elif " | None" in type_text or "None | " in type_text:
|
|
116
|
+
is_optional = True
|
|
117
|
+
type_text = type_text.replace(" | None", "").replace("None | ", "").strip()
|
|
118
|
+
|
|
119
|
+
# Handle List[X], list[X], Sequence[X]
|
|
120
|
+
for prefix in ("List[", "list[", "Sequence[", "Iterable["):
|
|
121
|
+
if type_text.startswith(prefix) and type_text.endswith("]"):
|
|
122
|
+
is_list = True
|
|
123
|
+
type_text = type_text[len(prefix):-1].strip()
|
|
124
|
+
break
|
|
125
|
+
|
|
126
|
+
# Handle nested Optional in List
|
|
127
|
+
if type_text.startswith("Optional[") and type_text.endswith("]"):
|
|
128
|
+
is_optional = True
|
|
129
|
+
type_text = type_text[9:-1].strip()
|
|
130
|
+
|
|
131
|
+
# Extract simple type name (ignore generics like Dict[str, int])
|
|
132
|
+
if "[" in type_text:
|
|
133
|
+
# For complex types, just take what's before the bracket
|
|
134
|
+
type_text = type_text.split("[")[0].strip()
|
|
135
|
+
|
|
136
|
+
# Handle qualified names like module.ClassName
|
|
137
|
+
if "." in type_text:
|
|
138
|
+
type_text = type_text.rsplit(".", 1)[-1]
|
|
139
|
+
|
|
140
|
+
return type_text if type_text else None, is_optional, is_list
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _extract_init_assignments(init_body) -> list[tuple[str, str | None]]:
|
|
144
|
+
"""Extract self.xxx = assignments from __init__ body.
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
List of (field_name, assigned_type) where assigned_type is the
|
|
148
|
+
constructor call name (e.g., 'LlamaModel' from 'LlamaModel(...)').
|
|
149
|
+
"""
|
|
150
|
+
assignments: list[tuple[str, str | None]] = []
|
|
151
|
+
if init_body is None:
|
|
152
|
+
return assignments
|
|
153
|
+
|
|
154
|
+
for stmt in init_body.children:
|
|
155
|
+
# Handle expression_statement containing assignment
|
|
156
|
+
if stmt.type == "expression_statement":
|
|
157
|
+
for child in stmt.children:
|
|
158
|
+
if child.type == "assignment":
|
|
159
|
+
_process_assignment(child, assignments)
|
|
160
|
+
elif stmt.type == "assignment":
|
|
161
|
+
_process_assignment(stmt, assignments)
|
|
162
|
+
# Handle with statements (context managers)
|
|
163
|
+
elif stmt.type == "with_statement":
|
|
164
|
+
body = stmt.child_by_field_name("body")
|
|
165
|
+
if body:
|
|
166
|
+
assignments.extend(_extract_init_assignments(body))
|
|
167
|
+
# Handle if/try statements
|
|
168
|
+
elif stmt.type in ("if_statement", "try_statement"):
|
|
169
|
+
for child in stmt.children:
|
|
170
|
+
if child.type == "block":
|
|
171
|
+
assignments.extend(_extract_init_assignments(child))
|
|
172
|
+
|
|
173
|
+
return assignments
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _process_assignment(assign_node, assignments: list[tuple[str, str | None, bool]]) -> None:
|
|
177
|
+
"""Process a single assignment node to extract self.xxx = patterns.
|
|
178
|
+
|
|
179
|
+
Now returns (field_name, type_hint, is_optional) tuples.
|
|
180
|
+
"""
|
|
181
|
+
left = assign_node.child_by_field_name("left")
|
|
182
|
+
right = assign_node.child_by_field_name("right")
|
|
183
|
+
type_node = assign_node.child_by_field_name("type")
|
|
184
|
+
|
|
185
|
+
if left is None:
|
|
186
|
+
return
|
|
187
|
+
|
|
188
|
+
# Check if left side is self.xxx
|
|
189
|
+
if left.type == "attribute":
|
|
190
|
+
obj = left.child_by_field_name("object")
|
|
191
|
+
attr = left.child_by_field_name("attribute")
|
|
192
|
+
if obj and _node_text(obj) == "self" and attr:
|
|
193
|
+
field_name = _node_text(attr)
|
|
194
|
+
assigned_type = None
|
|
195
|
+
is_optional = False
|
|
196
|
+
|
|
197
|
+
# First, check if there's a type annotation (e.g., self.cache: Optional[X] = None)
|
|
198
|
+
if type_node:
|
|
199
|
+
assigned_type, is_optional, _ = _extract_type_name(type_node)
|
|
200
|
+
else:
|
|
201
|
+
# Try to extract from constructor call in RHS
|
|
202
|
+
assigned_type = _extract_constructor_type(right)
|
|
203
|
+
|
|
204
|
+
# If right side is None, mark as optional
|
|
205
|
+
if right and _node_text(right) == "None":
|
|
206
|
+
is_optional = True
|
|
207
|
+
|
|
208
|
+
assignments.append((field_name, assigned_type, is_optional))
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _extract_constructor_type(node) -> str | None:
    """Recursively extract the constructor type from a call expression.

    Handles patterns like:
    - LlamaModel(...)
    - internals.LlamaModel(...)
    - self._stack.enter_context(contextlib.closing(internals.LlamaModel(...)))
    - tokenizer or LlamaTokenizer(self)  # conditional/or expression

    Returns:
        The constructor name when a capitalized call is found anywhere in
        the expression, otherwise None. "Constructor" is approximated by a
        leading-uppercase callee name (PEP 8 class-naming convention).
    """
    if node is None:
        return None

    # Handle conditional expression: x or Y() / x if cond else Y()
    if node.type == "boolean_operator":
        # For "a or b", try both sides; first constructor found wins.
        for child in node.children:
            if child.is_named:
                result = _extract_constructor_type(child)
                if result:
                    return result
        return None

    if node.type == "conditional_expression":
        # For "a if cond else b", check consequence and alternative.
        # Bare identifiers are skipped — they carry no constructor name.
        for child in node.children:
            if child.is_named and child.type != "identifier":
                result = _extract_constructor_type(child)
                if result:
                    return result
        return None

    if node.type == "call":
        func = node.child_by_field_name("function")
        args = node.child_by_field_name("arguments")

        if func:
            func_text = _node_text(func)

            # Check if this looks like a constructor call (starts with uppercase or is qualified)
            if func.type == "identifier":
                # Simple call like LlamaModel(...)
                if func_text and func_text[0].isupper():
                    return func_text
            elif func.type == "attribute":
                # Qualified call like internals.LlamaModel(...); only the
                # final attribute segment is tested for the uppercase rule.
                attr_node = func.child_by_field_name("attribute")
                if attr_node:
                    attr_text = _node_text(attr_node)
                    if attr_text and attr_text[0].isupper():
                        return attr_text

            # This might be a wrapper call like enter_context(...) or closing(...)
            # Look into the arguments for constructor calls
            if args:
                for arg in args.children:
                    if arg.is_named:
                        result = _extract_constructor_type(arg)
                        if result:
                            return result

    # Any other node type (literal, plain identifier, ...) yields nothing.
    return None
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class PythonAdapter(BaseAdapter):
    """Extract functions, classes, calls and imports from Python files.

    Parsing uses the tree-sitter Python grammar via
    ``tree_sitter_language_pack``. Only top-level definitions and the
    direct members of top-level classes are walked; decorated definitions
    are unwrapped with ``_decorated_inner`` before extraction.
    """

    def __init__(self) -> None:
        # One parser instance is created up front and reused for every file.
        self._parser = get_parser("python")

    # -- BaseAdapter interface ------------------------------------------------

    def language_name(self) -> str:
        """Return this adapter's language identifier."""
        return "python"

    def supported_extensions(self) -> list[str]:
        """Return the file extensions this adapter handles."""
        return [".py"]

    def parse_file(self, source: bytes, file_path: str) -> ParseResult:
        """Parse *source* and return its functions, classes and imports."""
        tree = self._parser.parse(source)
        root = tree.root_node
        # Decoded lines are threaded through for signature building;
        # errors="replace" keeps parsing alive on mis-encoded files.
        source_lines = source.decode("utf-8", errors="replace").splitlines()

        functions: list[ParsedFunction] = []
        classes: list[ParsedClass] = []
        imports: list[ParsedImport] = []

        self._walk_top_level(
            root, file_path, source_lines, functions, classes, imports
        )
        return ParseResult(functions=functions, classes=classes, imports=imports)

    # -- Internal helpers -----------------------------------------------------

    def _walk_top_level(
        self,
        node,
        file_path: str,
        source_lines: list[str],
        functions: list[ParsedFunction],
        classes: list[ParsedClass],
        imports: list[ParsedImport],
    ) -> None:
        """Walk top-level children of *node* and populate lists."""
        for child in node.children:
            if child.type == "function_definition":
                self._extract_function(
                    child, file_path, source_lines, functions, class_name=None
                )
            elif child.type == "decorated_definition":
                # Unwrap the decorator stack to reach the real definition.
                inner = _decorated_inner(child)
                if inner is not None and inner.type == "function_definition":
                    self._extract_function(
                        inner, file_path, source_lines, functions, class_name=None
                    )
                elif inner is not None and inner.type == "class_definition":
                    self._extract_class(
                        inner, file_path, source_lines, functions, classes
                    )
            elif child.type == "class_definition":
                self._extract_class(
                    child, file_path, source_lines, functions, classes
                )
            elif child.type in ("import_statement", "import_from_statement"):
                self._extract_import(child, file_path, imports)

    def _extract_function(
        self,
        func_node,
        file_path: str,
        source_lines: list[str],
        functions: list[ParsedFunction],
        class_name: str | None,
    ) -> None:
        """Append a ParsedFunction for *func_node*.

        When *class_name* is given the function is recorded as a method and
        its qualified name becomes ``file:Class.method``.
        """
        name_node = func_node.child_by_field_name("name")
        name = _node_text(name_node) if name_node else "unknown"
        # tree-sitter rows are 0-based; stored line numbers are 1-based.
        start_line = func_node.start_point[0] + 1
        end_line = func_node.end_point[0] + 1

        qualified = f"{file_path}:{name}" if not class_name else f"{file_path}:{class_name}.{name}"
        sig = _build_signature(func_node, source_lines)

        body_node = func_node.child_by_field_name("body")
        doc = _extract_docstring(body_node)

        calls: list[CallInfo] = []
        if body_node:
            _collect_calls(body_node, calls)

        functions.append(
            ParsedFunction(
                name=name,
                qualified_name=qualified,
                signature=sig,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                doc_comment=doc,
                call_names=[c.callee_name for c in calls],
                calls=calls,
                class_name=class_name,
            )
        )

    def _extract_class(
        self,
        class_node,
        file_path: str,
        source_lines: list[str],
        functions: list[ParsedFunction],
        classes: list[ParsedClass],
    ) -> None:
        """Append a ParsedClass (and its methods, via _extract_function).

        Field information is merged into ``field_map`` from three sources:
        class-level assignments, annotation-only declarations, and
        ``__init__`` parameter/body analysis.
        """
        name_node = class_node.child_by_field_name("name")
        cls_name = _node_text(name_node) if name_node else "unknown"
        start_line = class_node.start_point[0] + 1
        end_line = class_node.end_point[0] + 1
        qualified = f"{file_path}:{cls_name}"

        base_classes: list[str] = []
        superclasses = class_node.child_by_field_name("superclasses")
        if superclasses:
            for child in superclasses.children:
                if child.is_named:
                    text = _node_text(child)
                    # `object` is implicit in Python 3 and carries no signal.
                    if text and text not in ("object",):
                        base_classes.append(text)

        method_names: list[str] = []
        fields: list[ParsedField] = []
        init_method = None

        # Maps field_name -> ParsedField for merging info from multiple sources
        field_map: dict[str, ParsedField] = {}

        body = class_node.child_by_field_name("body")
        if body:
            for child in body.children:
                # Extract class-level annotated assignments: field: Type = value
                if child.type == "expression_statement":
                    for inner in child.children:
                        if inner.type == "assignment":
                            self._extract_class_level_field(inner, field_map)
                elif child.type == "typed_parameter" or (
                    child.type == "expression_statement" and
                    child.child_count > 0 and
                    child.children[0].type == "typed_parameter"
                ):
                    # NOTE(review): the expression_statement half of this
                    # condition is unreachable — the `if` branch above already
                    # matches every expression_statement, so annotation-only
                    # fields wrapped that way never reach this handler.
                    # Verify against the grammar whether such fields are
                    # being silently dropped.
                    # Handle standalone type annotations (Python dataclass style)
                    self._extract_annotated_field(child, field_map)

                # Extract methods (separate `if`, so the field checks above
                # never shadow method extraction).
                if child.type == "function_definition":
                    m_name = _node_text(child.child_by_field_name("name"))
                    method_names.append(m_name)
                    self._extract_function(
                        child, file_path, source_lines, functions, class_name=cls_name
                    )
                    if m_name == "__init__":
                        init_method = child
                elif child.type == "decorated_definition":
                    inner = _decorated_inner(child)
                    if inner is not None and inner.type == "function_definition":
                        m_name = _node_text(inner.child_by_field_name("name"))
                        method_names.append(m_name)
                        self._extract_function(
                            inner, file_path, source_lines, functions, class_name=cls_name
                        )
                        if m_name == "__init__":
                            init_method = inner

        # Extract fields from __init__ method
        if init_method:
            self._extract_init_fields(init_method, field_map)

        # Convert field_map to list
        fields = list(field_map.values())

        classes.append(
            ParsedClass(
                name=cls_name,
                qualified_name=qualified,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                method_names=method_names,
                base_classes=base_classes,
                fields=fields,
            )
        )

    def _extract_class_level_field(
        self,
        assign_node,
        field_map: dict[str, ParsedField],
    ) -> None:
        """Extract class-level field from assignment with optional type annotation."""
        left = assign_node.child_by_field_name("left")
        right = assign_node.child_by_field_name("right")
        type_node = assign_node.child_by_field_name("type")

        if left is None:
            return

        # Handle annotated assignment: field: Type = value
        if left.type == "identifier":
            field_name = _node_text(left)
            type_hint, is_optional, is_list = _extract_type_name(type_node)

            # A `= None` default marks the field optional even unannotated.
            if right and _node_text(right) == "None":
                is_optional = True

            if field_name not in field_map:
                field_map[field_name] = ParsedField(
                    name=field_name,
                    type_hint=type_hint,
                    is_optional=is_optional,
                    is_list=is_list,
                    assigned_in_init=False,
                )
            else:
                # Merge with existing info; optional/list flags are sticky,
                # a concrete type hint always wins over a missing one.
                if type_hint:
                    field_map[field_name].type_hint = type_hint
                if is_optional:
                    field_map[field_name].is_optional = True
                if is_list:
                    field_map[field_name].is_list = True

    def _extract_annotated_field(
        self,
        node,
        field_map: dict[str, ParsedField],
    ) -> None:
        """Extract a type-annotated field declaration (dataclass style)."""
        # Handle: field_name: Type
        if node.type == "typed_parameter":
            name_node = node.child_by_field_name("name")
            type_node = node.child_by_field_name("type")
            if name_node:
                field_name = _node_text(name_node)
                type_hint, is_optional, is_list = _extract_type_name(type_node)
                # First writer wins — annotation-only info never overwrites
                # a field already recorded from another source.
                if field_name not in field_map:
                    field_map[field_name] = ParsedField(
                        name=field_name,
                        type_hint=type_hint,
                        is_optional=is_optional,
                        is_list=is_list,
                    )

    def _extract_init_fields(
        self,
        init_node,
        field_map: dict[str, ParsedField],
    ) -> None:
        """Extract fields from __init__ parameters and body assignments."""
        # Pass 1: collect parameter annotations so a plain
        # `self.x = x` assignment can inherit the parameter's type.
        params_node = init_node.child_by_field_name("parameters")
        param_types: dict[str, tuple[str | None, bool, bool]] = {}

        if params_node:
            for child in params_node.children:
                if child.type == "typed_parameter":
                    name_node = child.child_by_field_name("name")
                    type_node = child.child_by_field_name("type")
                    if name_node:
                        param_name = _node_text(name_node)
                        param_types[param_name] = _extract_type_name(type_node)
                elif child.type == "typed_default_parameter":
                    name_node = child.child_by_field_name("name")
                    type_node = child.child_by_field_name("type")
                    default_node = child.child_by_field_name("value")
                    if name_node:
                        param_name = _node_text(name_node)
                        type_hint, is_optional, is_list = _extract_type_name(type_node)
                        # If default is None, mark as optional
                        if default_node and _node_text(default_node) == "None":
                            is_optional = True
                        param_types[param_name] = (type_hint, is_optional, is_list)
                elif child.type == "default_parameter":
                    name_node = child.child_by_field_name("name")
                    default_node = child.child_by_field_name("value")
                    if name_node:
                        param_name = _node_text(name_node)
                        # NOTE(review): this expression yields None (not
                        # False) when there is no default node; downstream
                        # only tests truthiness so it is benign, but
                        # normalizing with bool() would be cleaner.
                        is_optional = default_node and _node_text(default_node) == "None"
                        param_types[param_name] = (None, is_optional, False)

        # Pass 2: walk the body's self.x = ... assignments.
        body = init_node.child_by_field_name("body")
        assignments = _extract_init_assignments(body)

        for field_name, assigned_type, assign_is_optional in assignments:
            # Skip dunder attributes — private implementation details.
            if field_name.startswith("__") and field_name.endswith("__"):
                continue

            # Try to determine type from various sources
            type_hint = assigned_type
            is_optional = assign_is_optional
            is_list = False

            # Check if this field comes from a parameter with the same name
            # e.g., self.cache = cache, where cache: Optional[BaseLlamaCache].
            # The leading-underscore strip matches `self._x = x` to param `x`.
            clean_name = field_name.lstrip("_")
            for param_name, (param_type, param_opt, param_list) in param_types.items():
                if param_name == clean_name or param_name == field_name:
                    if param_type and not type_hint:
                        type_hint = param_type
                    is_optional = is_optional or param_opt
                    is_list = is_list or param_list
                    break

            if field_name in field_map:
                # Merge with existing: flags are sticky, constructor-derived
                # type only fills a gap, never overwrites an annotation.
                existing = field_map[field_name]
                if type_hint and not existing.type_hint:
                    existing.type_hint = type_hint
                existing.assigned_in_init = True
                if is_optional:
                    existing.is_optional = True
                if is_list:
                    existing.is_list = True
            else:
                # Only add if we have type information
                if type_hint:
                    field_map[field_name] = ParsedField(
                        name=field_name,
                        type_hint=type_hint,
                        is_optional=is_optional,
                        is_list=is_list,
                        assigned_in_init=True,
                    )

    @staticmethod
    def _extract_import(
        node,
        file_path: str,
        imports: list[ParsedImport],
    ) -> None:
        """Extract import with imported names and relative import support."""
        if node.type == "import_statement":
            # `import a.b` and `import a.b as c` each yield one record.
            for child in node.children:
                if child.type == "dotted_name":
                    imports.append(
                        ParsedImport(
                            source_path=file_path,
                            target_module=_node_text(child),
                        )
                    )
                elif child.type == "aliased_import":
                    name_node = child.child_by_field_name("name")
                    if name_node:
                        imports.append(
                            ParsedImport(
                                source_path=file_path,
                                target_module=_node_text(name_node),
                            )
                        )
        elif node.type == "import_from_statement":
            module_node = node.child_by_field_name("module_name")
            if module_node is None:
                return

            raw_module = _node_text(module_node)

            is_relative = False
            relative_level = 0
            target_module = raw_module

            # Count leading dots to get the relative-import level.
            # NOTE(review): the two branches below are identical except for
            # how relativeness is detected (node type vs leading dot) —
            # candidates for merging into one `or` condition.
            if module_node.type == "relative_import":
                is_relative = True
                for ch in raw_module:
                    if ch == ".":
                        relative_level += 1
                    else:
                        break
                target_module = raw_module[relative_level:]
            elif raw_module.startswith("."):
                is_relative = True
                for ch in raw_module:
                    if ch == ".":
                        relative_level += 1
                    else:
                        break
                target_module = raw_module[relative_level:]

            # Names listed after the `import` keyword; everything before it
            # belongs to the module path and is skipped.
            imported_names: list[str] = []
            past_import = False
            for child in node.children:
                if not child.is_named and _node_text(child) == "import":
                    past_import = True
                    continue
                if not past_import:
                    continue
                if child.type == "dotted_name":
                    imported_names.append(_node_text(child))
                elif child.type == "aliased_import":
                    name_node = child.child_by_field_name("name")
                    if name_node:
                        imported_names.append(_node_text(name_node))
                elif child.type == "wildcard_import":
                    imported_names.append("*")

            imports.append(
                ParsedImport(
                    source_path=file_path,
                    target_module=target_module,
                    imported_names=imported_names,
                    is_relative=is_relative,
                    relative_level=relative_level,
                )
            )
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
# -- Utilities ---------------------------------------------------------------
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def _decorated_inner(node):
|
|
688
|
+
"""Return the actual definition node wrapped by a decorated_definition."""
|
|
689
|
+
for child in node.children:
|
|
690
|
+
if child.type in ("function_definition", "class_definition"):
|
|
691
|
+
return child
|
|
692
|
+
return None
|