PyPI - codetree-rag - Versions diffs - 0.1.0__py3-none-any.whl - Mend

codetree-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

codetree/__init__.py +13 -0
codetree/cli.py +220 -0
codetree/config.py +110 -0
codetree/core.py +179 -0
codetree/indexer.py +322 -0
codetree/llm.py +116 -0
codetree/parser.py +352 -0
codetree/retriever.py +192 -0
codetree_rag-0.1.0.dist-info/METADATA +496 -0
codetree_rag-0.1.0.dist-info/RECORD +14 -0
codetree_rag-0.1.0.dist-info/WHEEL +5 -0
codetree_rag-0.1.0.dist-info/entry_points.txt +2 -0
codetree_rag-0.1.0.dist-info/licenses/LICENSE +21 -0
codetree_rag-0.1.0.dist-info/top_level.txt +1 -0

codetree/parser.py ADDED Viewed

@@ -0,0 +1,352 @@
+"""Regex-based code parser for extracting file structure (tree-sitter is not used yet)."""
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+# Language file extensions mapping.
+# Keys are the language names returned by CodeParser.detect_language; values are
+# the lowercase, dot-prefixed file suffixes that select that language.
+# "c" and "cpp" have no dedicated branch in CodeParser.parse_file, so files with
+# those suffixes only get the basic fallback FileInfo (path, language, line count).
+LANGUAGE_EXTENSIONS = {
+    "python": [".py", ".pyi"],
+    "javascript": [".js", ".jsx", ".mjs"],
+    "typescript": [".ts", ".tsx"],
+    "go": [".go"],
+    "rust": [".rs"],
+    "java": [".java"],
+    "c": [".c", ".h"],
+    "cpp": [".cpp", ".hpp", ".cc", ".cxx"],
+}
+@dataclass
+class CodeEntity:
+    """Represents a code entity (function, class, etc.).
+    Instances are built by the ``CodeParser._parse_*`` methods from regex matches.
+    """
+    name: str  # identifier captured from the declaration
+    type: str  # function, class, method, variable
+    start_line: int  # 1-based line on which the regex match begins
+    end_line: int  # the parsers in this module currently set this equal to start_line
+    docstring: Optional[str] = None  # only filled in by the Python parser
+    signature: Optional[str] = None  # reconstructed header text, when a parser builds one
+    decorators: list[str] = field(default_factory=list)  # only filled in by the Python parser
+    children: list["CodeEntity"] = field(default_factory=list)  # not populated by the parsers in this module
+@dataclass
+class FileInfo:
+    """Parsed information about a code file.
+    Returned by ``CodeParser.parse_file``; which fields are filled depends on the
+    language-specific parser that handled the file.
+    """
+    path: Path  # the path that was passed to the parser
+    language: str  # language name, a key of LANGUAGE_EXTENSIONS
+    imports: list[str] = field(default_factory=list)  # import statements or paths, format varies by language
+    functions: list[CodeEntity] = field(default_factory=list)  # functions (Java: methods)
+    classes: list[CodeEntity] = field(default_factory=list)  # filled for Python, JavaScript/TypeScript and Java
+    variables: list[str] = field(default_factory=list)  # only the Python parser fills this (upper-case names)
+    summary: Optional[str] = None  # never set by the parsers in this module
+    line_count: int = 0  # number of "\n"-separated pieces of the file content
+class CodeParser:
+    """Parse code files to extract structure information."""
+    def __init__(self):
+        """Create a parser whose ``_parsers`` mapping starts out empty."""
+        # NOTE(review): no method visible in this class reads ``_parsers``;
+        # presumably reserved for per-language parser objects - confirm.
+        self._parsers = dict()
+    def detect_language(self, file_path: Path) -> Optional[str]:
+        """Return the language whose extension list contains the path's suffix.
+        The suffix is lower-cased before the lookup, so ``Foo.PY`` is Python.
+        Returns ``None`` when no entry of ``LANGUAGE_EXTENSIONS`` lists the suffix.
+        """
+        ext = file_path.suffix.lower()
+        return next(
+            (name for name, known in LANGUAGE_EXTENSIONS.items() if ext in known),
+            None,
+        )
+    def parse_file(self, file_path: Path, content: Optional[str] = None) -> Optional[FileInfo]:
+        """Parse one source file into a ``FileInfo``.
+        Args:
+            file_path: Path of the file; its suffix selects the language.
+            content: Source text. When ``None`` the file is read as UTF-8.
+        Returns:
+            The parsed ``FileInfo``, or ``None`` when the suffix is not a known
+            language or the file cannot be read/decoded.
+        """
+        lang = self.detect_language(file_path)
+        if not lang:
+            return None
+        if content is None:
+            try:
+                content = file_path.read_text(encoding="utf-8")
+            except (UnicodeDecodeError, IOError):
+                return None
+        source_lines = content.split("\n")
+        # Use regex-based parsing for now (simpler than tree-sitter for MVP)
+        if lang in ("javascript", "typescript"):
+            # The JS/TS parser is shared, so it is told which language to record.
+            return self._parse_javascript(file_path, content, source_lines, lang)
+        dispatch = {
+            "python": self._parse_python,
+            "go": self._parse_go,
+            "rust": self._parse_rust,
+            "java": self._parse_java,
+        }
+        handler = dispatch.get(lang)
+        if handler is not None:
+            return handler(file_path, content, source_lines)
+        # Basic fallback: known language without a dedicated parser.
+        return FileInfo(
+            path=file_path,
+            language=lang,
+            line_count=len(source_lines),
+        )
+    def _parse_python(self, file_path: Path, content: str, lines: list[str]) -> FileInfo:
+        """Collect imports, functions, classes and constants from Python source.
+        Extraction is regex based: only statements that begin at the start of a
+        line are seen, ``end_line`` mirrors ``start_line``, and "variables" are
+        upper-case names assigned at the start of a line.
+        Args:
+            file_path: Path recorded on the returned ``FileInfo``.
+            content: Full source text.
+            lines: ``content`` split on newlines; only its length is used.
+        Returns:
+            A ``FileInfo`` with ``language="python"``.
+        """
+        def line_of(offset: int) -> int:
+            # 1-based number of the line that contains ``offset``.
+            return content[:offset].count("\n") + 1
+        def split_decorators(block: str) -> list[str]:
+            # One entry per non-blank decorator line, whitespace trimmed.
+            if not block:
+                return []
+            return [row.strip() for row in block.strip().split("\n") if row.strip()]
+        # Extract imports
+        import_re = re.compile(r"^(?:from\s+[\w.]+\s+)?import\s+.+", re.MULTILINE)
+        imports = [hit.group().strip() for hit in import_re.finditer(content)]
+        # Extract functions (the match starts at the first decorator, if any)
+        func_re = re.compile(
+            r"^(?P<decorators>(?:@[\w.]+(?:\([^)]*\))?\s*\n)*)"
+            r"(?P<async>async\s+)?def\s+(?P<name>\w+)\s*\((?P<params>[^)]*)\)",
+            re.MULTILINE
+        )
+        functions = []
+        for hit in func_re.finditer(content):
+            name = hit.group("name")
+            signature = f"def {name}({hit.group('params')})"
+            if hit.group("async"):
+                signature = "async " + signature
+            first_line = line_of(hit.start())
+            functions.append(CodeEntity(
+                name=name,
+                type="function",
+                start_line=first_line,
+                end_line=first_line,  # Simplified
+                signature=signature,
+                decorators=split_decorators(hit.group("decorators")),
+                docstring=self._extract_python_docstring(content, hit.end()),
+            ))
+        # Extract classes
+        class_re = re.compile(
+            r"^(?P<decorators>(?:@[\w.]+(?:\([^)]*\))?\s*\n)*)"
+            r"class\s+(?P<name>\w+)(?:\((?P<bases>[^)]*)\))?:",
+            re.MULTILINE
+        )
+        classes = []
+        for hit in class_re.finditer(content):
+            name = hit.group("name")
+            bases = hit.group("bases") or ""
+            first_line = line_of(hit.start())
+            classes.append(CodeEntity(
+                name=name,
+                type="class",
+                start_line=first_line,
+                end_line=first_line,  # Simplified
+                signature=f"class {name}({bases})" if bases else f"class {name}",
+                decorators=split_decorators(hit.group("decorators")),
+                docstring=self._extract_python_docstring(content, hit.end()),
+            ))
+        # Extract module-level variables (simplified)
+        var_re = re.compile(r"^([A-Z][A-Z_0-9]*)\s*=", re.MULTILINE)
+        variables = [hit.group(1) for hit in var_re.finditer(content)]
+        return FileInfo(
+            path=file_path,
+            language="python",
+            imports=imports,
+            functions=functions,
+            classes=classes,
+            variables=variables,
+            line_count=len(lines),
+        )
+    def _extract_python_docstring(self, content: str, pos: int) -> Optional[str]:
+        """Extract the docstring that follows a ``def``/``class`` header.
+        ``pos`` is the offset just past the header text that was matched. That is
+        either right after the closing ``)`` of a parameter list (so ``:`` or
+        ``-> annotation:`` still follows) or right after the header's ``:`` (as
+        the class pattern in ``_parse_python`` produces). Both positions are
+        accepted; previously the second form and any ``->`` annotation never
+        matched, so those docstrings were silently dropped.
+        Args:
+            content: Full source text.
+            pos: Offset in ``content`` where the search starts.
+        Returns:
+            The stripped docstring text truncated to 200 characters, or ``None``
+            when the first thing after the header line is not a triple-quoted
+            string that closes within the next 500 characters.
+        """
+        # Only a bounded window is scanned; long docstrings get truncated anyway.
+        remaining = content[pos:pos + 500]
+        # Optional "-> annotation", optional ":", rest of the header line, then
+        # an (optionally r/u-prefixed) triple-quoted string as the first token.
+        match = re.search(
+            r'^[ \t]*(?:->[^\n]*?)?:?[ \t]*\n\s*[rRuU]?(?P<quote>"""|\'\'\')(?P<body>.+?)(?P=quote)',
+            remaining,
+            re.DOTALL,
+        )
+        if match:
+            return match.group("body").strip()[:200]  # Truncate
+        return None
+    def _parse_javascript(self, file_path: Path, content: str, lines: list[str], language: str) -> FileInfo:
+        """Collect imports, functions and classes from JavaScript/TypeScript source.
+        Regex based. Functions are listed pattern by pattern (all ``function``
+        declarations first, then ``const name = (...) =>`` arrows), not in source
+        order, and ``end_line`` mirrors ``start_line``.
+        Args:
+            file_path: Path recorded on the returned ``FileInfo``.
+            content: Full source text.
+            lines: ``content`` split on newlines; only its length is used.
+            language: Language name to record (``"javascript"`` or ``"typescript"``).
+        Returns:
+            A ``FileInfo`` for the file.
+        """
+        def line_of(offset: int) -> int:
+            # 1-based number of the line that contains ``offset``.
+            return content[:offset].count("\n") + 1
+        # Extract imports (import/export lines that end in a quote, optional ";")
+        import_re = re.compile(r"^(?:import|export)\s+.+?['\"];?$", re.MULTILINE)
+        imports = [hit.group().strip() for hit in import_re.finditer(content)]
+        # Extract functions
+        function_res = (
+            # function declaration
+            re.compile(r"(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)"),
+            # arrow function with const
+            re.compile(r"(?:export\s+)?const\s+(\w+)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>"),
+        )
+        functions = []
+        for regex in function_res:
+            for hit in regex.finditer(content):
+                first_line = line_of(hit.start())
+                functions.append(CodeEntity(
+                    name=hit.group(1),
+                    type="function",
+                    start_line=first_line,
+                    end_line=first_line,
+                ))
+        # Extract classes
+        class_re = re.compile(r"(?:export\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?")
+        classes = []
+        for hit in class_re.finditer(content):
+            name, extends = hit.group(1), hit.group(2)
+            signature = f"class {name}"
+            if extends:
+                signature += f" extends {extends}"
+            first_line = line_of(hit.start())
+            classes.append(CodeEntity(
+                name=name,
+                type="class",
+                start_line=first_line,
+                end_line=first_line,
+                signature=signature,
+            ))
+        return FileInfo(
+            path=file_path,
+            language=language,
+            imports=imports,
+            functions=functions,
+            classes=classes,
+            line_count=len(lines),
+        )
+    def _parse_go(self, file_path: Path, content: str, lines: list[str]) -> FileInfo:
+        """Parse Go file.
+        Collects import paths and function/method declarations with regexes.
+        Import paths are returned without quotes for both the single-line and the
+        parenthesised form. An alias, ``_`` or ``.`` in front of the path and a
+        trailing ``//`` comment are dropped, so ``alias "pkg/path"`` yields
+        ``pkg/path`` (it used to yield the malformed ``alias "pkg/path``).
+        Args:
+            file_path: Path recorded on the returned ``FileInfo``.
+            content: Full source text.
+            lines: ``content`` split on newlines; only its length is used.
+        Returns:
+            A ``FileInfo`` with ``language="go"``; ``end_line`` of every function
+            mirrors its ``start_line``.
+        """
+        imports = []
+        functions = []
+        # Extract imports: a parenthesised group (group 1) or a single,
+        # optionally aliased, quoted path (group 2).
+        import_pattern = re.compile(r'import\s+(?:\(\s*([^)]+)\s*\)|(?:[\w.]+\s+)?"([^"]+)")')
+        quoted_path = re.compile(r'"([^"]+)"')
+        for match in import_pattern.finditer(content):
+            if match.group(1):
+                for line in match.group(1).split("\n"):
+                    # Ignore anything after "//" so comment lines and trailing
+                    # comments are not recorded as imports.
+                    code = line.split("//", 1)[0]
+                    imports.extend(quoted_path.findall(code))
+            elif match.group(2):
+                imports.append(match.group(2))
+        # Extract functions (an optional receiver in parentheses is skipped)
+        func_pattern = re.compile(r"func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(([^)]*)\)")
+        for match in func_pattern.finditer(content):
+            start_line = content[:match.start()].count("\n") + 1
+            name = match.group(1)
+            params = match.group(2)
+            functions.append(CodeEntity(
+                name=name,
+                type="function",
+                start_line=start_line,
+                end_line=start_line,
+                signature=f"func {name}({params})",
+            ))
+        return FileInfo(
+            path=file_path,
+            language="go",
+            imports=imports,
+            functions=functions,
+            line_count=len(lines),
+        )
+    def _parse_rust(self, file_path: Path, content: str, lines: list[str]) -> FileInfo:
+        """Collect ``use`` statements and ``fn`` declarations from Rust source.
+        Regex based; ``end_line`` of every function mirrors its ``start_line``.
+        Args:
+            file_path: Path recorded on the returned ``FileInfo``.
+            content: Full source text.
+            lines: ``content`` split on newlines; only its length is used.
+        Returns:
+            A ``FileInfo`` with ``language="rust"``.
+        """
+        # Extract use statements (only those starting at the beginning of a line)
+        use_re = re.compile(r"^use\s+.+;", re.MULTILINE)
+        imports = [hit.group().strip() for hit in use_re.finditer(content)]
+        # Extract functions
+        fn_re = re.compile(r"(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*(?:<[^>]+>)?\s*\(([^)]*)\)")
+        functions = []
+        for hit in fn_re.finditer(content):
+            first_line = content[:hit.start()].count("\n") + 1
+            functions.append(CodeEntity(
+                name=hit.group(1),
+                type="function",
+                start_line=first_line,
+                end_line=first_line,
+            ))
+        return FileInfo(
+            path=file_path,
+            language="rust",
+            imports=imports,
+            functions=functions,
+            line_count=len(lines),
+        )
+    def _parse_java(self, file_path: Path, content: str, lines: list[str]) -> FileInfo:
+        """Parse Java file.
+        Collects ``import`` statements, class declarations and method-like
+        declarations with regexes. A method candidate is ``<word> <name>(...)``.
+        Candidates whose name is a control-flow keyword, or whose leading word is a
+        statement keyword such as ``return`` or ``new``, are discarded, so calls
+        like ``return foo(x)`` and ``new Foo(x)`` are no longer reported as methods.
+        Args:
+            file_path: Path recorded on the returned ``FileInfo``.
+            content: Full source text.
+            lines: ``content`` split on newlines; only its length is used.
+        Returns:
+            A ``FileInfo`` with ``language="java"``; methods are stored in
+            ``functions`` with ``type="method"`` and ``end_line == start_line``.
+        """
+        imports = []
+        functions = []
+        classes = []
+        # Extract imports
+        import_pattern = re.compile(r"^import\s+.+;", re.MULTILINE)
+        for match in import_pattern.finditer(content):
+            imports.append(match.group().strip())
+        # Extract classes
+        class_pattern = re.compile(r"(?:public\s+)?(?:abstract\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?")
+        for match in class_pattern.finditer(content):
+            start_line = content[:match.start()].count("\n") + 1
+            name = match.group(1)
+            classes.append(CodeEntity(
+                name=name,
+                type="class",
+                start_line=start_line,
+                end_line=start_line,
+            ))
+        # Extract methods. Group 1 is the word in front of the name (normally the
+        # return type, or a modifier for constructors), group 2 is the name.
+        method_pattern = re.compile(
+            r"(?:public|private|protected)?\s*(?:static\s+)?(\w+)\s+(\w+)\s*\(([^)]*)\)"
+        )
+        # Names that are control-flow statements rather than methods.
+        control_names = ("if", "while", "for", "switch", "catch")
+        # Leading words that mark an expression/statement, not a declaration.
+        statement_words = ("return", "new", "throw", "else", "case", "assert", "do", "yield", "instanceof")
+        for match in method_pattern.finditer(content):
+            leading_word = match.group(1)
+            name = match.group(2)
+            if name in control_names or leading_word in statement_words:
+                continue
+            start_line = content[:match.start()].count("\n") + 1
+            functions.append(CodeEntity(
+                name=name,
+                type="method",
+                start_line=start_line,
+                end_line=start_line,
+            ))
+        return FileInfo(
+            path=file_path,
+            language="java",
+            imports=imports,
+            functions=functions,
+            classes=classes,
+            line_count=len(lines),
+        )

codetree/retriever.py ADDED Viewed

@@ -0,0 +1,192 @@
+"""Reasoning-based code retriever."""
+import json
+from pathlib import Path
+from typing import Optional
+from .config import Config
+from .indexer import CodeIndex, TreeNode
+from .llm import create_llm_client, LLMClient
+# System prompt sent by CodeRetriever.retrieve together with the compact repository tree.
+# The JSON shape it requests is what the retriever reads back: retrieve() takes the
+# "relevant_files" list and query() reads each entry's "path" and "focus" keys.
+RETRIEVAL_SYSTEM_PROMPT = """You are a code navigation expert. Your task is to analyze a code repository structure and identify the most relevant files and code sections to answer a user's question.
+You will be given:
+1. A tree structure of the codebase showing directories, files, functions, and classes
+2. A user's question about the code
+Your job is to:
+1. Analyze the question to understand what the user is looking for
+2. Navigate the tree structure using your reasoning
+3. Identify the most relevant files and specific functions/classes
+4. Return a JSON list of file paths that should be examined
+Think step by step:
+- What concepts does the question involve? (authentication, database, API, etc.)
+- Which directories/modules are likely to contain relevant code?
+- Which specific files have functions or classes related to the question?
+Return your answer as JSON in this format:
+{
+  "reasoning": "Brief explanation of your navigation logic",
+  "relevant_files": [
+    {"path": "path/to/file.py", "relevance": "why this file is relevant", "focus": ["function_name", "ClassName"]}
+  ]
+}
+Only include files that are truly relevant. Aim for 1-5 files maximum."""
+# System prompt sent by CodeRetriever.query for the final answer; the user message
+# that accompanies it carries the retrieved file contents followed by the question.
+ANSWER_SYSTEM_PROMPT = """You are a helpful code assistant. You have been given relevant code sections from a repository to answer a user's question.
+Guidelines:
+- Answer the question directly and concisely
+- Reference specific code sections when relevant
+- Include code snippets if they help explain the answer
+- If the provided code doesn't fully answer the question, say so
+- Use markdown formatting for code blocks"""
+class CodeRetriever:
+    """Retrieves relevant code using LLM reasoning."""
+    def __init__(self, index: CodeIndex, config: Optional[Config] = None):
+        """Bind the retriever to a code index and set up its LLM client.
+        Args:
+            index: Index whose tree is searched; its ``repo_path`` becomes the
+                base directory for reading files.
+            config: Configuration to use. When falsy, ``Config.load()`` supplies it.
+        """
+        self.index = index
+        effective_config = config if config else Config.load()
+        self.config = effective_config
+        self.llm = create_llm_client(effective_config.llm)
+        self.repo_path = Path(index.repo_path)
+    def retrieve(self, query: str, max_files: int = 5) -> list[dict]:
+        """Retrieve relevant files for a query using LLM reasoning.
+        The compact index tree and the question are sent to the LLM, and the JSON
+        object found between the first ``{`` and the last ``}`` of the reply is
+        parsed. Malformed replies never raise: anything that is not a JSON object
+        with a list under ``"relevant_files"`` yields ``[]``, and list entries
+        that are not objects are dropped (callers use ``.get`` on each entry).
+        Args:
+            query: The user's question.
+            max_files: Maximum number of entries to return.
+        Returns:
+            Up to ``max_files`` dicts as produced by the LLM.
+        """
+        # Get compact tree representation
+        tree_str = self.index.get_compact_tree(max_depth=4)
+        # Ask LLM to identify relevant files
+        messages = [
+            {"role": "system", "content": RETRIEVAL_SYSTEM_PROMPT},
+            {"role": "user", "content": f"""## Repository Structure
+{tree_str}
+## Question
+{query}
+Analyze the repository structure and identify the most relevant files to answer this question. Return JSON."""}
+        ]
+        response = self.llm.chat(messages)
+        # The reply may wrap the JSON in prose or code fences, so cut out the
+        # outermost brace-delimited span before parsing.
+        json_start = response.find("{")
+        json_end = response.rfind("}") + 1
+        if json_start < 0 or json_end <= json_start:
+            return []
+        try:
+            result = json.loads(response[json_start:json_end])
+        except json.JSONDecodeError:
+            return []
+        if not isinstance(result, dict):
+            return []
+        candidates = result.get("relevant_files")
+        # A null or non-list value would otherwise fail on slicing.
+        if not isinstance(candidates, list):
+            return []
+        usable = [entry for entry in candidates if isinstance(entry, dict)]
+        return usable[:max_files]
+    def get_file_content(self, file_path: str, focus: Optional[list[str]] = None) -> Optional[str]:
+        """Get content of a file, optionally focusing on specific functions/classes.
+        ``file_path`` typically comes from LLM output, so it is treated as
+        untrusted: it is resolved against the repository root and rejected when
+        the result lies outside that root (absolute paths, ``..`` segments, or
+        symlinks that escape the repository).
+        Args:
+            file_path: Path of the file relative to the repository root.
+            focus: Names of functions/classes of interest. Focus extraction is not
+                implemented yet; a non-empty value only disables truncation.
+        Returns:
+            The file text, cut to the first 200 lines plus a "more lines" marker
+            when no focus is given, or ``None`` when the path is invalid, outside
+            the repository, not a regular file, or not readable as UTF-8.
+        """
+        if not isinstance(file_path, str) or not file_path:
+            return None
+        try:
+            repo_root = self.repo_path.resolve()
+            full_path = (repo_root / file_path).resolve()
+        except (OSError, RuntimeError, ValueError):
+            # e.g. symlink loops or embedded NUL characters in the path
+            return None
+        if not full_path.is_relative_to(repo_root):
+            return None
+        if not full_path.is_file():
+            return None
+        try:
+            content = full_path.read_text(encoding="utf-8")
+        except (UnicodeDecodeError, IOError):
+            return None
+        # If no focus specified, return full content (truncated)
+        if not focus:
+            lines = content.split("\n")
+            if len(lines) > 200:
+                return "\n".join(lines[:200]) + f"\n\n... ({len(lines) - 200} more lines)"
+            return content
+        # TODO: Extract only focused sections
+        # For now, return full content
+        return content
+    def query(self, question: str) -> str:
+        """Answer a question about the codebase.
+        Runs ``retrieve`` to pick files, loads each file with ``get_file_content``
+        and asks the LLM to answer from those contents.
+        Args:
+            question: The user's question.
+        Returns:
+            The LLM's answer, or a fixed explanatory message when no files were
+            selected or none of the selected files could be read.
+        """
+        # Step 1: Retrieve relevant files
+        hits = self.retrieve(question)
+        if not hits:
+            return "I couldn't identify any relevant files for your question. Please try rephrasing or being more specific."
+        # Step 2: Get file contents, skipping files that cannot be loaded
+        sections = []
+        for hit in hits:
+            path = hit.get("path", "")
+            content = self.get_file_content(path, hit.get("focus", []))
+            if not content:
+                continue
+            sections.append(f"## File: {path}\n\n```\n{content}\n```")
+        if not sections:
+            return "I found relevant files but couldn't read their contents."
+        # Step 3: Generate answer
+        context = "\n\n".join(sections)
+        system_message = {"role": "system", "content": ANSWER_SYSTEM_PROMPT}
+        user_message = {"role": "user", "content": f"""## Relevant Code
+{context}
+## Question
+{question}
+Please answer the question based on the code provided above."""}
+        return self.llm.chat([system_message, user_message])
+    def find_references(self, symbol: str) -> list[dict]:
+        """List indexed functions, classes and imports that mention a symbol.
+        The index tree is walked depth-first from ``self.index.root``. For every
+        file node, a function or class is reported when ``symbol`` occurs in its
+        name, and an import when ``symbol`` occurs in the statement; matching is a
+        case-insensitive substring test. Only indexed data is searched, not file
+        contents.
+        Args:
+            symbol: Text to look for.
+        Returns:
+            Dicts with ``type`` and ``file`` plus ``name``/``line`` (functions and
+            classes) or ``statement`` (imports), in traversal order.
+        """
+        def file_nodes(node: TreeNode):
+            # Yield file nodes in depth-first order; children of a file node are not visited.
+            if node.type == "file":
+                yield node
+                return
+            for child in node.children:
+                yield from file_nodes(child)
+        def mentions(text: str) -> bool:
+            return symbol.lower() in text.lower()
+        references = []
+        for file_node in file_nodes(self.index.root):
+            # Functions and classes are reported the same way, functions first.
+            for kind, entries in (("function", file_node.functions), ("class", file_node.classes)):
+                for entry in entries:
+                    if mentions(entry.get("name", "")):
+                        references.append({
+                            "type": kind,
+                            "file": file_node.path,
+                            "name": entry["name"],
+                            "line": entry.get("line"),
+                        })
+            # Check imports
+            for statement in file_node.imports:
+                if mentions(statement):
+                    references.append({
+                        "type": "import",
+                        "file": file_node.path,
+                        "statement": statement,
+                    })
+        return references