codebase-cortex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. codebase_cortex/__init__.py +3 -0
  2. codebase_cortex/agents/__init__.py +0 -0
  3. codebase_cortex/agents/base.py +69 -0
  4. codebase_cortex/agents/code_analyzer.py +122 -0
  5. codebase_cortex/agents/doc_writer.py +356 -0
  6. codebase_cortex/agents/semantic_finder.py +64 -0
  7. codebase_cortex/agents/sprint_reporter.py +152 -0
  8. codebase_cortex/agents/task_creator.py +138 -0
  9. codebase_cortex/auth/__init__.py +0 -0
  10. codebase_cortex/auth/callback_server.py +80 -0
  11. codebase_cortex/auth/oauth.py +173 -0
  12. codebase_cortex/auth/token_store.py +90 -0
  13. codebase_cortex/cli.py +855 -0
  14. codebase_cortex/config.py +150 -0
  15. codebase_cortex/embeddings/__init__.py +0 -0
  16. codebase_cortex/embeddings/clustering.py +140 -0
  17. codebase_cortex/embeddings/indexer.py +208 -0
  18. codebase_cortex/embeddings/store.py +126 -0
  19. codebase_cortex/git/__init__.py +0 -0
  20. codebase_cortex/git/diff_parser.py +185 -0
  21. codebase_cortex/git/github_client.py +46 -0
  22. codebase_cortex/graph.py +111 -0
  23. codebase_cortex/mcp_client.py +94 -0
  24. codebase_cortex/notion/__init__.py +0 -0
  25. codebase_cortex/notion/bootstrap.py +298 -0
  26. codebase_cortex/notion/page_cache.py +107 -0
  27. codebase_cortex/state.py +77 -0
  28. codebase_cortex/utils/__init__.py +0 -0
  29. codebase_cortex/utils/json_parsing.py +59 -0
  30. codebase_cortex/utils/logging.py +62 -0
  31. codebase_cortex/utils/rate_limiter.py +56 -0
  32. codebase_cortex/utils/section_parser.py +139 -0
  33. codebase_cortex-0.1.0.dist-info/METADATA +209 -0
  34. codebase_cortex-0.1.0.dist-info/RECORD +37 -0
  35. codebase_cortex-0.1.0.dist-info/WHEEL +4 -0
  36. codebase_cortex-0.1.0.dist-info/entry_points.txt +3 -0
  37. codebase_cortex-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,3 @@
1
+ """Codebase Cortex - LangGraph multi-agent system for keeping docs in sync with code."""
2
+
3
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,69 @@
1
+ """Base agent with MCP tool access."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+
8
+ from langchain_core.language_models import BaseChatModel
9
+ from langchain_core.messages import BaseMessage
10
+
11
+ from codebase_cortex.state import CortexState
12
+
13
+
14
class BaseAgent(ABC):
    """Abstract parent of every Cortex agent.

    Carries the shared chat model and offers three helpers: invoking the
    LLM with debug logging, reading MCP tools out of graph state, and
    building an error list tagged with the agent's class name.
    """

    def __init__(self, llm: BaseChatModel) -> None:
        # All agents share one chat model and log through the "cortex" logger.
        self.llm = llm
        self._logger = logging.getLogger("cortex")

    @abstractmethod
    async def run(self, state: CortexState) -> dict:
        """Execute this agent's logic and return state updates."""
        ...

    async def _invoke_llm(self, messages: list[BaseMessage]) -> str:
        """Invoke the LLM with logging. Returns response content as text."""
        agent_name = type(self).__name__

        def _char_count(content) -> int:
            # Message content is either a plain string or a list of content
            # parts (dicts carrying a "text" key, or arbitrary objects).
            if isinstance(content, str):
                return len(content)
            return sum(
                len(p.get("text", "")) if isinstance(p, dict) else len(str(p))
                for p in content
            )

        total_chars = sum(_char_count(m.content) for m in messages)
        self._logger.debug(
            f"LLM CALL [{agent_name}]: {len(messages)} messages, {total_chars} chars"
        )
        for msg in messages:
            if isinstance(msg.content, str):
                preview = msg.content[:200]
            else:
                preview = str(msg.content)[:200]
            self._logger.debug(f" {msg.type}: {preview}...")

        response = await self.llm.ainvoke(messages)
        content = response.content

        # Some models (e.g. Gemini 3) return structured content blocks
        # instead of a plain string. Keep only text blocks (and stringified
        # non-dict parts), joined by newlines.
        if isinstance(content, list):
            pieces = []
            for part in content:
                if isinstance(part, dict):
                    if part.get("type") == "text":
                        pieces.append(part["text"])
                else:
                    pieces.append(str(part))
            content = "\n".join(pieces)

        self._logger.debug(
            f"LLM RESPONSE [{agent_name}]: {len(content)} chars — {content[:200]}..."
        )
        return content

    def _get_mcp_tools(self, state: CortexState) -> list:
        """Return the MCP tools stored in state (empty list when absent)."""
        return state.get("mcp_tools", [])

    def _append_error(self, state: CortexState, error: str) -> list[str]:
        """Return a copy of state's error list with *error* appended,
        prefixed by this agent's class name."""
        return [*state.get("errors", []), f"[{type(self).__name__}] {error}"]
@@ -0,0 +1,122 @@
1
+ """CodeAnalyzer agent — analyzes git diffs and identifies what changed and why."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from langchain_core.messages import HumanMessage, SystemMessage
6
+
7
+ from codebase_cortex.agents.base import BaseAgent
8
+ from codebase_cortex.git.diff_parser import get_recent_diff, get_full_codebase_summary, parse_diff
9
+ from codebase_cortex.state import CortexState
10
+
11
# System prompt for incremental runs: the LLM analyzes a single git diff
# (used by CodeAnalyzerAgent._run_diff).
DIFF_SYSTEM_PROMPT = """You are a senior software engineer analyzing code changes.
Given a git diff, provide a clear, structured analysis covering:

1. **Summary**: One-paragraph overview of what changed and why.
2. **Changed Components**: List each file/module changed with a brief description.
3. **Impact Assessment**: What parts of the system are affected? Any breaking changes?
4. **Documentation Needs**: What documentation should be created or updated?

Be concise but thorough. Focus on the "why" behind changes, not just the "what".
If the diff is too large, focus on the most significant changes."""

# System prompt for full-scan runs: the LLM analyzes a whole-codebase summary
# to seed initial documentation (used by CodeAnalyzerAgent._run_full_scan).
FULL_SYSTEM_PROMPT = """You are a senior software engineer analyzing an entire codebase.
Given a summary of all source files, provide a comprehensive analysis covering:

1. **Project Overview**: What this project does, its purpose and architecture.
2. **Components**: List each major module/package with its responsibility.
3. **Key APIs and Interfaces**: Public functions, classes, endpoints, and contracts.
4. **Architecture**: How components relate to each other, data flow, dependencies.
5. **Documentation Needs**: What documentation pages should be created?

Be thorough — this is the initial documentation for a project that has none.
Focus on what a new developer would need to understand the codebase."""
33
+
34
+
35
class CodeAnalyzerAgent(BaseAgent):
    """Analyzes git diffs or full codebases to identify documentation needs.

    Dispatches on state["full_scan"]: a full-codebase summary pass for
    initial documentation, or an analysis of the most recent git diff.
    """

    async def run(self, state: CortexState) -> dict:
        """Route to the full-scan or diff analysis path and return state updates."""
        repo_path = state.get("repo_path", ".")
        if state.get("full_scan", False):
            return await self._run_full_scan(state, repo_path)
        return await self._run_diff(state, repo_path)

    async def _run_diff(self, state: CortexState, repo_path: str) -> dict:
        """Analyze the most recent git diff and return analysis + changed files."""
        diff_text = state.get("diff_text", "")
        if not diff_text:
            try:
                diff_text = get_recent_diff(repo_path)
            except Exception as e:
                return {"errors": self._append_error(state, f"Failed to get diff: {e}")}

        # No diff at all — nothing to analyze.
        if not diff_text:
            return {"analysis": "", "changed_files": []}

        changed_files = parse_diff(diff_text)

        # One bullet per changed file: path, status, and +/- line counts.
        file_summary = "\n".join(
            f"- {entry['path']} ({entry['status']}: +{entry['additions']}/-{entry['deletions']})"
            for entry in changed_files
        )

        prompt = f"""Analyze the following code changes:

## Files Changed
{file_summary}

## Full Diff
```
{diff_text[:15000]}
```"""

        try:
            analysis = await self._invoke_llm([
                SystemMessage(content=DIFF_SYSTEM_PROMPT),
                HumanMessage(content=prompt),
            ])
        except Exception as e:
            # Preserve the diff and parsed files even when the LLM call fails.
            return {
                "diff_text": diff_text,
                "changed_files": changed_files,
                "errors": self._append_error(state, f"LLM analysis failed: {e}"),
            }

        return {
            "diff_text": diff_text,
            "changed_files": changed_files,
            "analysis": analysis,
        }

    async def _run_full_scan(self, state: CortexState, repo_path: str) -> dict:
        """Analyze the entire codebase to seed initial documentation."""
        try:
            summary = get_full_codebase_summary(repo_path)
        except Exception as e:
            return {"errors": self._append_error(state, f"Failed to scan codebase: {e}")}

        if not summary:
            return {"analysis": "", "changed_files": []}

        prompt = f"""Analyze this entire codebase and produce a comprehensive analysis for documentation:

{summary}"""

        try:
            analysis = await self._invoke_llm([
                SystemMessage(content=FULL_SYSTEM_PROMPT),
                HumanMessage(content=prompt),
            ])
        except Exception as e:
            return {
                "errors": self._append_error(state, f"LLM analysis failed: {e}"),
            }

        # A full scan has no per-file diff, so changed_files stays empty.
        return {
            "analysis": analysis,
            "changed_files": [],
        }
@@ -0,0 +1,356 @@
1
+ """DocWriter agent — updates or creates Notion pages to reflect code changes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from langchain_core.messages import HumanMessage, SystemMessage
6
+
7
+ import re
8
+
9
+ from codebase_cortex.agents.base import BaseAgent
10
+ from codebase_cortex.config import Settings
11
+ from codebase_cortex.notion.bootstrap import extract_page_id
12
+ from codebase_cortex.notion.page_cache import PageCache
13
+ from codebase_cortex.state import CortexState, DocUpdate
14
+ from codebase_cortex.utils.json_parsing import parse_json_array
15
+ from codebase_cortex.utils.section_parser import merge_sections, parse_sections
16
+
17
+
18
+ def _unescape_notion_text(text: str) -> str:
19
+ """Convert literal escape sequences from Notion MCP responses to real characters.
20
+
21
+ The Notion MCP server returns page content with literal \\n and \\t
22
+ (two-character sequences) instead of real newline/tab characters.
23
+ This converts them back so markdown parsing works correctly.
24
+ """
25
+ # Replace literal \n and \t with real characters
26
+ # Use a single pass to handle \n and \t without touching \\n (escaped backslash + n)
27
+ result = []
28
+ i = 0
29
+ while i < len(text):
30
+ if text[i] == '\\' and i + 1 < len(text):
31
+ next_char = text[i + 1]
32
+ if next_char == 'n':
33
+ result.append('\n')
34
+ i += 2
35
+ continue
36
+ elif next_char == 't':
37
+ result.append('\t')
38
+ i += 2
39
+ continue
40
+ result.append(text[i])
41
+ i += 1
42
+ return ''.join(result)
43
+
44
+
45
def strip_notion_metadata(raw_text: str) -> str:
    """Extract just the page content from a notion-fetch response.

    The notion-fetch tool wraps the page markdown in XML-like metadata:

        Here is the result of "view" for the Page ...
        <page url="...">
          <ancestor-path>...</ancestor-path>
          <properties>...</properties>
          <content>... actual markdown content ...</content>
        </page>

    Returns the text between <content> tags when present; otherwise falls
    back to stripping the header line, the XML-like wrapper tags, and any
    JSON property lines.
    """
    # The MCP server sends literal \n / \t sequences — restore real
    # newlines/tabs before any regex work.
    text = _unescape_notion_text(raw_text)

    inner = re.search(r"<content>\s*(.*?)\s*</content>", text, re.DOTALL)
    if inner is not None:
        return inner.group(1).strip()

    # Fallback: peel metadata away piece by piece.
    # 1) Drop the leading 'Here is the result of "view"...' line.
    stripped = re.sub(r'^Here is the result of "view".*?\n', "", text)
    # 2) Drop the XML-like wrapper tags themselves.
    stripped = re.sub(
        r"</?(?:page|ancestor-path|parent-page|properties|content)[^>]*>",
        "",
        stripped,
    )
    # 3) Drop JSON property lines such as {"title":"..."}.
    stripped = re.sub(r'^\s*\{.*?"title".*?\}\s*$', "", stripped, flags=re.MULTILINE)
    # 4) Collapse runs of 3+ newlines left behind by the removals.
    stripped = re.sub(r"\n{3,}", "\n\n", stripped)
    return stripped.strip()
83
+
84
# System prompt for DocWriterAgent: instructs the LLM to emit a JSON array of
# page updates — section-level "update" entries merged locally by
# section_parser, or full-content "create" entries for new pages.
SYSTEM_PROMPT = """You are a technical documentation writer. Given a code analysis
and related existing documentation, generate documentation updates for a Notion workspace.

Output a JSON array of page updates. Each element has:
- "title": Page title (must match an existing page title when updating)
- "action": "update" or "create"

For "update" actions (modifying an existing page):
- Include "section_updates": a JSON array of ONLY the sections that changed.
- Each section update has:
  - "heading": The exact markdown heading (e.g., "## API Endpoints", "### Authentication")
  - "content": The new content for that section (everything below the heading until the next heading)
  - "action": "update" to replace an existing section, or "create" to add a new section
- Do NOT include sections that haven't changed.
- Match headings exactly to existing page headings (case-insensitive matching is applied).

For "create" actions (new page):
- Include "content": Full markdown content for the new page.
- Do NOT include "section_updates".

Focus on:
- Architecture decisions and component relationships
- API contracts and interfaces
- How components interact
- Breaking changes and migration notes

Keep content concise and actionable. Use markdown headings, lists, and code blocks."""
111
+
112
+
113
class DocWriterAgent(BaseAgent):
    """Writes and updates documentation in Notion via MCP tools.

    Uses LLM to generate documentation content based on code analysis
    and related docs, then writes to Notion via MCP (or logs in dry_run mode).

    Flow: fetch current page contents from Notion -> prompt the LLM for a
    JSON array of page updates -> merge section-level edits into existing
    content locally -> push results to Notion (skipped when dry_run).
    """

    async def run(self, state: CortexState) -> dict:
        """Generate doc updates from state's analysis and write them to Notion.

        Returns a state update with "doc_updates" (list of DocUpdate), plus
        "errors" when generation fails. An empty analysis short-circuits to
        no updates at all.
        """
        analysis = state.get("analysis", "")
        if not analysis:
            return {"doc_updates": []}

        related_docs = state.get("related_docs", [])
        dry_run = state.get("dry_run", False)

        settings = Settings.from_env()
        cache = PageCache(cache_path=settings.page_cache_path)

        # Step 1: Fetch existing content from all Notion doc pages
        existing_pages = await self._fetch_existing_pages(settings, cache)

        # Build context from related code chunks (actual content, not just titles).
        # Capped at 5 chunks / 1500 chars each to bound prompt size.
        related_context = ""
        if related_docs:
            related_context = "\n\n## Related Code\n"
            for doc in related_docs[:5]:
                related_context += f"\n### {doc['title']} (similarity: {doc['similarity']:.2f})\n"
                if doc.get("content"):
                    related_context += f"```\n{doc['content'][:1500]}\n```\n"

        # Build existing page content section for the LLM.
        # Show section structure so the LLM knows which headings exist;
        # each page body is truncated to 3000 chars.
        existing_content_section = ""
        if existing_pages:
            existing_content_section = "\n\n## Current Page Contents\n"
            for title, content in existing_pages.items():
                truncated = content[:3000] + ("..." if len(content) > 3000 else "")
                existing_content_section += f"\n### {title}\n```\n{truncated}\n```\n"

        # Build dynamic page list from cache (doc pages only, no infrastructure pages)
        doc_pages = cache.find_all_doc_pages()
        page_list = "\n".join(f"- {p.title}" for p in doc_pages) if doc_pages else "- (no pages yet)"

        # Ask LLM to generate doc updates as a JSON array (see SYSTEM_PROMPT)
        prompt = f"""Based on this code analysis, determine what documentation should be updated or created.

## Code Analysis
{analysis}
{related_context}
{existing_content_section}

## Available Pages in Notion
{page_list}

Generate documentation updates as a JSON array.
For "update" actions: include "title", "action", and "section_updates" (array of sections to change).
Each section_update has "heading" (e.g. "## API Endpoints"), "content" (new content for that section), and "action" ("update" or "create").
Only include sections that actually changed — unchanged sections will be preserved automatically.
For "create" actions: include "title", "action", and "content" (full markdown for new page).
Only include pages that genuinely need updating. Respond with ONLY the JSON array."""

        try:
            messages = [
                SystemMessage(content=SYSTEM_PROMPT),
                HumanMessage(content=prompt),
            ]
            raw = await self._invoke_llm(messages)

            # parse_json_array tolerates surrounding prose/code fences
            # around the model's JSON output.
            updates_data = parse_json_array(raw)

        except Exception as e:
            return {
                "doc_updates": [],
                "errors": self._append_error(state, f"Doc generation failed: {e}"),
            }

        doc_updates: list[DocUpdate] = []

        for update in updates_data:
            title = update.get("title", "Untitled")
            action = update.get("action", "update")

            # Look up existing page ID from cache; None means a new page
            # will be created later in _write_to_notion.
            cached = cache.find_by_title(title)
            page_id = cached.page_id if cached else None

            if action == "update" and title in existing_pages:
                # Section-level merge for existing pages
                section_updates = update.get("section_updates")
                if section_updates:
                    # New format: merge only changed sections into the
                    # page's current content; unchanged sections survive.
                    existing_sections = parse_sections(existing_pages[title])
                    content = merge_sections(existing_sections, section_updates)
                elif update.get("content"):
                    # Backward compatibility: LLM returned full content
                    content = update["content"]
                else:
                    # Update with neither sections nor content — skip it.
                    continue
            else:
                # New page or page not in existing_pages
                content = update.get("content", "")
                if not content:
                    continue

            doc_updates.append(DocUpdate(
                page_id=page_id,
                title=title,
                content=content,
                action=action,
            ))

        # Write to Notion (unless dry_run)
        if not dry_run and doc_updates:
            await self._write_to_notion(doc_updates, cache, state)

        return {"doc_updates": doc_updates}

    async def _fetch_existing_pages(
        self, settings: Settings, cache: PageCache
    ) -> dict[str, str]:
        """Fetch current content of all doc pages from Notion.

        Also syncs page titles back to cache (detects renames).

        Returns a mapping of cached page title -> stripped page content.
        Fetch failures are logged and skipped, never raised — the caller
        can proceed with whatever pages were retrieved.
        """
        # Imported locally to avoid import cycles at module load time.
        from codebase_cortex.mcp_client import notion_mcp_session, rate_limiter
        from codebase_cortex.utils.logging import get_logger

        logger = get_logger()
        existing: dict[str, str] = {}

        # Fetch all doc pages (skip infrastructure-only pages)
        doc_pages = cache.find_all_doc_pages()
        # Limit to 10 pages to avoid excessive API calls
        pages_to_fetch = doc_pages[:10]

        if not pages_to_fetch:
            return existing

        try:
            async with notion_mcp_session(settings) as session:
                for cached_page in pages_to_fetch:
                    # Shared rate limiter throttles Notion MCP calls.
                    await rate_limiter.acquire()
                    try:
                        result = await session.call_tool(
                            "notion-fetch",
                            arguments={"id": cached_page.page_id},
                        )
                        if not result.isError and result.content:
                            raw = result.content[0].text
                            content = strip_notion_metadata(raw)
                            existing[cached_page.title] = content

                            # Sync title back from Notion (detect renames):
                            # extract the actual title from the raw response
                            title_match = re.search(
                                r'"title"\s*:\s*"([^"]+)"', raw
                            )
                            if title_match:
                                actual_title = title_match.group(1)
                                # NOTE(review): reaches into PageCache's
                                # private _normalize_title — consider a
                                # public comparison helper on PageCache.
                                normalized_actual = cache._normalize_title(actual_title)
                                normalized_cached = cache._normalize_title(cached_page.title)
                                if normalized_actual != normalized_cached and normalized_actual:
                                    logger.info(
                                        f"Page renamed: '{cached_page.title}' → '{actual_title}'"
                                    )
                                    cache.upsert(
                                        cached_page.page_id, actual_title
                                    )
                    except Exception as e:
                        # Best-effort per page: log and continue with the rest.
                        logger.warning(f"Could not fetch {cached_page.title}: {e}")
        except Exception as e:
            # Session-level failure (e.g. MCP server unreachable) — return
            # whatever was collected before the failure.
            logger.warning(f"Could not fetch existing pages: {e}")

        return existing

    async def _write_to_notion(
        self,
        updates: list[DocUpdate],
        cache: PageCache,
        state: CortexState,
    ) -> None:
        """Write documentation updates to Notion via MCP.

        Updates with a cached page_id replace that page's content; the rest
        are created as children of the "Codebase Cortex" parent page (when
        cached). Failures are logged, not raised.
        """
        # Imported locally to avoid import cycles at module load time.
        from codebase_cortex.mcp_client import notion_mcp_session, rate_limiter
        from codebase_cortex.config import Settings
        from codebase_cortex.utils.logging import get_logger

        logger = get_logger()
        settings = Settings.from_env()

        # Get parent page for new pages
        parent_page = cache.find_by_title("Codebase Cortex")
        parent_id = parent_page.page_id if parent_page else None

        try:
            async with notion_mcp_session(settings) as session:
                for update in updates:
                    await rate_limiter.acquire()

                    page_id = update["page_id"]
                    # Only update pages we already track in the cache.
                    # Never search the whole workspace — that risks
                    # overwriting unrelated user pages.

                    if page_id:
                        # Content already merged locally via section_parser
                        await session.call_tool(
                            "notion-update-page",
                            arguments={
                                "page_id": page_id,
                                "command": "replace_content",
                                "new_str": update["content"],
                            },
                        )
                        # Mark as written with a content hash so first-run detection works.
                        # md5 is used as a cheap change fingerprint, not for security.
                        import hashlib
                        content_hash = hashlib.md5(update["content"].encode()).hexdigest()[:8]
                        cache.upsert(page_id, update["title"], content_hash=content_hash)
                        logger.info(f"Updated: {update['title']}")
                    else:
                        # Create new page under parent
                        create_args: dict = {
                            "pages": [
                                {
                                    "properties": {"title": update["title"]},
                                    "content": update["content"],
                                }
                            ],
                        }
                        if parent_id:
                            create_args["parent"] = {"page_id": parent_id}

                        result = await session.call_tool(
                            "notion-create-pages",
                            arguments=create_args,
                        )
                        # Record the new page so later runs update instead
                        # of re-creating it.
                        new_page_id = extract_page_id(result)
                        if new_page_id:
                            import hashlib
                            content_hash = hashlib.md5(update["content"].encode()).hexdigest()[:8]
                            cache.upsert(new_page_id, update["title"], content_hash=content_hash)
                        logger.info(f"Created: {update['title']}")

        except Exception as e:
            logger.error(f"Failed to write docs to Notion: {e}")
@@ -0,0 +1,64 @@
1
+ """SemanticFinder agent — finds related docs via FAISS embedding similarity."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from codebase_cortex.agents.base import BaseAgent
8
+ from codebase_cortex.config import Settings
9
+ from codebase_cortex.embeddings.indexer import EmbeddingIndexer
10
+ from codebase_cortex.embeddings.store import FAISSStore
11
+ from codebase_cortex.state import CortexState, RelatedDoc
12
+
13
+
14
class SemanticFinderAgent(BaseAgent):
    """Finds semantically related code chunks using FAISS embeddings.

    Embeds the analysis text produced by CodeAnalyzer, queries a freshly
    rebuilt FAISS index of the repository's code chunks, and returns the
    top matches as RelatedDoc entries.
    """

    async def run(self, state: CortexState) -> dict:
        """Return related code chunks for state's analysis (empty when none)."""
        analysis = state.get("analysis", "")
        if not analysis:
            return {"related_docs": []}

        repo_root = Path(state.get("repo_path", "."))
        settings = Settings.from_env(repo_root)
        index_dir = settings.faiss_index_dir
        try:
            # Always rebuild the index so new or changed files are captured.
            indexer = EmbeddingIndexer(repo_path=repo_root)
            code_chunks = indexer.collect_chunks()
            if not code_chunks:
                return {"related_docs": []}

            store = FAISSStore(index_dir=index_dir)
            vectors = indexer.embed_chunks(code_chunks)
            store.build(vectors, code_chunks)
            store.save()

            # Embed the analysis text as the search query.
            query_vectors = indexer.embed_texts([analysis])
            if query_vectors.size == 0:
                return {"related_docs": []}

            matches = store.search(query_vectors[0], k=10)

            # Chunk content is capped at 2000 chars to bound downstream prompts.
            related_docs: list[RelatedDoc] = [
                RelatedDoc(
                    page_id=match.chunk.file_path,
                    title=f"{match.chunk.name} ({match.chunk.file_path})",
                    similarity=match.score,
                    content=match.chunk.content[:2000],
                )
                for match in matches
            ]

            return {"related_docs": related_docs}

        except Exception as e:
            # Semantic search is best-effort: report the failure in errors
            # and let the pipeline continue without related docs.
            return {
                "related_docs": [],
                "errors": self._append_error(state, f"Semantic search failed: {e}"),
            }
64
+