codebase-cortex 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_cortex/__init__.py +3 -0
- codebase_cortex/agents/__init__.py +0 -0
- codebase_cortex/agents/base.py +69 -0
- codebase_cortex/agents/code_analyzer.py +122 -0
- codebase_cortex/agents/doc_writer.py +356 -0
- codebase_cortex/agents/semantic_finder.py +64 -0
- codebase_cortex/agents/sprint_reporter.py +152 -0
- codebase_cortex/agents/task_creator.py +138 -0
- codebase_cortex/auth/__init__.py +0 -0
- codebase_cortex/auth/callback_server.py +80 -0
- codebase_cortex/auth/oauth.py +173 -0
- codebase_cortex/auth/token_store.py +90 -0
- codebase_cortex/cli.py +855 -0
- codebase_cortex/config.py +150 -0
- codebase_cortex/embeddings/__init__.py +0 -0
- codebase_cortex/embeddings/clustering.py +140 -0
- codebase_cortex/embeddings/indexer.py +208 -0
- codebase_cortex/embeddings/store.py +126 -0
- codebase_cortex/git/__init__.py +0 -0
- codebase_cortex/git/diff_parser.py +185 -0
- codebase_cortex/git/github_client.py +46 -0
- codebase_cortex/graph.py +111 -0
- codebase_cortex/mcp_client.py +94 -0
- codebase_cortex/notion/__init__.py +0 -0
- codebase_cortex/notion/bootstrap.py +298 -0
- codebase_cortex/notion/page_cache.py +107 -0
- codebase_cortex/state.py +77 -0
- codebase_cortex/utils/__init__.py +0 -0
- codebase_cortex/utils/json_parsing.py +59 -0
- codebase_cortex/utils/logging.py +62 -0
- codebase_cortex/utils/rate_limiter.py +56 -0
- codebase_cortex/utils/section_parser.py +139 -0
- codebase_cortex-0.1.0.dist-info/METADATA +209 -0
- codebase_cortex-0.1.0.dist-info/RECORD +37 -0
- codebase_cortex-0.1.0.dist-info/WHEEL +4 -0
- codebase_cortex-0.1.0.dist-info/entry_points.txt +3 -0
- codebase_cortex-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
"""Bootstrap starter Notion pages via MCP on first run."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from mcp import ClientSession
|
|
6
|
+
|
|
7
|
+
from codebase_cortex.config import Settings
|
|
8
|
+
from codebase_cortex.notion.page_cache import PageCache
|
|
9
|
+
from codebase_cortex.utils.logging import get_logger
|
|
10
|
+
|
|
11
|
+
# Shared module logger, configured on first use.
logger = get_logger()

# Title stored in the local page cache for the parent hub page.
PARENT_PAGE_TITLE = "Codebase Cortex"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def normalize_page_id(raw_id: str) -> str:
    """Normalize a Notion page ID to dashed UUID format.

    Notion URLs use dashless IDs, but our cache stores dashed format.
    Normalizing here keeps lookups consistent. IDs that are not 32 hex
    characters after stripping dashes are returned unchanged.
    """
    compact = raw_id.replace("-", "").lower()
    if len(compact) != 32:
        return raw_id
    groups = (compact[:8], compact[8:12], compact[12:16], compact[16:20], compact[20:])
    return "-".join(groups)
|
|
26
|
+
|
|
27
|
+
# Child pages created under the parent hub page on first run.
# The "icon" is prepended to the "title" to form the Notion display title;
# the "description" becomes the page's initial body text.
STARTER_PAGES = [
    {
        "title": "Architecture Overview",
        "icon": "🏗️",
        "description": "System design, component relationships, and architectural decisions.",
    },
    {
        "title": "API Reference",
        "icon": "📡",
        "description": "Endpoints, schemas, contracts, and integration points.",
    },
    {
        "title": "Sprint Log",
        "icon": "📋",
        "description": "Weekly auto-generated summaries of code changes and documentation updates.",
    },
    {
        "title": "Task Board",
        "icon": "✅",
        "description": "Undocumented areas, documentation debt, and improvement tasks.",
    },
]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def extract_page_id(result) -> str | None:
|
|
52
|
+
"""Extract a page ID from an MCP CallToolResult.
|
|
53
|
+
|
|
54
|
+
The response text typically contains markdown with page URLs.
|
|
55
|
+
We look for a UUID pattern which is the page ID.
|
|
56
|
+
"""
|
|
57
|
+
import re
|
|
58
|
+
|
|
59
|
+
if result.isError:
|
|
60
|
+
return None
|
|
61
|
+
|
|
62
|
+
if not result.content:
|
|
63
|
+
return None
|
|
64
|
+
|
|
65
|
+
text = result.content[0].text
|
|
66
|
+
|
|
67
|
+
# Look for UUID pattern (with or without dashes)
|
|
68
|
+
uuid_pattern = r"[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}"
|
|
69
|
+
match = re.search(uuid_pattern, text, re.IGNORECASE)
|
|
70
|
+
if match:
|
|
71
|
+
return normalize_page_id(match.group(0))
|
|
72
|
+
|
|
73
|
+
return text
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def search_page_by_title(session: ClientSession, title: str) -> str | None:
    """Search Notion for a page by title, return page_id if found.

    Args:
        session: Active MCP client session.
        title: Page title to look for.

    Returns:
        The dashed page ID when the search results mention *title* and
        contain a UUID, otherwise None. Lookup is best-effort: any
        transport/tool error also yields None.
    """
    from codebase_cortex.utils.rate_limiter import NotionRateLimiter

    rate_limiter = NotionRateLimiter()
    # Notion search has a stricter rate limit than general calls; drain the
    # dedicated search bucket too (the general-only acquire used previously
    # under-throttled search requests).
    await rate_limiter.acquire(is_search=True)

    try:
        result = await session.call_tool(
            "notion-search",
            arguments={"query": title},
        )
        if result.isError or not result.content:
            return None

        # The search result text contains page info with IDs
        import re
        text = result.content[0].text

        # Notion search returns markdown with page URLs/IDs; accept the
        # first UUID only when the requested title actually appears in the
        # results, to avoid matching an unrelated page.
        uuid_pattern = r"[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}"
        match = re.search(uuid_pattern, text, re.IGNORECASE)
        if match and title.lower() in text.lower():
            return normalize_page_id(match.group(0))

        return None
    except Exception:
        # Best-effort: any failure means "not found".
        return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
async def discover_child_pages(settings: Settings) -> int:
    """Discover child pages under the parent Notion page and cache them.

    Fetches the parent page via MCP, extracts child page references
    from the content, and caches any pages not already tracked.

    Args:
        settings: Application settings (page cache path, MCP configuration).

    Returns:
        The number of newly discovered pages. Returns 0 when the parent
        page is not cached yet or the fetch fails.
    """
    import re
    from codebase_cortex.mcp_client import notion_mcp_session
    from codebase_cortex.utils.rate_limiter import NotionRateLimiter
    from codebase_cortex.notion.page_cache import PageCache

    cache = PageCache(cache_path=settings.page_cache_path)
    # Use the module constant instead of re-spelling the title literal.
    parent_page = cache.find_by_title(PARENT_PAGE_TITLE)
    if not parent_page:
        return 0

    rate_limiter = NotionRateLimiter()
    discovered = 0

    try:
        async with notion_mcp_session(settings) as session:
            await rate_limiter.acquire()
            result = await session.call_tool(
                "notion-fetch",
                arguments={"id": parent_page.page_id},
            )

            if result.isError or not result.content:
                return 0

            response_text = result.content[0].text

            # Extract content section (child pages are referenced there)
            content_match = re.search(
                r"<content>\s*(.*?)\s*</content>",
                response_text,
                re.DOTALL,
            )
            content = content_match.group(1) if content_match else response_text

            # Find all UUID patterns in the content (child page references)
            uuid_pattern = r"[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}"
            raw_ids = re.findall(uuid_pattern, content, re.IGNORECASE)
            found_ids = {normalize_page_id(pid) for pid in raw_ids}

            # Remove parent's own ID
            found_ids.discard(parent_page.page_id)

            # Filter to only truly new pages (not already cached)
            new_ids = [pid for pid in found_ids if not cache.get(pid)]

            for page_id in new_ids:
                await rate_limiter.acquire()
                try:
                    fetch_result = await session.call_tool(
                        "notion-fetch",
                        arguments={"id": page_id},
                    )
                    if not fetch_result.isError and fetch_result.content:
                        page_text = fetch_result.content[0].text
                        title_match = re.search(
                            r'"title"\s*:\s*"([^"]+)"', page_text
                        )
                        if title_match:
                            title = title_match.group(1)
                            cache.upsert(page_id, title)
                            discovered += 1
                            logger.info(f"Discovered child page: {title}")
                except Exception as exc:
                    # Best-effort per page, but leave a trace instead of
                    # silently swallowing the failure.
                    logger.debug(f"Skipping child page {page_id}: {exc}")

    except Exception as e:
        logger.warning(f"Child page discovery failed: {e}")

    return discovered
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
async def bootstrap_notion_pages(settings: Settings) -> list[dict]:
    """Create the starter Notion pages via MCP tools.

    Creates a parent "Codebase Cortex" page, then child pages under it.
    Searches for existing pages first to avoid duplicates.
    Seeds the page cache with all created/found pages.

    Args:
        settings: Application settings with Notion token path.

    Returns:
        List of page info dicts with page_id and title.
    """
    from codebase_cortex.mcp_client import notion_mcp_session
    from codebase_cortex.utils.rate_limiter import NotionRateLimiter

    rate_limiter = NotionRateLimiter()
    cache = PageCache(cache_path=settings.page_cache_path)
    pages = []

    repo_name = settings.repo_path.name
    parent_title = repo_name

    async with notion_mcp_session(settings) as session:
        # Step 1: Search for an existing parent page. The parent is created
        # in Notion under the repo name but cached as PARENT_PAGE_TITLE, so
        # check both titles — searching only PARENT_PAGE_TITLE never found
        # parents created on earlier runs and duplicated them.
        parent_id = await search_page_by_title(session, PARENT_PAGE_TITLE)
        if not parent_id and parent_title != PARENT_PAGE_TITLE:
            parent_id = await search_page_by_title(session, parent_title)

        # Step 2: Create parent page if not found
        if not parent_id:
            await rate_limiter.acquire()
            try:
                result = await session.call_tool(
                    "notion-create-pages",
                    arguments={
                        "pages": [
                            {
                                "properties": {"title": parent_title},
                                "content": (
                                    f"# {repo_name}\n\n"
                                    f"Auto-generated documentation hub for **{repo_name}**.\n\n"
                                    "Managed by [Codebase Cortex](https://github.com/sarupurisailalith/codebase-cortex)."
                                ),
                            }
                        ],
                    },
                )
                parent_id = extract_page_id(result)
                if parent_id:
                    cache.upsert(parent_id, PARENT_PAGE_TITLE)
                    logger.info(f"Created parent page: {parent_title}")
                else:
                    logger.error("Failed to extract parent page ID from response")
                    return []
            except Exception as e:
                logger.error(f"Failed to create parent page: {e}")
                return []
        else:
            cache.upsert(parent_id, PARENT_PAGE_TITLE)
            logger.info(f"Found existing parent page: {PARENT_PAGE_TITLE}")

        # Step 3: Create child pages under parent
        for page_info in STARTER_PAGES:
            title = page_info["title"]
            display_title = f"{page_info['icon']} {title}"

            # Check cache first, then search Notion
            cached = cache.find_by_title(title)
            if cached:
                pages.append({"title": title, "page_id": cached.page_id})
                logger.info(f"Already exists (cached): {display_title}")
                continue

            existing_id = await search_page_by_title(session, title)
            if existing_id:
                cache.upsert(existing_id, title)
                pages.append({"title": title, "page_id": existing_id})
                logger.info(f"Found existing: {display_title}")
                continue

            # Create new page under parent
            await rate_limiter.acquire()
            try:
                content = (
                    f"# {title}\n\n"
                    f"{page_info['description']}\n\n"
                    "---\n*Auto-generated by Codebase Cortex*"
                )
                result = await session.call_tool(
                    "notion-create-pages",
                    arguments={
                        "parent": {"page_id": parent_id},
                        "pages": [
                            {
                                "properties": {"title": display_title},
                                "content": content,
                            }
                        ],
                    },
                )

                page_id = extract_page_id(result)
                if page_id:
                    cache.upsert(page_id, title)
                    pages.append({"title": title, "page_id": page_id})
                    logger.info(f"Created: {display_title}")
                else:
                    logger.error(f"Failed to extract page ID for '{title}'")

            except Exception as e:
                logger.error(f"Failed to create page '{title}': {e}")

    return pages
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Local cache for Notion page metadata with staleness tracking."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
import unicodedata
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class CachedPage:
    """A cached Notion page entry."""

    page_id: str
    title: str
    last_synced: float  # Unix timestamp (time.time()) of the last sync
    content_hash: str = ""  # optional hash of the page content for change detection

    def is_stale(self, max_age: float = 3600.0) -> bool:
        """Check if the cache entry is older than max_age seconds."""
        return (time.time() - self.last_synced) > max_age


@dataclass
class PageCache:
    """In-memory cache of Notion pages, backed by a JSON file.

    The backing file is loaded eagerly on construction and rewritten on
    every upsert, so the on-disk state always mirrors memory.
    """

    cache_path: Path
    pages: dict[str, CachedPage] = field(default_factory=dict)

    def __post_init__(self) -> None:
        self._load()

    def _load(self) -> None:
        """Load entries from disk; a missing or corrupt file yields an empty cache."""
        if not self.cache_path.exists():
            return
        try:
            data = json.loads(self.cache_path.read_text())
            self.pages = {
                pid: CachedPage(**entry) for pid, entry in data.items()
            }
        except (json.JSONDecodeError, TypeError, AttributeError):
            # A corrupt or malformed cache file should not crash startup;
            # the cache rebuilds itself as pages are created/discovered.
            self.pages = {}

    def save(self) -> None:
        """Write all entries to the backing JSON file, creating parents as needed."""
        self.cache_path.parent.mkdir(parents=True, exist_ok=True)
        data = {
            pid: {
                "page_id": p.page_id,
                "title": p.title,
                "last_synced": p.last_synced,
                "content_hash": p.content_hash,
            }
            for pid, p in self.pages.items()
        }
        self.cache_path.write_text(json.dumps(data, indent=2))

    def upsert(self, page_id: str, title: str, content_hash: str = "") -> None:
        """Insert or replace an entry (stamping last_synced to now) and persist."""
        self.pages[page_id] = CachedPage(
            page_id=page_id,
            title=title,
            last_synced=time.time(),
            content_hash=content_hash,
        )
        self.save()

    def get(self, page_id: str) -> CachedPage | None:
        """Return the entry for page_id, or None if not cached."""
        return self.pages.get(page_id)

    def get_stale(self, max_age: float = 3600.0) -> list[CachedPage]:
        """Return all entries last synced more than max_age seconds ago."""
        return [p for p in self.pages.values() if p.is_stale(max_age)]

    @staticmethod
    def _normalize_title(title: str) -> str:
        """Strip emojis, special characters, and normalize whitespace for comparison."""
        # Remove characters in emoji-related Unicode categories (So = Symbol, other)
        cleaned = "".join(
            ch for ch in title
            if unicodedata.category(ch) not in ("So", "Sk", "Sc", "Sm")
        )
        # Remove non-alphanumeric characters (keep spaces)
        cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned)
        # Collapse whitespace and strip
        return re.sub(r"\s+", " ", cleaned).strip().lower()

    def find_by_title(self, title: str) -> CachedPage | None:
        """Find a page by title: exact match first, then fuzzy fallback."""
        for page in self.pages.values():
            if page.title == title:
                return page
        return self.find_by_title_fuzzy(title)

    def find_by_title_fuzzy(self, title: str) -> CachedPage | None:
        """Find the best matching page by normalized title comparison."""
        normalized = self._normalize_title(title)
        if not normalized:
            return None
        for page in self.pages.values():
            if self._normalize_title(page.title) == normalized:
                return page
        return None

    def find_all_doc_pages(self) -> list[CachedPage]:
        """Return all cached pages except the "Codebase Cortex" parent hub page."""
        return [
            p for p in self.pages.values()
            if p.title != "Codebase Cortex"
        ]
|
codebase_cortex/state.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""CortexState — shared state for the LangGraph pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TypedDict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FileChange(TypedDict):
    """A single file change extracted from a diff."""

    path: str  # file path relative to the repo root
    status: str  # "added" | "modified" | "deleted" | "renamed"
    additions: int  # number of added lines
    deletions: int  # number of deleted lines
    diff: str  # the diff text for this file


class DocUpdate(TypedDict):
    """A documentation update to apply in Notion."""

    page_id: str | None  # None = create new page
    title: str
    content: str
    action: str  # "create" | "update"


class TaskItem(TypedDict):
    """A task/ticket to create in Notion."""

    title: str
    description: str
    priority: str  # "high" | "medium" | "low"


class RelatedDoc(TypedDict, total=False):
    """A semantically related existing document."""

    page_id: str
    title: str
    similarity: float  # similarity score from the embedding search — scale not defined here
    content: str  # Code chunk content for LLM context


class CortexState(TypedDict, total=False):
    """Shared state flowing through the LangGraph pipeline.

    Fields are populated progressively by each agent node, so most keys
    are absent until the corresponding node has run (hence total=False).
    """

    # Input / trigger
    trigger: str  # "commit" | "pr" | "schedule" | "manual"
    repo_path: str
    dry_run: bool
    full_scan: bool  # True = analyze entire codebase, not just recent diff

    # Git data
    diff_text: str
    changed_files: list[FileChange]

    # CodeAnalyzer output
    analysis: str

    # SemanticFinder output
    related_docs: list[RelatedDoc]

    # DocWriter output
    doc_updates: list[DocUpdate]

    # TaskCreator output
    tasks_created: list[TaskItem]

    # SprintReporter output
    sprint_summary: str

    # Pipeline metadata
    errors: list[str]  # accumulated non-fatal error messages
    mcp_tools: list  # element type not declared in this module — see consumers
|
|
File without changes
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Robust JSON array parsing from LLM responses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_json_array(raw: str) -> list[dict]:
|
|
10
|
+
"""Extract a JSON array from an LLM response, handling common quirks.
|
|
11
|
+
|
|
12
|
+
Handles:
|
|
13
|
+
- Raw JSON arrays
|
|
14
|
+
- JSON wrapped in markdown code blocks (```json ... ```)
|
|
15
|
+
- Trailing commas
|
|
16
|
+
- Text before/after the JSON array
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
raw: Raw LLM response text.
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
Parsed list of dicts.
|
|
23
|
+
|
|
24
|
+
Raises:
|
|
25
|
+
ValueError: If no valid JSON array can be extracted.
|
|
26
|
+
"""
|
|
27
|
+
# Try direct parse first
|
|
28
|
+
text = raw.strip()
|
|
29
|
+
try:
|
|
30
|
+
result = json.loads(text)
|
|
31
|
+
if isinstance(result, list):
|
|
32
|
+
return result
|
|
33
|
+
except json.JSONDecodeError:
|
|
34
|
+
pass
|
|
35
|
+
|
|
36
|
+
# Extract from markdown code blocks
|
|
37
|
+
code_block_match = re.search(r"```(?:json)?\s*\n?(.*?)```", text, re.DOTALL)
|
|
38
|
+
if code_block_match:
|
|
39
|
+
try:
|
|
40
|
+
result = json.loads(code_block_match.group(1).strip())
|
|
41
|
+
if isinstance(result, list):
|
|
42
|
+
return result
|
|
43
|
+
except json.JSONDecodeError:
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
# Find the outermost [ ... ] in the response
|
|
47
|
+
bracket_match = re.search(r"\[.*\]", text, re.DOTALL)
|
|
48
|
+
if bracket_match:
|
|
49
|
+
candidate = bracket_match.group(0)
|
|
50
|
+
# Remove trailing commas before ] (common LLM mistake)
|
|
51
|
+
candidate = re.sub(r",\s*\]", "]", candidate)
|
|
52
|
+
try:
|
|
53
|
+
result = json.loads(candidate)
|
|
54
|
+
if isinstance(result, list):
|
|
55
|
+
return result
|
|
56
|
+
except json.JSONDecodeError:
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
raise ValueError(f"Could not extract JSON array from LLM response: {text[:200]}")
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Rich-based logging for Codebase Cortex."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.logging import RichHandler
|
|
10
|
+
|
|
11
|
+
console = Console()
|
|
12
|
+
|
|
13
|
+
# Module-level flag for verbose/debug mode
|
|
14
|
+
_verbose = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def setup_logging(level: int = logging.INFO, verbose: bool = False) -> logging.Logger:
    """Configure and return the application logger.

    Args:
        level: Base log level; forced to DEBUG when verbose is set.
        verbose: Enable debug output and, when a .cortex directory exists
            in the working directory, file logging to .cortex/debug.log.

    Returns:
        The configured "cortex" logger.
    """
    global _verbose
    _verbose = verbose

    if verbose:
        level = logging.DEBUG

    handler = RichHandler(
        console=console,
        show_path=False,
        markup=True,
    )
    handler.setFormatter(logging.Formatter("%(message)s"))

    logger = logging.getLogger("cortex")
    logger.setLevel(level)

    # Remove existing handlers to avoid duplicates on repeated setup calls
    logger.handlers.clear()
    logger.addHandler(handler)

    # Also log to .cortex/debug.log when verbose
    if verbose:
        cortex_dir = Path.cwd() / ".cortex"
        if cortex_dir.exists():
            # Explicit encoding so non-ASCII log messages survive regardless
            # of the platform's default locale encoding.
            file_handler = logging.FileHandler(
                cortex_dir / "debug.log", encoding="utf-8"
            )
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(
                logging.Formatter("%(asctime)s %(levelname)s %(message)s")
            )
            logger.addHandler(file_handler)

    return logger
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_logger() -> logging.Logger:
    """Return the shared "cortex" logger, configuring it on first use."""
    existing = logging.getLogger("cortex")
    if existing.handlers:
        return existing
    return setup_logging()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def is_verbose() -> bool:
    """Return True when verbose/debug mode was enabled via setup_logging()."""
    return _verbose
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Async token bucket rate limiter for Notion MCP API calls."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TokenBucket:
|
|
10
|
+
"""Token bucket rate limiter.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
rate: Number of tokens added per second.
|
|
14
|
+
capacity: Maximum tokens in the bucket.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def __init__(self, rate: float, capacity: int) -> None:
|
|
18
|
+
self.rate = rate
|
|
19
|
+
self.capacity = capacity
|
|
20
|
+
self._tokens = float(capacity)
|
|
21
|
+
self._last_refill = time.monotonic()
|
|
22
|
+
self._lock = asyncio.Lock()
|
|
23
|
+
|
|
24
|
+
def _refill(self) -> None:
|
|
25
|
+
now = time.monotonic()
|
|
26
|
+
elapsed = now - self._last_refill
|
|
27
|
+
self._tokens = min(self.capacity, self._tokens + elapsed * self.rate)
|
|
28
|
+
self._last_refill = now
|
|
29
|
+
|
|
30
|
+
async def acquire(self, tokens: int = 1) -> None:
|
|
31
|
+
"""Wait until the requested number of tokens are available."""
|
|
32
|
+
async with self._lock:
|
|
33
|
+
while True:
|
|
34
|
+
self._refill()
|
|
35
|
+
if self._tokens >= tokens:
|
|
36
|
+
self._tokens -= tokens
|
|
37
|
+
return
|
|
38
|
+
wait = (tokens - self._tokens) / self.rate
|
|
39
|
+
await asyncio.sleep(wait)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class NotionRateLimiter:
|
|
43
|
+
"""Dual token bucket for Notion MCP rate limits.
|
|
44
|
+
|
|
45
|
+
- General: 180 requests/minute (3/sec)
|
|
46
|
+
- Search: 30 requests/minute (0.5/sec)
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
def __init__(self) -> None:
|
|
50
|
+
self.general = TokenBucket(rate=3.0, capacity=180)
|
|
51
|
+
self.search = TokenBucket(rate=0.5, capacity=30)
|
|
52
|
+
|
|
53
|
+
async def acquire(self, is_search: bool = False) -> None:
|
|
54
|
+
await self.general.acquire()
|
|
55
|
+
if is_search:
|
|
56
|
+
await self.search.acquire()
|