codebase-cortex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. codebase_cortex/__init__.py +3 -0
  2. codebase_cortex/agents/__init__.py +0 -0
  3. codebase_cortex/agents/base.py +69 -0
  4. codebase_cortex/agents/code_analyzer.py +122 -0
  5. codebase_cortex/agents/doc_writer.py +356 -0
  6. codebase_cortex/agents/semantic_finder.py +64 -0
  7. codebase_cortex/agents/sprint_reporter.py +152 -0
  8. codebase_cortex/agents/task_creator.py +138 -0
  9. codebase_cortex/auth/__init__.py +0 -0
  10. codebase_cortex/auth/callback_server.py +80 -0
  11. codebase_cortex/auth/oauth.py +173 -0
  12. codebase_cortex/auth/token_store.py +90 -0
  13. codebase_cortex/cli.py +855 -0
  14. codebase_cortex/config.py +150 -0
  15. codebase_cortex/embeddings/__init__.py +0 -0
  16. codebase_cortex/embeddings/clustering.py +140 -0
  17. codebase_cortex/embeddings/indexer.py +208 -0
  18. codebase_cortex/embeddings/store.py +126 -0
  19. codebase_cortex/git/__init__.py +0 -0
  20. codebase_cortex/git/diff_parser.py +185 -0
  21. codebase_cortex/git/github_client.py +46 -0
  22. codebase_cortex/graph.py +111 -0
  23. codebase_cortex/mcp_client.py +94 -0
  24. codebase_cortex/notion/__init__.py +0 -0
  25. codebase_cortex/notion/bootstrap.py +298 -0
  26. codebase_cortex/notion/page_cache.py +107 -0
  27. codebase_cortex/state.py +77 -0
  28. codebase_cortex/utils/__init__.py +0 -0
  29. codebase_cortex/utils/json_parsing.py +59 -0
  30. codebase_cortex/utils/logging.py +62 -0
  31. codebase_cortex/utils/rate_limiter.py +56 -0
  32. codebase_cortex/utils/section_parser.py +139 -0
  33. codebase_cortex-0.1.0.dist-info/METADATA +209 -0
  34. codebase_cortex-0.1.0.dist-info/RECORD +37 -0
  35. codebase_cortex-0.1.0.dist-info/WHEEL +4 -0
  36. codebase_cortex-0.1.0.dist-info/entry_points.txt +3 -0
  37. codebase_cortex-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,185 @@
1
+ """Parse git diffs into structured data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+
8
+ from git import Repo
9
+
10
+ from codebase_cortex.state import FileChange
11
+
12
# File extensions considered source/documentation files worth summarising
# in get_full_codebase_summary.
CODE_EXTENSIONS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
    ".rb", ".php", ".c", ".cpp", ".h", ".hpp", ".cs", ".swift",
    ".kt", ".scala", ".sh", ".bash", ".yml", ".yaml", ".toml",
    ".json", ".md", ".rst", ".txt",
}
# Directory names pruned during the os.walk traversal: VCS metadata,
# virtualenvs, dependency/caches, and build artefacts.
SKIP_DIRS = {
    ".git", ".venv", "venv", "node_modules", "__pycache__",
    ".pytest_cache", "dist", "build", ".eggs", ".tox",
    ".mypy_cache", ".ruff_cache", ".cortex",
}
23
+
24
+
25
def get_recent_diff(repo_path: str, commits: int = 1) -> str:
    """Get the unified diff for the most recent commit(s).

    Args:
        repo_path: Path to the git repository.
        commits: Number of recent commits to include.

    Returns:
        Unified diff text, or empty string if the repository has no
        commits or HEAD is detached.
    """
    repo = Repo(repo_path)
    # iter_commits raises ValueError on an unborn HEAD (a fresh repo with
    # zero commits), so the empty-repo case must be caught explicitly
    # rather than detected via an empty list.
    try:
        has_commits = bool(list(repo.iter_commits(max_count=1)))
    except ValueError:
        return ""
    if repo.head.is_detached or not has_commits:
        return ""

    commits_list = list(repo.iter_commits(max_count=commits + 1))
    if len(commits_list) < 2:
        # Only one commit exists — diff the root commit against the empty tree.
        return repo.git.diff_tree("--root", "--patch", "HEAD")

    # Diff from the commit just *before* the requested window up to HEAD.
    old = commits_list[-1]
    return repo.git.diff(old.hexsha, "HEAD")
46
+
47
+
48
def get_staged_diff(repo_path: str) -> str:
    """Return the unified diff of changes currently in the git index.

    Args:
        repo_path: Path to the git repository.

    Returns:
        Unified diff of staged changes (equivalent to ``git diff --cached``).
    """
    return Repo(repo_path).git.diff("--cached")
59
+
60
+
61
def parse_diff(diff_text: str) -> list[FileChange]:
    """Parse unified diff text into structured FileChange objects.

    Args:
        diff_text: Raw unified diff output.

    Returns:
        List of FileChange dicts with path, status, additions, deletions, diff.
    """
    if not diff_text.strip():
        return []

    changes: list[FileChange] = []
    # Each per-file section begins with a "diff --git" header line.
    for section in re.split(r"(?=^diff --git )", diff_text, flags=re.MULTILINE):
        if not section.strip():
            continue

        header = re.match(r"diff --git a/(.*?) b/(.*)", section)
        if header is None:
            continue
        old_path, new_path = header.group(1), header.group(2)

        # Classify the change from git's extended header lines.
        if "new file mode" in section:
            status = "added"
        elif "deleted file mode" in section:
            status = "deleted"
        else:
            status = "renamed" if old_path != new_path else "modified"

        # Tally added/removed lines, skipping the "+++"/"---" file headers.
        section_lines = section.split("\n")
        added = sum(
            1
            for ln in section_lines
            if ln.startswith("+") and not ln.startswith("+++")
        )
        removed = sum(
            1
            for ln in section_lines
            if ln.startswith("-") and not ln.startswith("---")
        )

        changes.append(
            FileChange(
                path=new_path,
                status=status,
                additions=added,
                deletions=removed,
                diff=section,
            )
        )

    return changes
119
+
120
+
121
def get_full_codebase_summary(repo_path: str) -> str:
    """Walk all source files in a repo and build a virtual diff summary.

    Intended for documenting existing projects that have no documentation
    yet: the output is a diff-shaped text block suitable for feeding
    straight to the CodeAnalyzer LLM.

    Args:
        repo_path: Absolute path to the root of the repository.

    Returns:
        A formatted string summarising every source file (path, line
        count, first 200 lines), truncated to ~50 000 characters to stay
        within LLM context limits.
    """
    char_budget = 50_000
    preview_limit = 200

    entries: list[str] = []
    total_files = 0

    for current_dir, subdirs, names in os.walk(repo_path):
        # Prune unwanted directories in place so os.walk never descends
        # into them.
        subdirs[:] = [d for d in subdirs if d not in SKIP_DIRS]

        for name in sorted(names):
            if os.path.splitext(name)[1] not in CODE_EXTENSIONS:
                continue

            abs_path = os.path.join(current_dir, name)
            rel = os.path.relpath(abs_path, repo_path)

            try:
                with open(abs_path, "r", encoding="utf-8", errors="replace") as fh:
                    content_lines = fh.readlines()
            except (OSError, UnicodeDecodeError):
                # Unreadable file — skip it rather than abort the walk.
                continue

            n_lines = len(content_lines)
            snippet = "".join(content_lines[:preview_limit])
            if n_lines > preview_limit:
                snippet += f"\n... ({n_lines - preview_limit} more lines)\n"

            entries.append(
                f"--- /dev/null\n"
                f"+++ b/{rel}\n"
                f"## File: {rel} | {n_lines} lines\n"
                f"{snippet}\n"
            )
            total_files += 1

    summary = (
        f"# Full Codebase Summary — {total_files} files\n"
        f"# Repository: {repo_path}\n\n"
    ) + "\n".join(entries)

    if len(summary) > char_budget:
        summary = summary[:char_budget] + "\n\n... [truncated at 50000 characters]\n"

    return summary
@@ -0,0 +1,46 @@
1
+ """GitHub client for remote repository access (optional)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from github import Github, Auth
6
+
7
+ from codebase_cortex.config import Settings
8
+
9
+
10
def get_github_client(settings: Settings) -> Github | None:
    """Create an authenticated GitHub client.

    Returns:
        A ``Github`` instance when ``settings.github_token`` is set,
        otherwise ``None``.
    """
    token = settings.github_token
    if not token:
        return None
    return Github(auth=Auth.Token(token))
18
+
19
+
20
def get_repo_info(settings: Settings) -> dict | None:
    """Get basic repository info from GitHub.

    Returns:
        Dict with repo name, description, default branch, etc.
        None if GitHub is not configured or repo_path is local.
    """
    client = get_github_client(settings)
    if client is None:
        return None

    repo_path = settings.repo_path
    # Require the trailing slash so hosts that merely start with
    # "https://github.com" (e.g. "https://github.community/...") are not
    # mistaken for GitHub repository URLs.
    if not repo_path.startswith("https://github.com/"):
        return None

    # Extract "owner/repo" from the URL. Guard against a bare host URL,
    # and strip the ".git" suffix that clone URLs carry — the GitHub API
    # does not accept it in repository names.
    parts = repo_path.rstrip("/").split("/")
    if len(parts) < 5:  # ["https:", "", "github.com", owner, repo]
        return None
    repo_name = f"{parts[-2]}/{parts[-1]}"
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]

    repo = client.get_repo(repo_name)
    return {
        "name": repo.full_name,
        "description": repo.description,
        "default_branch": repo.default_branch,
        "language": repo.language,
        "stars": repo.stargazers_count,
    }
@@ -0,0 +1,111 @@
1
+ """LangGraph StateGraph definition for the Cortex pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from langgraph.graph import StateGraph, START, END
6
+
7
+ from codebase_cortex.state import CortexState
8
+
9
+
10
async def code_analyzer_node(state: CortexState) -> dict:
    """Analyze git diffs and identify changes."""
    # Imports deferred to call time so building the graph stays cheap.
    from codebase_cortex.agents.code_analyzer import CodeAnalyzerAgent
    from codebase_cortex.config import get_llm

    return await CodeAnalyzerAgent(get_llm()).run(state)
17
+
18
+
19
async def semantic_finder_node(state: CortexState) -> dict:
    """Find semantically related documentation."""
    # Imports deferred to call time so building the graph stays cheap.
    from codebase_cortex.agents.semantic_finder import SemanticFinderAgent
    from codebase_cortex.config import get_llm

    return await SemanticFinderAgent(get_llm()).run(state)
26
+
27
+
28
async def doc_writer_node(state: CortexState) -> dict:
    """Write or update Notion documentation pages."""
    # Imports deferred to call time so building the graph stays cheap.
    from codebase_cortex.agents.doc_writer import DocWriterAgent
    from codebase_cortex.config import get_llm

    return await DocWriterAgent(get_llm()).run(state)
35
+
36
+
37
async def task_creator_node(state: CortexState) -> dict:
    """Create tasks for undocumented areas."""
    # Imports deferred to call time so building the graph stays cheap.
    from codebase_cortex.agents.task_creator import TaskCreatorAgent
    from codebase_cortex.config import get_llm

    return await TaskCreatorAgent(get_llm()).run(state)
44
+
45
+
46
async def sprint_reporter_node(state: CortexState) -> dict:
    """Generate sprint summary report."""
    # Imports deferred to call time so building the graph stays cheap.
    from codebase_cortex.agents.sprint_reporter import SprintReporterAgent
    from codebase_cortex.config import get_llm

    return await SprintReporterAgent(get_llm()).run(state)
53
+
54
+
55
def should_run_docs(state: CortexState) -> str:
    """Route onward only when the analysis step produced results to act on."""
    return "semantic_finder" if state.get("analysis") else "end"
60
+
61
+
62
def should_run_sprint(state: CortexState) -> str:
    """Route to the sprint reporter only when there is something to report."""
    has_output = state.get("doc_updates") or state.get("tasks_created")
    return "sprint_reporter" if has_output else "end"
67
+
68
+
69
def build_graph() -> StateGraph:
    """Build the (uncompiled) Cortex pipeline graph.

    Pipeline flow:
        code_analyzer -> semantic_finder -> doc_writer -> task_creator -> [sprint_reporter] -> END
    """
    graph = StateGraph(CortexState)

    # Register every pipeline agent as a node.
    for node_name, node_fn in (
        ("code_analyzer", code_analyzer_node),
        ("semantic_finder", semantic_finder_node),
        ("doc_writer", doc_writer_node),
        ("task_creator", task_creator_node),
        ("sprint_reporter", sprint_reporter_node),
    ):
        graph.add_node(node_name, node_fn)

    # Entry point.
    graph.add_edge(START, "code_analyzer")

    # Skip the whole documentation pipeline when analysis found nothing.
    graph.add_conditional_edges(
        "code_analyzer",
        should_run_docs,
        {"semantic_finder": "semantic_finder", "end": END},
    )

    # Linear flow through the doc pipeline.
    graph.add_edge("semantic_finder", "doc_writer")
    graph.add_edge("doc_writer", "task_creator")

    # Sprint report runs only when doc updates or tasks were produced.
    graph.add_conditional_edges(
        "task_creator",
        should_run_sprint,
        {"sprint_reporter": "sprint_reporter", "end": END},
    )
    graph.add_edge("sprint_reporter", END)

    return graph
107
+
108
+
109
def compile_graph():
    """Build and compile the graph, ready for invocation."""
    graph = build_graph()
    return graph.compile()
@@ -0,0 +1,94 @@
1
+ """Notion MCP client connection via Streamable HTTP."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from contextlib import asynccontextmanager
8
+ from typing import AsyncGenerator
9
+
10
+ from langchain_mcp_adapters.client import MultiServerMCPClient
11
+ from mcp.client.streamable_http import streamablehttp_client
12
+ from mcp import ClientSession
13
+
14
+ from codebase_cortex.auth.token_store import get_valid_token
15
+ from codebase_cortex.config import Settings
16
+ from codebase_cortex.utils.rate_limiter import NotionRateLimiter
17
+
18
# Notion's hosted MCP endpoint (Streamable HTTP transport).
NOTION_MCP_URL = "https://mcp.notion.com/mcp"

# Module-level rate limiter. NOTE(review): not referenced anywhere in this
# module — presumably imported and used by callers; verify it isn't dead code.
rate_limiter = NotionRateLimiter()
21
+
22
+
23
class LoggingSession:
    """Wrapper around ClientSession that logs all tool calls.

    Delegates ``call_tool``, ``list_tools`` and ``initialize`` to the
    wrapped session, emitting debug log lines before and after each tool
    call. Other session attributes are not proxied.
    """

    def __init__(self, session: ClientSession):
        self._session = session
        self._logger = logging.getLogger("cortex")

    async def call_tool(self, name: str, arguments: dict | None = None):
        """Invoke a tool on the wrapped session, logging request and result."""
        # Truncate serialized arguments so huge payloads don't flood the log;
        # default=str keeps non-JSON-serializable values from raising.
        args_str = json.dumps(arguments, default=str)[:500] if arguments else "{}"
        # Lazy %-style args: formatting is skipped unless DEBUG is enabled.
        self._logger.debug("MCP CALL: %s(%s)", name, args_str)

        result = await self._session.call_tool(name, arguments=arguments)

        if result.isError:
            self._logger.debug("MCP ERROR: %s -> %s", name, result.content)
        else:
            preview = ""
            if result.content:
                # Content items may not be text (e.g. image content), so fall
                # back to an empty preview instead of raising AttributeError.
                preview = getattr(result.content[0], "text", "")[:300]
            self._logger.debug("MCP OK: %s -> %s...", name, preview)

        return result

    async def list_tools(self):
        """Delegate tool listing to the wrapped session."""
        return await self._session.list_tools()

    async def initialize(self):
        """Delegate session initialization to the wrapped session."""
        return await self._session.initialize()
51
+
52
+
53
@asynccontextmanager
async def notion_mcp_session(settings: Settings) -> AsyncGenerator[ClientSession, None]:
    """Create a raw MCP client session to Notion.

    Loads/refreshes the OAuth token, opens a Streamable HTTP connection,
    and yields an initialized session. When verbose mode is active the
    session is wrapped in LoggingSession.
    """
    bearer = await get_valid_token(settings.notion_token_path)
    auth_headers = {"Authorization": f"Bearer {bearer}"}

    transport = streamablehttp_client(NOTION_MCP_URL, headers=auth_headers)
    async with transport as (read_stream, write_stream, _):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()
            # Imported at call time (presumably to avoid an import cycle —
            # original behavior preserved).
            from codebase_cortex.utils.logging import is_verbose
            yield LoggingSession(session) if is_verbose() else session
75
+
76
+
77
async def get_notion_tools(settings: Settings) -> list:
    """Get LangChain-compatible tools from the Notion MCP server.

    Returns a list of tools that can be bound to LangChain agents.
    """
    # Load/refresh the OAuth bearer token before connecting.
    token = await get_valid_token(settings.notion_token_path)

    client = MultiServerMCPClient(
        {
            "notion": {
                "url": NOTION_MCP_URL,
                "transport": "streamable_http",
                "headers": {"Authorization": f"Bearer {token}"},
            }
        }
    )
    # NOTE(review): newer langchain-mcp-adapters releases dropped the async
    # context-manager API on MultiServerMCPClient and made get_tools() a
    # coroutine (`return await client.get_tools()`). Confirm the pinned
    # version still supports this usage, and that the returned tools remain
    # usable after the client context exits.
    async with client:
        return client.get_tools()
File without changes