codebase-cortex 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_cortex/__init__.py +3 -0
- codebase_cortex/agents/__init__.py +0 -0
- codebase_cortex/agents/base.py +69 -0
- codebase_cortex/agents/code_analyzer.py +122 -0
- codebase_cortex/agents/doc_writer.py +356 -0
- codebase_cortex/agents/semantic_finder.py +64 -0
- codebase_cortex/agents/sprint_reporter.py +152 -0
- codebase_cortex/agents/task_creator.py +138 -0
- codebase_cortex/auth/__init__.py +0 -0
- codebase_cortex/auth/callback_server.py +80 -0
- codebase_cortex/auth/oauth.py +173 -0
- codebase_cortex/auth/token_store.py +90 -0
- codebase_cortex/cli.py +855 -0
- codebase_cortex/config.py +150 -0
- codebase_cortex/embeddings/__init__.py +0 -0
- codebase_cortex/embeddings/clustering.py +140 -0
- codebase_cortex/embeddings/indexer.py +208 -0
- codebase_cortex/embeddings/store.py +126 -0
- codebase_cortex/git/__init__.py +0 -0
- codebase_cortex/git/diff_parser.py +185 -0
- codebase_cortex/git/github_client.py +46 -0
- codebase_cortex/graph.py +111 -0
- codebase_cortex/mcp_client.py +94 -0
- codebase_cortex/notion/__init__.py +0 -0
- codebase_cortex/notion/bootstrap.py +298 -0
- codebase_cortex/notion/page_cache.py +107 -0
- codebase_cortex/state.py +77 -0
- codebase_cortex/utils/__init__.py +0 -0
- codebase_cortex/utils/json_parsing.py +59 -0
- codebase_cortex/utils/logging.py +62 -0
- codebase_cortex/utils/rate_limiter.py +56 -0
- codebase_cortex/utils/section_parser.py +139 -0
- codebase_cortex-0.1.0.dist-info/METADATA +209 -0
- codebase_cortex-0.1.0.dist-info/RECORD +37 -0
- codebase_cortex-0.1.0.dist-info/WHEEL +4 -0
- codebase_cortex-0.1.0.dist-info/entry_points.txt +3 -0
- codebase_cortex-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Parse git diffs into structured data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from git import Repo
|
|
9
|
+
|
|
10
|
+
from codebase_cortex.state import FileChange
|
|
11
|
+
|
|
12
|
+
# File extensions treated as "source" when walking a repository; anything
# else is ignored by get_full_codebase_summary().
CODE_EXTENSIONS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
    ".rb", ".php", ".c", ".cpp", ".h", ".hpp", ".cs", ".swift",
    ".kt", ".scala", ".sh", ".bash", ".yml", ".yaml", ".toml",
    ".json", ".md", ".rst", ".txt",
}
# Directory names pruned in-place during os.walk: VCS metadata, virtualenvs,
# dependency/caches, build output, and Cortex's own state directory.
SKIP_DIRS = {
    ".git", ".venv", "venv", "node_modules", "__pycache__",
    ".pytest_cache", "dist", "build", ".eggs", ".tox",
    ".mypy_cache", ".ruff_cache", ".cortex",
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_recent_diff(repo_path: str, commits: int = 1) -> str:
    """Get the unified diff for the most recent commit(s).

    Args:
        repo_path: Path to the git repository.
        commits: Number of recent commits to include.

    Returns:
        Unified diff text, or empty string if no commits (or detached HEAD).
    """
    repo = Repo(repo_path)
    if repo.head.is_detached:
        return ""

    # A freshly-initialised repository has no resolvable HEAD, and GitPython's
    # iter_commits raises ValueError in that case rather than yielding nothing.
    # Fetch the commit list once instead of iterating twice.
    try:
        commits_list = list(repo.iter_commits(max_count=commits + 1))
    except ValueError:
        return ""
    if not commits_list:
        return ""

    if len(commits_list) < 2:
        # Only one commit — show the root commit diff
        return repo.git.diff_tree("--root", "--patch", "HEAD")

    old = commits_list[-1]
    return repo.git.diff(old.hexsha, "HEAD")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_staged_diff(repo_path: str) -> str:
    """Return the unified diff of changes currently in the index.

    Args:
        repo_path: Path to the git repository.

    Returns:
        Unified diff of staged changes ("" when nothing is staged).
    """
    return Repo(repo_path).git.diff("--cached")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def parse_diff(diff_text: str) -> list[FileChange]:
    """Parse unified diff text into structured FileChange objects.

    Args:
        diff_text: Raw unified diff output.

    Returns:
        List of FileChange dicts with path, status, additions, deletions, diff.
    """
    if not diff_text.strip():
        return []

    changes: list[FileChange] = []

    # Each per-file section starts at a "diff --git" header.
    for chunk in re.split(r"(?=^diff --git )", diff_text, flags=re.MULTILINE):
        if not chunk.strip():
            continue

        header = re.match(r"diff --git a/(.*?) b/(.*)", chunk)
        if header is None:
            continue
        old_path, new_path = header.group(1), header.group(2)

        # Classify the change from the extended header lines.
        if "new file mode" in chunk:
            status = "added"
        elif "deleted file mode" in chunk:
            status = "deleted"
        else:
            status = "modified" if old_path == new_path else "renamed"

        # Count body additions/deletions, excluding the "+++"/"---" file markers.
        diff_lines = chunk.split("\n")
        added = sum(
            1
            for ln in diff_lines
            if ln.startswith("+") and not ln.startswith("+++")
        )
        removed = sum(
            1
            for ln in diff_lines
            if ln.startswith("-") and not ln.startswith("---")
        )

        changes.append(
            FileChange(
                path=new_path,
                status=status,
                additions=added,
                deletions=removed,
                diff=chunk,
            )
        )

    return changes
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def get_full_codebase_summary(repo_path: str) -> str:
    """Walk all source files in a repo and build a virtual diff summary.

    This is intended for documenting existing projects that have no
    documentation yet. It produces a structured text block that looks
    like a diff and can be passed directly to the CodeAnalyzer LLM.

    Args:
        repo_path: Absolute path to the root of the repository.

    Returns:
        A formatted string summarising every source file (path, line
        count, first 200 lines). The total output is truncated to
        ~50 000 characters to stay within LLM context limits.
    """
    max_chars = 50_000
    max_preview_lines = 200

    parts: list[str] = []
    file_count = 0

    for dirpath, dirnames, filenames in os.walk(repo_path):
        # Prune directories we should skip (in-place so os.walk skips them)
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]

        for filename in sorted(filenames):
            ext = os.path.splitext(filename)[1]
            if ext not in CODE_EXTENSIONS:
                continue

            full_path = os.path.join(dirpath, filename)
            rel_path = os.path.relpath(full_path, repo_path)

            # errors="replace" means decoding can never raise here; only I/O
            # failures (permissions, files vanishing mid-walk) are possible.
            try:
                with open(full_path, "r", encoding="utf-8", errors="replace") as fh:
                    lines = fh.readlines()
            except OSError:
                continue

            line_count = len(lines)
            preview = "".join(lines[:max_preview_lines])
            if line_count > max_preview_lines:
                preview += f"\n... ({line_count - max_preview_lines} more lines)\n"

            parts.append(
                f"--- /dev/null\n"
                f"+++ b/{rel_path}\n"
                f"## File: {rel_path} | {line_count} lines\n"
                f"{preview}\n"
            )
            file_count += 1

    header = (
        f"# Full Codebase Summary — {file_count} files\n"
        f"# Repository: {repo_path}\n\n"
    )
    body = header + "\n".join(parts)

    # Keep the truncation notice in sync with max_chars instead of
    # hard-coding the number a second time.
    if len(body) > max_chars:
        body = body[:max_chars] + f"\n\n... [truncated at {max_chars} characters]\n"

    return body
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""GitHub client for remote repository access (optional)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from github import Github, Auth
|
|
6
|
+
|
|
7
|
+
from codebase_cortex.config import Settings
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_github_client(settings: Settings) -> Github | None:
    """Create a GitHub client if a token is available.

    Returns None if no token is configured.
    """
    token = settings.github_token
    if not token:
        return None
    return Github(auth=Auth.Token(token))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_repo_info(settings: Settings) -> dict | None:
    """Get basic repository info from GitHub.

    Returns:
        Dict with repo name, description, default branch, etc.
        None if GitHub is not configured or repo_path is not a
        github.com URL.
    """
    client = get_github_client(settings)
    if client is None:
        return None

    repo_path = settings.repo_path
    # Require the trailing slash so unrelated hosts (e.g.
    # "https://github.community/...") are not mistaken for github.com.
    if not repo_path.startswith("https://github.com/"):
        return None

    # Extract "owner/repo" from the URL, tolerating a trailing "/" and the
    # optional ".git" clone suffix (the API wants the bare repo name).
    parts = repo_path.rstrip("/").split("/")
    owner, name = parts[-2], parts[-1]
    if name.endswith(".git"):
        name = name[:-4]
    repo_name = f"{owner}/{name}"

    repo = client.get_repo(repo_name)
    return {
        "name": repo.full_name,
        "description": repo.description,
        "default_branch": repo.default_branch,
        "language": repo.language,
        "stars": repo.stargazers_count,
    }
|
codebase_cortex/graph.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""LangGraph StateGraph definition for the Cortex pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from langgraph.graph import StateGraph, START, END
|
|
6
|
+
|
|
7
|
+
from codebase_cortex.state import CortexState
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def code_analyzer_node(state: CortexState) -> dict:
    """Analyze git diffs and identify changes."""
    # Imported inside the node so agent dependencies load only when it runs.
    from codebase_cortex.agents.code_analyzer import CodeAnalyzerAgent
    from codebase_cortex.config import get_llm

    analyzer = CodeAnalyzerAgent(get_llm())
    return await analyzer.run(state)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
async def semantic_finder_node(state: CortexState) -> dict:
    """Find semantically related documentation."""
    # Imported inside the node so agent dependencies load only when it runs.
    from codebase_cortex.agents.semantic_finder import SemanticFinderAgent
    from codebase_cortex.config import get_llm

    finder = SemanticFinderAgent(get_llm())
    return await finder.run(state)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def doc_writer_node(state: CortexState) -> dict:
    """Write or update Notion documentation pages."""
    # Imported inside the node so agent dependencies load only when it runs.
    from codebase_cortex.agents.doc_writer import DocWriterAgent
    from codebase_cortex.config import get_llm

    writer = DocWriterAgent(get_llm())
    return await writer.run(state)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
async def task_creator_node(state: CortexState) -> dict:
    """Create tasks for undocumented areas."""
    # Imported inside the node so agent dependencies load only when it runs.
    from codebase_cortex.agents.task_creator import TaskCreatorAgent
    from codebase_cortex.config import get_llm

    creator = TaskCreatorAgent(get_llm())
    return await creator.run(state)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
async def sprint_reporter_node(state: CortexState) -> dict:
    """Generate sprint summary report."""
    # Imported inside the node so agent dependencies load only when it runs.
    from codebase_cortex.agents.sprint_reporter import SprintReporterAgent
    from codebase_cortex.config import get_llm

    reporter = SprintReporterAgent(get_llm())
    return await reporter.run(state)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def should_run_docs(state: CortexState) -> str:
    """Route based on whether we have analysis results to act on."""
    has_analysis = bool(state.get("analysis"))
    return "semantic_finder" if has_analysis else "end"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def should_run_sprint(state: CortexState) -> str:
    """Route to sprint reporter if there are doc updates or new tasks to report."""
    if not state.get("doc_updates") and not state.get("tasks_created"):
        return "end"
    return "sprint_reporter"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def build_graph() -> StateGraph:
    """Build the Cortex pipeline graph (uncompiled).

    Pipeline flow:
        code_analyzer -> semantic_finder -> doc_writer -> task_creator
        -> [sprint_reporter] -> END

    Returns:
        The assembled StateGraph. Note this is NOT compiled; use
        compile_graph() (or .compile()) to obtain an invocable app.
    """
    graph = StateGraph(CortexState)

    # Add nodes
    graph.add_node("code_analyzer", code_analyzer_node)
    graph.add_node("semantic_finder", semantic_finder_node)
    graph.add_node("doc_writer", doc_writer_node)
    graph.add_node("task_creator", task_creator_node)
    graph.add_node("sprint_reporter", sprint_reporter_node)

    # Entry point
    graph.add_edge(START, "code_analyzer")

    # Conditional: only proceed if analysis produced results
    graph.add_conditional_edges(
        "code_analyzer",
        should_run_docs,
        {"semantic_finder": "semantic_finder", "end": END},
    )

    # Linear flow through doc pipeline
    graph.add_edge("semantic_finder", "doc_writer")
    graph.add_edge("doc_writer", "task_creator")

    # Conditional: run the sprint reporter only when there are doc updates
    # or new tasks to report (see should_run_sprint).
    graph.add_conditional_edges(
        "task_creator",
        should_run_sprint,
        {"sprint_reporter": "sprint_reporter", "end": END},
    )
    graph.add_edge("sprint_reporter", END)

    return graph
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def compile_graph():
    """Build the pipeline graph and compile it, ready for invocation."""
    graph = build_graph()
    return graph.compile()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Notion MCP client connection via Streamable HTTP."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from contextlib import asynccontextmanager
|
|
8
|
+
from typing import AsyncGenerator
|
|
9
|
+
|
|
10
|
+
from langchain_mcp_adapters.client import MultiServerMCPClient
|
|
11
|
+
from mcp.client.streamable_http import streamablehttp_client
|
|
12
|
+
from mcp import ClientSession
|
|
13
|
+
|
|
14
|
+
from codebase_cortex.auth.token_store import get_valid_token
|
|
15
|
+
from codebase_cortex.config import Settings
|
|
16
|
+
from codebase_cortex.utils.rate_limiter import NotionRateLimiter
|
|
17
|
+
|
|
18
|
+
# Public endpoint for Notion's hosted MCP server (Streamable HTTP transport).
NOTION_MCP_URL = "https://mcp.notion.com/mcp"

# Shared module-level limiter instance.
# NOTE(review): not referenced elsewhere in this module — presumably consumed
# by importers; confirm it is actually used before relying on it.
rate_limiter = NotionRateLimiter()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LoggingSession:
    """Wrapper around ClientSession that logs all tool calls."""

    def __init__(self, session: ClientSession):
        self._session = session
        self._logger = logging.getLogger("cortex")

    async def call_tool(self, name: str, arguments: dict | None = None):
        """Proxy a tool call to the wrapped session, logging request and result.

        Arguments are serialized (truncated to 500 chars) for the debug log;
        successful results are previewed with their first text content item.
        """
        args_str = json.dumps(arguments, default=str)[:500] if arguments else "{}"
        self._logger.debug(f"MCP CALL: {name}({args_str})")

        result = await self._session.call_tool(name, arguments=arguments)

        if result.isError:
            self._logger.debug(f"MCP ERROR: {name} -> {result.content}")
        else:
            preview = ""
            if result.content:
                # Content items are not guaranteed to be text blocks (image or
                # resource content has no .text) — degrade to an empty preview
                # instead of raising from the logging path.
                preview = (getattr(result.content[0], "text", "") or "")[:300]
            self._logger.debug(f"MCP OK: {name} -> {preview}...")

        return result

    async def list_tools(self):
        """Proxy list_tools to the wrapped session."""
        return await self._session.list_tools()

    async def initialize(self):
        """Proxy initialize to the wrapped session."""
        return await self._session.initialize()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@asynccontextmanager
async def notion_mcp_session(settings: Settings) -> AsyncGenerator[ClientSession, None]:
    """Open a connected MCP session against Notion's hosted server.

    Refreshes the OAuth token as needed, initializes the session, and — when
    verbose mode is active — wraps it in LoggingSession so every tool call
    is logged.
    """
    bearer = await get_valid_token(settings.notion_token_path)
    auth_headers = {"Authorization": f"Bearer {bearer}"}

    async with streamablehttp_client(NOTION_MCP_URL, headers=auth_headers) as transport:
        reader, writer, _ = transport
        async with ClientSession(reader, writer) as session:
            await session.initialize()
            from codebase_cortex.utils.logging import is_verbose
            yield LoggingSession(session) if is_verbose() else session
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
async def get_notion_tools(settings: Settings) -> list:
    """Get LangChain-compatible tools from the Notion MCP server.

    Returns a list of tools that can be bound to LangChain agents.
    """
    # Refresh/load the OAuth bearer token before opening the connection.
    token = await get_valid_token(settings.notion_token_path)

    client = MultiServerMCPClient(
        {
            "notion": {
                "url": NOTION_MCP_URL,
                "transport": "streamable_http",
                "headers": {"Authorization": f"Bearer {token}"},
            }
        }
    )
    # NOTE(review): this relies on MultiServerMCPClient being an async context
    # manager with a synchronous get_tools(); newer langchain-mcp-adapters
    # releases changed this API (get_tools became awaitable, no context
    # manager) — confirm against the pinned adapter version.
    async with client:
        return client.get_tools()
|
|
File without changes
|