codebase-cortex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. codebase_cortex/__init__.py +3 -0
  2. codebase_cortex/agents/__init__.py +0 -0
  3. codebase_cortex/agents/base.py +69 -0
  4. codebase_cortex/agents/code_analyzer.py +122 -0
  5. codebase_cortex/agents/doc_writer.py +356 -0
  6. codebase_cortex/agents/semantic_finder.py +64 -0
  7. codebase_cortex/agents/sprint_reporter.py +152 -0
  8. codebase_cortex/agents/task_creator.py +138 -0
  9. codebase_cortex/auth/__init__.py +0 -0
  10. codebase_cortex/auth/callback_server.py +80 -0
  11. codebase_cortex/auth/oauth.py +173 -0
  12. codebase_cortex/auth/token_store.py +90 -0
  13. codebase_cortex/cli.py +855 -0
  14. codebase_cortex/config.py +150 -0
  15. codebase_cortex/embeddings/__init__.py +0 -0
  16. codebase_cortex/embeddings/clustering.py +140 -0
  17. codebase_cortex/embeddings/indexer.py +208 -0
  18. codebase_cortex/embeddings/store.py +126 -0
  19. codebase_cortex/git/__init__.py +0 -0
  20. codebase_cortex/git/diff_parser.py +185 -0
  21. codebase_cortex/git/github_client.py +46 -0
  22. codebase_cortex/graph.py +111 -0
  23. codebase_cortex/mcp_client.py +94 -0
  24. codebase_cortex/notion/__init__.py +0 -0
  25. codebase_cortex/notion/bootstrap.py +298 -0
  26. codebase_cortex/notion/page_cache.py +107 -0
  27. codebase_cortex/state.py +77 -0
  28. codebase_cortex/utils/__init__.py +0 -0
  29. codebase_cortex/utils/json_parsing.py +59 -0
  30. codebase_cortex/utils/logging.py +62 -0
  31. codebase_cortex/utils/rate_limiter.py +56 -0
  32. codebase_cortex/utils/section_parser.py +139 -0
  33. codebase_cortex-0.1.0.dist-info/METADATA +209 -0
  34. codebase_cortex-0.1.0.dist-info/RECORD +37 -0
  35. codebase_cortex-0.1.0.dist-info/WHEEL +4 -0
  36. codebase_cortex-0.1.0.dist-info/entry_points.txt +3 -0
  37. codebase_cortex-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,3 @@
1
+ """Codebase Cortex - LangGraph multi-agent system for keeping docs in sync with code."""
2
+
3
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,69 @@
1
+ """Base agent with MCP tool access."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+
8
+ from langchain_core.language_models import BaseChatModel
9
+ from langchain_core.messages import BaseMessage
10
+
11
+ from codebase_cortex.state import CortexState
12
+
13
+
14
class BaseAgent(ABC):
    """Abstract parent of every Cortex agent.

    Carries the shared chat model and offers three helpers: invoking the
    LLM with debug logging, reading MCP tools out of graph state, and
    building an error list tagged with the agent's class name.
    """

    def __init__(self, llm: BaseChatModel) -> None:
        # All agents share one chat model and log through the "cortex" logger.
        self.llm = llm
        self._logger = logging.getLogger("cortex")

    @abstractmethod
    async def run(self, state: CortexState) -> dict:
        """Execute this agent's logic and return state updates."""
        ...

    async def _invoke_llm(self, messages: list[BaseMessage]) -> str:
        """Invoke the LLM with logging. Returns response content as text."""
        agent_name = type(self).__name__

        def _char_count(content) -> int:
            # Message content is either a plain string or a list of content
            # parts (dicts carrying a "text" key, or arbitrary objects).
            if isinstance(content, str):
                return len(content)
            return sum(
                len(p.get("text", "")) if isinstance(p, dict) else len(str(p))
                for p in content
            )

        total_chars = sum(_char_count(m.content) for m in messages)
        self._logger.debug(
            f"LLM CALL [{agent_name}]: {len(messages)} messages, {total_chars} chars"
        )
        for msg in messages:
            if isinstance(msg.content, str):
                preview = msg.content[:200]
            else:
                preview = str(msg.content)[:200]
            self._logger.debug(f" {msg.type}: {preview}...")

        response = await self.llm.ainvoke(messages)
        content = response.content

        # Some models (e.g. Gemini 3) return structured content blocks
        # instead of a plain string. Keep only text blocks (and stringified
        # non-dict parts), joined by newlines.
        if isinstance(content, list):
            pieces = []
            for part in content:
                if isinstance(part, dict):
                    if part.get("type") == "text":
                        pieces.append(part["text"])
                else:
                    pieces.append(str(part))
            content = "\n".join(pieces)

        self._logger.debug(
            f"LLM RESPONSE [{agent_name}]: {len(content)} chars — {content[:200]}..."
        )
        return content

    def _get_mcp_tools(self, state: CortexState) -> list:
        """Return the MCP tools stored in state (empty list when absent)."""
        return state.get("mcp_tools", [])

    def _append_error(self, state: CortexState, error: str) -> list[str]:
        """Return a copy of state's error list with *error* appended,
        prefixed by this agent's class name."""
        return [*state.get("errors", []), f"[{type(self).__name__}] {error}"]
@@ -0,0 +1,122 @@
1
+ """CodeAnalyzer agent — analyzes git diffs and identifies what changed and why."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from langchain_core.messages import HumanMessage, SystemMessage
6
+
7
+ from codebase_cortex.agents.base import BaseAgent
8
+ from codebase_cortex.git.diff_parser import get_recent_diff, get_full_codebase_summary, parse_diff
9
+ from codebase_cortex.state import CortexState
10
+
11
# System prompt for incremental runs: the LLM analyzes a single git diff
# (used by CodeAnalyzerAgent._run_diff).
DIFF_SYSTEM_PROMPT = """You are a senior software engineer analyzing code changes.
Given a git diff, provide a clear, structured analysis covering:

1. **Summary**: One-paragraph overview of what changed and why.
2. **Changed Components**: List each file/module changed with a brief description.
3. **Impact Assessment**: What parts of the system are affected? Any breaking changes?
4. **Documentation Needs**: What documentation should be created or updated?

Be concise but thorough. Focus on the "why" behind changes, not just the "what".
If the diff is too large, focus on the most significant changes."""

# System prompt for full-scan runs: the LLM analyzes a whole-codebase summary
# to seed initial documentation (used by CodeAnalyzerAgent._run_full_scan).
FULL_SYSTEM_PROMPT = """You are a senior software engineer analyzing an entire codebase.
Given a summary of all source files, provide a comprehensive analysis covering:

1. **Project Overview**: What this project does, its purpose and architecture.
2. **Components**: List each major module/package with its responsibility.
3. **Key APIs and Interfaces**: Public functions, classes, endpoints, and contracts.
4. **Architecture**: How components relate to each other, data flow, dependencies.
5. **Documentation Needs**: What documentation pages should be created?

Be thorough — this is the initial documentation for a project that has none.
Focus on what a new developer would need to understand the codebase."""
33
+
34
+
35
class CodeAnalyzerAgent(BaseAgent):
    """Analyzes git diffs or full codebases to identify documentation needs.

    Dispatches on state["full_scan"]: a full-codebase summary pass for
    initial documentation, or an analysis of the most recent git diff.
    """

    async def run(self, state: CortexState) -> dict:
        """Route to the full-scan or diff analysis path and return state updates."""
        repo_path = state.get("repo_path", ".")
        if state.get("full_scan", False):
            return await self._run_full_scan(state, repo_path)
        return await self._run_diff(state, repo_path)

    async def _run_diff(self, state: CortexState, repo_path: str) -> dict:
        """Analyze the most recent git diff and return analysis + changed files."""
        diff_text = state.get("diff_text", "")
        if not diff_text:
            try:
                diff_text = get_recent_diff(repo_path)
            except Exception as e:
                return {"errors": self._append_error(state, f"Failed to get diff: {e}")}

        # No diff at all — nothing to analyze.
        if not diff_text:
            return {"analysis": "", "changed_files": []}

        changed_files = parse_diff(diff_text)

        # One bullet per changed file: path, status, and +/- line counts.
        file_summary = "\n".join(
            f"- {entry['path']} ({entry['status']}: +{entry['additions']}/-{entry['deletions']})"
            for entry in changed_files
        )

        prompt = f"""Analyze the following code changes:

## Files Changed
{file_summary}

## Full Diff
```
{diff_text[:15000]}
```"""

        try:
            analysis = await self._invoke_llm([
                SystemMessage(content=DIFF_SYSTEM_PROMPT),
                HumanMessage(content=prompt),
            ])
        except Exception as e:
            # Preserve the diff and parsed files even when the LLM call fails.
            return {
                "diff_text": diff_text,
                "changed_files": changed_files,
                "errors": self._append_error(state, f"LLM analysis failed: {e}"),
            }

        return {
            "diff_text": diff_text,
            "changed_files": changed_files,
            "analysis": analysis,
        }

    async def _run_full_scan(self, state: CortexState, repo_path: str) -> dict:
        """Analyze the entire codebase to seed initial documentation."""
        try:
            summary = get_full_codebase_summary(repo_path)
        except Exception as e:
            return {"errors": self._append_error(state, f"Failed to scan codebase: {e}")}

        if not summary:
            return {"analysis": "", "changed_files": []}

        prompt = f"""Analyze this entire codebase and produce a comprehensive analysis for documentation:

{summary}"""

        try:
            analysis = await self._invoke_llm([
                SystemMessage(content=FULL_SYSTEM_PROMPT),
                HumanMessage(content=prompt),
            ])
        except Exception as e:
            return {
                "errors": self._append_error(state, f"LLM analysis failed: {e}"),
            }

        # A full scan has no per-file diff, so changed_files stays empty.
        return {
            "analysis": analysis,
            "changed_files": [],
        }
@@ -0,0 +1,356 @@
1
+ """DocWriter agent — updates or creates Notion pages to reflect code changes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from langchain_core.messages import HumanMessage, SystemMessage
6
+
7
+ import re
8
+
9
+ from codebase_cortex.agents.base import BaseAgent
10
+ from codebase_cortex.config import Settings
11
+ from codebase_cortex.notion.bootstrap import extract_page_id
12
+ from codebase_cortex.notion.page_cache import PageCache
13
+ from codebase_cortex.state import CortexState, DocUpdate
14
+ from codebase_cortex.utils.json_parsing import parse_json_array
15
+ from codebase_cortex.utils.section_parser import merge_sections, parse_sections
16
+
17
+
18
+ def _unescape_notion_text(text: str) -> str:
19
+ """Convert literal escape sequences from Notion MCP responses to real characters.
20
+
21
+ The Notion MCP server returns page content with literal \\n and \\t
22
+ (two-character sequences) instead of real newline/tab characters.
23
+ This converts them back so markdown parsing works correctly.
24
+ """
25
+ # Replace literal \n and \t with real characters
26
+ # Use a single pass to handle \n and \t without touching \\n (escaped backslash + n)
27
+ result = []
28
+ i = 0
29
+ while i < len(text):
30
+ if text[i] == '\\' and i + 1 < len(text):
31
+ next_char = text[i + 1]
32
+ if next_char == 'n':
33
+ result.append('\n')
34
+ i += 2
35
+ continue
36
+ elif next_char == 't':
37
+ result.append('\t')
38
+ i += 2
39
+ continue
40
+ result.append(text[i])
41
+ i += 1
42
+ return ''.join(result)
43
+
44
+
45
def strip_notion_metadata(raw_text: str) -> str:
    """Extract just the page content from a notion-fetch response.

    The notion-fetch tool wraps the page markdown in XML-like metadata:

        Here is the result of "view" for the Page ...
        <page url="...">
          <ancestor-path>...</ancestor-path>
          <properties>...</properties>
          <content>... actual markdown content ...</content>
        </page>

    Returns the text between <content> tags when present; otherwise falls
    back to stripping the header line, the XML-like wrapper tags, and any
    JSON property lines.
    """
    # The MCP server sends literal \n / \t sequences — restore real
    # newlines/tabs before any regex work.
    text = _unescape_notion_text(raw_text)

    inner = re.search(r"<content>\s*(.*?)\s*</content>", text, re.DOTALL)
    if inner is not None:
        return inner.group(1).strip()

    # Fallback: peel metadata away piece by piece.
    # 1) Drop the leading 'Here is the result of "view"...' line.
    stripped = re.sub(r'^Here is the result of "view".*?\n', "", text)
    # 2) Drop the XML-like wrapper tags themselves.
    stripped = re.sub(
        r"</?(?:page|ancestor-path|parent-page|properties|content)[^>]*>",
        "",
        stripped,
    )
    # 3) Drop JSON property lines such as {"title":"..."}.
    stripped = re.sub(r'^\s*\{.*?"title".*?\}\s*$', "", stripped, flags=re.MULTILINE)
    # 4) Collapse runs of 3+ newlines left behind by the removals.
    stripped = re.sub(r"\n{3,}", "\n\n", stripped)
    return stripped.strip()
83
+
84
# System prompt for DocWriterAgent: instructs the LLM to emit a JSON array of
# page updates — section-level "update" entries merged locally by
# section_parser, or full-content "create" entries for new pages.
SYSTEM_PROMPT = """You are a technical documentation writer. Given a code analysis
and related existing documentation, generate documentation updates for a Notion workspace.

Output a JSON array of page updates. Each element has:
- "title": Page title (must match an existing page title when updating)
- "action": "update" or "create"

For "update" actions (modifying an existing page):
- Include "section_updates": a JSON array of ONLY the sections that changed.
- Each section update has:
  - "heading": The exact markdown heading (e.g., "## API Endpoints", "### Authentication")
  - "content": The new content for that section (everything below the heading until the next heading)
  - "action": "update" to replace an existing section, or "create" to add a new section
- Do NOT include sections that haven't changed.
- Match headings exactly to existing page headings (case-insensitive matching is applied).

For "create" actions (new page):
- Include "content": Full markdown content for the new page.
- Do NOT include "section_updates".

Focus on:
- Architecture decisions and component relationships
- API contracts and interfaces
- How components interact
- Breaking changes and migration notes

Keep content concise and actionable. Use markdown headings, lists, and code blocks."""
111
+
112
+
113
class DocWriterAgent(BaseAgent):
    """Writes and updates documentation in Notion via MCP tools.

    Uses LLM to generate documentation content based on code analysis
    and related docs, then writes to Notion via MCP (or logs in dry_run mode).

    Flow: fetch current page contents from Notion -> prompt the LLM for a
    JSON array of page updates -> merge section-level edits into existing
    content locally -> push results to Notion (skipped when dry_run).
    """

    async def run(self, state: CortexState) -> dict:
        """Generate doc updates from state's analysis and write them to Notion.

        Returns a state update with "doc_updates" (list of DocUpdate), plus
        "errors" when generation fails. An empty analysis short-circuits to
        no updates at all.
        """
        analysis = state.get("analysis", "")
        if not analysis:
            return {"doc_updates": []}

        related_docs = state.get("related_docs", [])
        dry_run = state.get("dry_run", False)

        settings = Settings.from_env()
        cache = PageCache(cache_path=settings.page_cache_path)

        # Step 1: Fetch existing content from all Notion doc pages
        existing_pages = await self._fetch_existing_pages(settings, cache)

        # Build context from related code chunks (actual content, not just titles).
        # Capped at 5 chunks / 1500 chars each to bound prompt size.
        related_context = ""
        if related_docs:
            related_context = "\n\n## Related Code\n"
            for doc in related_docs[:5]:
                related_context += f"\n### {doc['title']} (similarity: {doc['similarity']:.2f})\n"
                if doc.get("content"):
                    related_context += f"```\n{doc['content'][:1500]}\n```\n"

        # Build existing page content section for the LLM.
        # Show section structure so the LLM knows which headings exist;
        # each page body is truncated to 3000 chars.
        existing_content_section = ""
        if existing_pages:
            existing_content_section = "\n\n## Current Page Contents\n"
            for title, content in existing_pages.items():
                truncated = content[:3000] + ("..." if len(content) > 3000 else "")
                existing_content_section += f"\n### {title}\n```\n{truncated}\n```\n"

        # Build dynamic page list from cache (doc pages only, no infrastructure pages)
        doc_pages = cache.find_all_doc_pages()
        page_list = "\n".join(f"- {p.title}" for p in doc_pages) if doc_pages else "- (no pages yet)"

        # Ask LLM to generate doc updates as a JSON array (see SYSTEM_PROMPT)
        prompt = f"""Based on this code analysis, determine what documentation should be updated or created.

## Code Analysis
{analysis}
{related_context}
{existing_content_section}

## Available Pages in Notion
{page_list}

Generate documentation updates as a JSON array.
For "update" actions: include "title", "action", and "section_updates" (array of sections to change).
Each section_update has "heading" (e.g. "## API Endpoints"), "content" (new content for that section), and "action" ("update" or "create").
Only include sections that actually changed — unchanged sections will be preserved automatically.
For "create" actions: include "title", "action", and "content" (full markdown for new page).
Only include pages that genuinely need updating. Respond with ONLY the JSON array."""

        try:
            messages = [
                SystemMessage(content=SYSTEM_PROMPT),
                HumanMessage(content=prompt),
            ]
            raw = await self._invoke_llm(messages)

            # parse_json_array tolerates surrounding prose/code fences
            # around the model's JSON output.
            updates_data = parse_json_array(raw)

        except Exception as e:
            return {
                "doc_updates": [],
                "errors": self._append_error(state, f"Doc generation failed: {e}"),
            }

        doc_updates: list[DocUpdate] = []

        for update in updates_data:
            title = update.get("title", "Untitled")
            action = update.get("action", "update")

            # Look up existing page ID from cache; None means a new page
            # will be created later in _write_to_notion.
            cached = cache.find_by_title(title)
            page_id = cached.page_id if cached else None

            if action == "update" and title in existing_pages:
                # Section-level merge for existing pages
                section_updates = update.get("section_updates")
                if section_updates:
                    # New format: merge only changed sections into the
                    # page's current content; unchanged sections survive.
                    existing_sections = parse_sections(existing_pages[title])
                    content = merge_sections(existing_sections, section_updates)
                elif update.get("content"):
                    # Backward compatibility: LLM returned full content
                    content = update["content"]
                else:
                    # Update with neither sections nor content — skip it.
                    continue
            else:
                # New page or page not in existing_pages
                content = update.get("content", "")
                if not content:
                    continue

            doc_updates.append(DocUpdate(
                page_id=page_id,
                title=title,
                content=content,
                action=action,
            ))

        # Write to Notion (unless dry_run)
        if not dry_run and doc_updates:
            await self._write_to_notion(doc_updates, cache, state)

        return {"doc_updates": doc_updates}

    async def _fetch_existing_pages(
        self, settings: Settings, cache: PageCache
    ) -> dict[str, str]:
        """Fetch current content of all doc pages from Notion.

        Also syncs page titles back to cache (detects renames).

        Returns a mapping of cached page title -> stripped page content.
        Fetch failures are logged and skipped, never raised — the caller
        can proceed with whatever pages were retrieved.
        """
        # Imported locally to avoid import cycles at module load time.
        from codebase_cortex.mcp_client import notion_mcp_session, rate_limiter
        from codebase_cortex.utils.logging import get_logger

        logger = get_logger()
        existing: dict[str, str] = {}

        # Fetch all doc pages (skip infrastructure-only pages)
        doc_pages = cache.find_all_doc_pages()
        # Limit to 10 pages to avoid excessive API calls
        pages_to_fetch = doc_pages[:10]

        if not pages_to_fetch:
            return existing

        try:
            async with notion_mcp_session(settings) as session:
                for cached_page in pages_to_fetch:
                    # Shared rate limiter throttles Notion MCP calls.
                    await rate_limiter.acquire()
                    try:
                        result = await session.call_tool(
                            "notion-fetch",
                            arguments={"id": cached_page.page_id},
                        )
                        if not result.isError and result.content:
                            raw = result.content[0].text
                            content = strip_notion_metadata(raw)
                            existing[cached_page.title] = content

                            # Sync title back from Notion (detect renames):
                            # extract the actual title from the raw response
                            title_match = re.search(
                                r'"title"\s*:\s*"([^"]+)"', raw
                            )
                            if title_match:
                                actual_title = title_match.group(1)
                                # NOTE(review): reaches into PageCache's
                                # private _normalize_title — consider a
                                # public comparison helper on PageCache.
                                normalized_actual = cache._normalize_title(actual_title)
                                normalized_cached = cache._normalize_title(cached_page.title)
                                if normalized_actual != normalized_cached and normalized_actual:
                                    logger.info(
                                        f"Page renamed: '{cached_page.title}' → '{actual_title}'"
                                    )
                                    cache.upsert(
                                        cached_page.page_id, actual_title
                                    )
                    except Exception as e:
                        # Best-effort per page: log and continue with the rest.
                        logger.warning(f"Could not fetch {cached_page.title}: {e}")
        except Exception as e:
            # Session-level failure (e.g. MCP server unreachable) — return
            # whatever was collected before the failure.
            logger.warning(f"Could not fetch existing pages: {e}")

        return existing

    async def _write_to_notion(
        self,
        updates: list[DocUpdate],
        cache: PageCache,
        state: CortexState,
    ) -> None:
        """Write documentation updates to Notion via MCP.

        Updates with a cached page_id replace that page's content; the rest
        are created as children of the "Codebase Cortex" parent page (when
        cached). Failures are logged, not raised.
        """
        # Imported locally to avoid import cycles at module load time.
        from codebase_cortex.mcp_client import notion_mcp_session, rate_limiter
        from codebase_cortex.config import Settings
        from codebase_cortex.utils.logging import get_logger

        logger = get_logger()
        settings = Settings.from_env()

        # Get parent page for new pages
        parent_page = cache.find_by_title("Codebase Cortex")
        parent_id = parent_page.page_id if parent_page else None

        try:
            async with notion_mcp_session(settings) as session:
                for update in updates:
                    await rate_limiter.acquire()

                    page_id = update["page_id"]
                    # Only update pages we already track in the cache.
                    # Never search the whole workspace — that risks
                    # overwriting unrelated user pages.

                    if page_id:
                        # Content already merged locally via section_parser
                        await session.call_tool(
                            "notion-update-page",
                            arguments={
                                "page_id": page_id,
                                "command": "replace_content",
                                "new_str": update["content"],
                            },
                        )
                        # Mark as written with a content hash so first-run detection works.
                        # md5 is used as a cheap change fingerprint, not for security.
                        import hashlib
                        content_hash = hashlib.md5(update["content"].encode()).hexdigest()[:8]
                        cache.upsert(page_id, update["title"], content_hash=content_hash)
                        logger.info(f"Updated: {update['title']}")
                    else:
                        # Create new page under parent
                        create_args: dict = {
                            "pages": [
                                {
                                    "properties": {"title": update["title"]},
                                    "content": update["content"],
                                }
                            ],
                        }
                        if parent_id:
                            create_args["parent"] = {"page_id": parent_id}

                        result = await session.call_tool(
                            "notion-create-pages",
                            arguments=create_args,
                        )
                        # Record the new page so later runs update instead
                        # of re-creating it.
                        new_page_id = extract_page_id(result)
                        if new_page_id:
                            import hashlib
                            content_hash = hashlib.md5(update["content"].encode()).hexdigest()[:8]
                            cache.upsert(new_page_id, update["title"], content_hash=content_hash)
                        logger.info(f"Created: {update['title']}")

        except Exception as e:
            logger.error(f"Failed to write docs to Notion: {e}")
@@ -0,0 +1,64 @@
1
+ """SemanticFinder agent — finds related docs via FAISS embedding similarity."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from codebase_cortex.agents.base import BaseAgent
8
+ from codebase_cortex.config import Settings
9
+ from codebase_cortex.embeddings.indexer import EmbeddingIndexer
10
+ from codebase_cortex.embeddings.store import FAISSStore
11
+ from codebase_cortex.state import CortexState, RelatedDoc
12
+
13
+
14
class SemanticFinderAgent(BaseAgent):
    """Finds semantically related code chunks using FAISS embeddings.

    Embeds the analysis text produced by CodeAnalyzer, queries a freshly
    rebuilt FAISS index of the repository's code chunks, and returns the
    top matches as RelatedDoc entries.
    """

    async def run(self, state: CortexState) -> dict:
        """Return related code chunks for state's analysis (empty when none)."""
        analysis = state.get("analysis", "")
        if not analysis:
            return {"related_docs": []}

        repo_root = Path(state.get("repo_path", "."))
        settings = Settings.from_env(repo_root)
        index_dir = settings.faiss_index_dir
        try:
            # Always rebuild the index so new or changed files are captured.
            indexer = EmbeddingIndexer(repo_path=repo_root)
            code_chunks = indexer.collect_chunks()
            if not code_chunks:
                return {"related_docs": []}

            store = FAISSStore(index_dir=index_dir)
            vectors = indexer.embed_chunks(code_chunks)
            store.build(vectors, code_chunks)
            store.save()

            # Embed the analysis text as the search query.
            query_vectors = indexer.embed_texts([analysis])
            if query_vectors.size == 0:
                return {"related_docs": []}

            matches = store.search(query_vectors[0], k=10)

            # Chunk content is capped at 2000 chars to bound downstream prompts.
            related_docs: list[RelatedDoc] = [
                RelatedDoc(
                    page_id=match.chunk.file_path,
                    title=f"{match.chunk.name} ({match.chunk.file_path})",
                    similarity=match.score,
                    content=match.chunk.content[:2000],
                )
                for match in matches
            ]

            return {"related_docs": related_docs}

        except Exception as e:
            # Semantic search is best-effort: report the failure in errors
            # and let the pipeline continue without related docs.
            return {
                "related_docs": [],
                "errors": self._append_error(state, f"Semantic search failed: {e}"),
            }
64
+