codebase-cortex 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_cortex/__init__.py +3 -0
- codebase_cortex/agents/__init__.py +0 -0
- codebase_cortex/agents/base.py +69 -0
- codebase_cortex/agents/code_analyzer.py +122 -0
- codebase_cortex/agents/doc_writer.py +356 -0
- codebase_cortex/agents/semantic_finder.py +64 -0
- codebase_cortex/agents/sprint_reporter.py +152 -0
- codebase_cortex/agents/task_creator.py +138 -0
- codebase_cortex/auth/__init__.py +0 -0
- codebase_cortex/auth/callback_server.py +80 -0
- codebase_cortex/auth/oauth.py +173 -0
- codebase_cortex/auth/token_store.py +90 -0
- codebase_cortex/cli.py +855 -0
- codebase_cortex/config.py +150 -0
- codebase_cortex/embeddings/__init__.py +0 -0
- codebase_cortex/embeddings/clustering.py +140 -0
- codebase_cortex/embeddings/indexer.py +208 -0
- codebase_cortex/embeddings/store.py +126 -0
- codebase_cortex/git/__init__.py +0 -0
- codebase_cortex/git/diff_parser.py +185 -0
- codebase_cortex/git/github_client.py +46 -0
- codebase_cortex/graph.py +111 -0
- codebase_cortex/mcp_client.py +94 -0
- codebase_cortex/notion/__init__.py +0 -0
- codebase_cortex/notion/bootstrap.py +298 -0
- codebase_cortex/notion/page_cache.py +107 -0
- codebase_cortex/state.py +77 -0
- codebase_cortex/utils/__init__.py +0 -0
- codebase_cortex/utils/json_parsing.py +59 -0
- codebase_cortex/utils/logging.py +62 -0
- codebase_cortex/utils/rate_limiter.py +56 -0
- codebase_cortex/utils/section_parser.py +139 -0
- codebase_cortex-0.1.0.dist-info/METADATA +209 -0
- codebase_cortex-0.1.0.dist-info/RECORD +37 -0
- codebase_cortex-0.1.0.dist-info/WHEEL +4 -0
- codebase_cortex-0.1.0.dist-info/entry_points.txt +3 -0
- codebase_cortex-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Configuration and LLM factory for Codebase Cortex."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
from langchain_core.language_models import BaseChatModel
|
|
11
|
+
|
|
12
|
+
CORTEX_DIR_NAME = ".cortex"


def find_cortex_dir(start: Path | None = None) -> Path:
    """Return the path of the ``.cortex`` directory under *start* (or cwd).

    Note that this does not search parent directories; it simply joins
    CORTEX_DIR_NAME onto the resolved base path. The returned path may
    not exist yet (e.g. before `cortex init` has run).
    """
    base = start if start is not None else Path.cwd()
    return base.resolve() / CORTEX_DIR_NAME
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Default model name per provider, used when LLM_MODEL is not set.
DEFAULT_MODELS: dict[str, str] = {
    "google": "gemini-2.5-flash-lite",
    "anthropic": "claude-sonnet-4-20250514",
    "openrouter": "",  # no sensible default — user must choose
}

# Curated per-provider suggestions; presumably surfaced to the user during
# setup (verify against cli.py) — not enforced anywhere in this module.
RECOMMENDED_MODELS: dict[str, list[str]] = {
    "google": [
        "gemini-2.5-flash-lite",
        "gemini-3-flash-preview",
        "gemini-2.5-pro",
    ],
    "anthropic": [
        "claude-sonnet-4-20250514",
        "claude-haiku-4-5-20251001",
    ],
    "openrouter": [
        "anthropic/claude-sonnet-4",
        "google/gemini-2.5-flash-lite",
        "google/gemini-3-flash-preview",
    ],
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class Settings:
    """Application settings loaded from .cortex/.env in the target repo."""

    # Provider key: "google" (default), "anthropic", or "openrouter".
    llm_provider: str = "google"
    # Model name; empty string means "use the provider default".
    llm_model: str = ""
    google_api_key: str = ""
    anthropic_api_key: str = ""
    openrouter_api_key: str = ""
    github_token: str = ""
    # Root of the repository being analyzed.
    repo_path: Path = field(default_factory=lambda: Path.cwd())
    # Location of the .cortex data directory (by default under cwd).
    cortex_dir: Path = field(default_factory=lambda: find_cortex_dir())
    oauth_callback_port: int = 9876

    @property
    def data_dir(self) -> Path:
        """Alias for cortex_dir; all application data lives there."""
        return self.cortex_dir

    @property
    def notion_token_path(self) -> Path:
        """Path of the stored Notion tokens JSON file."""
        return self.cortex_dir / "notion_tokens.json"

    @property
    def faiss_index_dir(self) -> Path:
        """Directory holding the FAISS embedding index."""
        return self.cortex_dir / "faiss_index"

    @property
    def page_cache_path(self) -> Path:
        """Path of the page-cache JSON file."""
        return self.cortex_dir / "page_cache.json"

    @property
    def env_path(self) -> Path:
        """Path of the .env file inside the .cortex directory."""
        return self.cortex_dir / ".env"

    @classmethod
    def from_env(cls, repo_path: Path | None = None) -> Settings:
        """Load settings from .cortex/.env in the given or current directory.

        Loads the env file (when present) into the process environment via
        load_dotenv, then reads each value with os.getenv. Note that
        load_dotenv does not override variables already set in the
        environment, so pre-existing env vars win over the file.
        """
        repo = (repo_path or Path.cwd()).resolve()
        cortex_dir = repo / CORTEX_DIR_NAME
        env_file = cortex_dir / ".env"

        if env_file.exists():
            load_dotenv(env_file)

        provider = os.getenv("LLM_PROVIDER", "google")
        return cls(
            llm_provider=provider,
            # Fall back to the provider's default model when LLM_MODEL unset.
            llm_model=os.getenv("LLM_MODEL", DEFAULT_MODELS.get(provider, "")),
            google_api_key=os.getenv("GOOGLE_API_KEY", ""),
            anthropic_api_key=os.getenv("ANTHROPIC_API_KEY", ""),
            openrouter_api_key=os.getenv("OPENROUTER_API_KEY", ""),
            github_token=os.getenv("GITHUB_TOKEN", ""),
            repo_path=repo,
            cortex_dir=cortex_dir,
        )

    @property
    def is_initialized(self) -> bool:
        """True when .cortex/.env exists (i.e. setup has been completed)."""
        return self.env_path.exists()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_llm(settings: Settings | None = None, model: str | None = None) -> BaseChatModel:
    """Create an LLM instance based on settings.

    Args:
        settings: Application settings; loaded via Settings.from_env() when None.
        model: Optional model-name override. Otherwise settings.llm_model is
            used, falling back to the provider's default.
    """
    cfg = settings if settings is not None else Settings.from_env()
    chosen = model or cfg.llm_model
    provider = cfg.llm_provider

    if provider == "anthropic":
        from langchain_anthropic import ChatAnthropic

        return ChatAnthropic(
            model=chosen or DEFAULT_MODELS["anthropic"],
            api_key=cfg.anthropic_api_key,
        )

    if provider == "openrouter":
        from langchain_openai import ChatOpenAI

        # OpenRouter has no default model, so an explicit choice is required.
        if not chosen:
            raise ValueError(
                "LLM_MODEL is required for OpenRouter. "
                "Set it in .cortex/.env (e.g. LLM_MODEL=anthropic/claude-sonnet-4)"
            )

        return ChatOpenAI(
            model=chosen,
            api_key=cfg.openrouter_api_key,
            base_url="https://openrouter.ai/api/v1",
        )

    # Any other provider value falls through to Google Gemini (the default).
    from langchain_google_genai import ChatGoogleGenerativeAI

    return ChatGoogleGenerativeAI(
        model=chosen or DEFAULT_MODELS["google"],
        google_api_key=cfg.google_api_key,
    )
|
|
File without changes
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""HDBSCAN topic clustering for Knowledge Map generation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
|
|
8
|
+
import hdbscan
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from codebase_cortex.embeddings.indexer import CodeChunk
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class TopicCluster:
    """A cluster of related code chunks representing a topic."""

    cluster_id: int
    label: str
    chunks: list[CodeChunk] = field(default_factory=list)
    centroid: np.ndarray | None = field(default=None, repr=False)

    @property
    def file_paths(self) -> list[str]:
        """Unique file paths in this cluster, sorted."""
        return sorted({chunk.file_path for chunk in self.chunks})

    @property
    def size(self) -> int:
        """Number of chunks in the cluster."""
        return len(self.chunks)


@dataclass
class TopicClusterer:
    """Groups code-chunk embeddings into topics using HDBSCAN.

    HDBSCAN is density-based: the number of clusters is discovered
    automatically and sparse points are labelled as noise (cluster_id = -1).
    """

    min_cluster_size: int = 3
    min_samples: int = 2

    def cluster(
        self,
        embeddings: np.ndarray,
        chunks: list[CodeChunk],
    ) -> list[TopicCluster]:
        """Cluster embeddings and return topic groups.

        Args:
            embeddings: Array of shape (n, dimension).
            chunks: Code chunks aligned row-for-row with the embeddings.

        Returns:
            List of TopicCluster; the HDBSCAN noise cluster is dropped.
        """
        # Too few points for density clustering: one catch-all topic.
        if len(embeddings) < self.min_cluster_size:
            if not chunks:
                return []
            return [
                TopicCluster(
                    cluster_id=0,
                    label=self._generate_label(chunks),
                    chunks=list(chunks),
                )
            ]

        model = hdbscan.HDBSCAN(
            min_cluster_size=self.min_cluster_size,
            min_samples=self.min_samples,
            metric="euclidean",
        )
        assignments = model.fit_predict(embeddings.astype(np.float64))

        # Bucket each (chunk, embedding) pair by its assigned cluster id.
        grouped: dict[int, list[tuple[CodeChunk, np.ndarray]]] = defaultdict(list)
        for assignment, chunk, vector in zip(assignments, chunks, embeddings):
            grouped[int(assignment)].append((chunk, vector))

        result: list[TopicCluster] = []
        for cid in sorted(grouped):
            if cid == -1:
                # HDBSCAN marks outliers as -1; they belong to no topic.
                continue
            members = grouped[cid]
            member_chunks = [chunk for chunk, _ in members]
            vectors = np.array([vector for _, vector in members])
            result.append(
                TopicCluster(
                    cluster_id=cid,
                    label=self._generate_label(member_chunks),
                    chunks=member_chunks,
                    centroid=vectors.mean(axis=0),
                )
            )
        return result

    @staticmethod
    def _generate_label(chunks: list[CodeChunk]) -> str:
        """Build a human-readable topic label from chunk metadata.

        Combines the most common top-level directory with the two most
        common chunk names.
        """
        if not chunks:
            return "Unknown"

        dir_counts: dict[str, int] = defaultdict(int)
        name_counts: dict[str, int] = defaultdict(int)
        for chunk in chunks:
            segments = chunk.file_path.split("/")
            # Only count a directory when the path actually has one.
            if len(segments) > 1:
                dir_counts[segments[0]] += 1
            name_counts[chunk.name] += 1

        top_dir = max(dir_counts, key=dir_counts.get) if dir_counts else ""
        # Two most common names; stable sort keeps first-seen order on ties.
        top_names = sorted(name_counts, key=name_counts.get, reverse=True)[:2]

        joined = ", ".join(top_names)
        return f"{top_dir}: {joined}" if top_dir else joined

    def to_markdown(self, topics: list[TopicCluster]) -> str:
        """Render topic clusters as a Markdown Knowledge Map."""
        if not topics:
            return "No topics identified yet.\n"

        out = ["# Knowledge Map\n"]
        out.append(f"*{sum(t.size for t in topics)} code chunks across {len(topics)} topics*\n")

        # Largest topics first; each lists at most 10 files.
        for topic in sorted(topics, key=lambda t: t.size, reverse=True):
            out.append(f"## {topic.label}")
            out.append(f"*{topic.size} chunks across {len(topic.file_paths)} files*\n")
            for path in topic.file_paths[:10]:
                out.append(f"- `{path}`")
            hidden = len(topic.file_paths) - 10
            if hidden > 0:
                out.append(f"- ... and {hidden} more files")
            out.append("")

        return "\n".join(out)
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Sentence-transformers embedding pipeline for code chunks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
# Lazy-loaded sentence-transformers model; populated by _get_model() on
# first use to avoid the slow import at startup.
_model = None
MODEL_NAME = "all-MiniLM-L6-v2"

# File extensions considered indexable source/text.
CODE_EXTENSIONS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
    ".rb", ".php", ".c", ".cpp", ".h", ".hpp", ".cs", ".swift",
    ".kt", ".scala", ".sh", ".bash", ".yml", ".yaml", ".toml",
    ".json", ".md", ".rst", ".txt",
}

# Directory names skipped entirely during the repository walk.
SKIP_DIRS = {
    ".git", ".venv", "venv", "node_modules", "__pycache__",
    ".pytest_cache", "dist", "build", ".eggs", ".tox",
    ".mypy_cache", ".ruff_cache",
}

# Files larger than this many bytes are not indexed (100KB).
MAX_FILE_SIZE = 100_000
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _get_model():
    """Return the shared SentenceTransformer, loading it on first use.

    The import and model load are deferred so that merely importing this
    module stays fast.
    """
    global _model
    if _model is not None:
        return _model
    from sentence_transformers import SentenceTransformer
    _model = SentenceTransformer(MODEL_NAME)
    return _model
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class CodeChunk:
    """A chunk of code with metadata for embedding."""

    file_path: str   # path relative to the repository root
    chunk_type: str  # "function" | "class" | "module" | "section"
    name: str        # definition name, or the file path for module chunks
    content: str     # chunk source text (may be truncated)
    start_line: int  # 1-based, inclusive
    end_line: int    # 1-based, inclusive


@dataclass
class EmbeddingIndexer:
    """Indexes code chunks using sentence-transformers.

    Walks a repository, extracts meaningful code chunks,
    and generates embeddings for similarity search.
    """

    repo_path: Path
    chunks: list[CodeChunk] = field(default_factory=list)

    def collect_chunks(self) -> list[CodeChunk]:
        """Walk the repo and extract code chunks from all indexable files.

        Unreadable files are skipped silently. The result is also stored
        on self.chunks.
        """
        self.chunks = []
        for file_path in self._iter_files():
            try:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
            except (OSError, UnicodeDecodeError):
                continue
            if not content.strip():
                continue
            rel_path = str(file_path.relative_to(self.repo_path))
            self.chunks.extend(self._chunk_file(rel_path, content))
        return self.chunks

    def embed_chunks(self, chunks: list[CodeChunk] | None = None) -> np.ndarray:
        """Generate embeddings for code chunks.

        Args:
            chunks: Chunks to embed. Uses self.chunks when None.

        Returns:
            numpy array of shape (n_chunks, embedding_dim); an empty
            array when there is nothing to embed.
        """
        # Fall back only on None: an explicit empty list must embed nothing.
        # (The previous `chunks or self.chunks` falsy check silently embedded
        # self.chunks when [] was passed.)
        if chunks is None:
            chunks = self.chunks
        if not chunks:
            return np.array([])

        model = _get_model()
        texts = [self._chunk_to_text(c) for c in chunks]
        return model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

    def embed_texts(self, texts: list[str]) -> np.ndarray:
        """Embed arbitrary text strings (for query embedding)."""
        if not texts:
            return np.array([])
        model = _get_model()
        return model.encode(texts, show_progress_bar=False, convert_to_numpy=True)

    def _iter_files(self):
        """Yield indexable files: right extension, not in a skipped dir, small enough."""
        for path in self.repo_path.rglob("*"):
            if any(skip in path.parts for skip in SKIP_DIRS):
                continue
            if path.suffix not in CODE_EXTENSIONS:
                continue
            try:
                # stat() can fail on broken symlinks or permission errors;
                # skip those entries instead of aborting the whole walk.
                if not path.is_file() or path.stat().st_size > MAX_FILE_SIZE:
                    continue
            except OSError:
                continue
            yield path

    def _chunk_file(self, rel_path: str, content: str) -> list[CodeChunk]:
        """Split a file into meaningful chunks."""
        if rel_path.endswith(".py"):
            return self._chunk_python(rel_path, content)
        # For non-Python files, chunk as a whole module.
        return self._chunk_by_sections(rel_path, content)

    def _chunk_python(self, rel_path: str, content: str) -> list[CodeChunk]:
        """Extract top-level Python functions and classes as chunks.

        Only column-0 definitions are recognized; nested/indented defs are
        folded into their enclosing top-level chunk. A file with no
        definitions becomes a single (truncated) module chunk.
        """
        chunks: list[CodeChunk] = []
        lines = content.split("\n")

        func_pattern = re.compile(r"^(async\s+)?def\s+(\w+)")
        class_pattern = re.compile(r"^class\s+(\w+)")

        current_def = None
        current_start = 0
        current_name = ""
        current_type = ""

        def flush(end: int) -> None:
            # Emit the in-progress definition spanning lines [current_start, end).
            chunk_content = "\n".join(lines[current_start:end])
            if chunk_content.strip():
                chunks.append(CodeChunk(
                    file_path=rel_path,
                    chunk_type=current_type,
                    name=current_name,
                    content=chunk_content,
                    start_line=current_start + 1,
                    end_line=end,
                ))

        for i, line in enumerate(lines):
            func_match = func_pattern.match(line)
            class_match = class_pattern.match(line)
            if not (func_match or class_match):
                continue

            if current_def is not None:
                flush(i)

            if func_match:
                current_type = "function"
                current_name = func_match.group(2)
            else:
                current_type = "class"
                current_name = class_match.group(1)
            current_def = True
            current_start = i

        # Save the last definition, which runs to end of file.
        if current_def is not None:
            flush(len(lines))

        # If no definitions found, treat whole file as one module chunk.
        if not chunks and content.strip():
            chunks.append(CodeChunk(
                file_path=rel_path,
                chunk_type="module",
                name=rel_path,
                content=content[:3000],
                start_line=1,
                end_line=len(lines),
            ))

        return chunks

    def _chunk_by_sections(self, rel_path: str, content: str) -> list[CodeChunk]:
        """Chunk non-Python files as whole modules (truncated if large)."""
        lines = content.split("\n")
        return [CodeChunk(
            file_path=rel_path,
            chunk_type="module",
            name=rel_path,
            content=content[:3000],
            start_line=1,
            end_line=len(lines),
        )]

    @staticmethod
    def _chunk_to_text(chunk: CodeChunk) -> str:
        """Convert a chunk to a text string suitable for embedding."""
        return f"{chunk.file_path} ({chunk.chunk_type}: {chunk.name})\n{chunk.content[:2000]}"
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""FAISS index management for code embeddings."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
|
|
9
|
+
import faiss
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from codebase_cortex.embeddings.indexer import CodeChunk
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class SearchResult:
    """A single search result from the FAISS index."""

    chunk: CodeChunk  # the matched code chunk with its metadata
    distance: float  # raw L2 distance between query and chunk embeddings
    score: float  # 1 / (1 + distance), higher = more similar
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class FAISSStore:
    """Flat-L2 FAISS index over code-chunk embeddings.

    Vector data is persisted as `index.faiss`; chunk metadata lives next
    to it in a `chunks.json` sidecar (content truncated for storage).
    """

    index_dir: Path
    index: faiss.IndexFlatL2 | None = field(default=None, repr=False)
    chunks: list[CodeChunk] = field(default_factory=list)
    _dimension: int = 384  # all-MiniLM-L6-v2 output dimension

    def build(self, embeddings: np.ndarray, chunks: list[CodeChunk]) -> None:
        """Replace the index contents with the given embeddings and chunks.

        Args:
            embeddings: Array of shape (n, dimension).
            chunks: Corresponding code chunks (must match embeddings length).
        """
        if len(embeddings) == 0:
            # Keep an empty-but-valid index so search/save still work.
            self.index = faiss.IndexFlatL2(self._dimension)
            self.chunks = []
            return

        self._dimension = embeddings.shape[1]
        fresh = faiss.IndexFlatL2(self._dimension)
        fresh.add(embeddings.astype(np.float32))
        self.index = fresh
        self.chunks = list(chunks)

    def search(self, query_embedding: np.ndarray, k: int = 5) -> list[SearchResult]:
        """Return up to *k* nearest chunks, most similar first.

        Args:
            query_embedding: Array of shape (1, dimension) or (dimension,).
            k: Number of results to return.
        """
        if self.index is None or self.index.ntotal == 0:
            return []

        query = query_embedding.reshape(1, -1).astype(np.float32)
        limit = min(k, self.index.ntotal)
        distances, indices = self.index.search(query, limit)

        hits: list[SearchResult] = []
        for dist, pos in zip(distances[0], indices[0]):
            # FAISS pads with -1 when fewer than `limit` vectors match.
            if 0 <= pos < len(self.chunks):
                d = float(dist)
                hits.append(SearchResult(
                    chunk=self.chunks[pos],
                    distance=d,
                    score=1.0 / (1.0 + d),
                ))
        return hits

    def save(self) -> None:
        """Persist the index vectors and chunk metadata under index_dir."""
        self.index_dir.mkdir(parents=True, exist_ok=True)

        if self.index is not None:
            faiss.write_index(self.index, str(self.index_dir / "index.faiss"))

        records = [
            {
                "file_path": c.file_path,
                "chunk_type": c.chunk_type,
                "name": c.name,
                "content": c.content[:500],  # Truncate for storage
                "start_line": c.start_line,
                "end_line": c.end_line,
            }
            for c in self.chunks
        ]
        (self.index_dir / "chunks.json").write_text(json.dumps(records, indent=2))

    def load(self) -> bool:
        """Load a previously saved index from disk.

        Returns:
            True if loaded successfully, False if no index exists.
        """
        index_file = self.index_dir / "index.faiss"
        meta_file = self.index_dir / "chunks.json"

        if not (index_file.exists() and meta_file.exists()):
            return False

        self.index = faiss.read_index(str(index_file))
        self._dimension = self.index.d
        self.chunks = [CodeChunk(**record) for record in json.loads(meta_file.read_text())]
        return True

    @property
    def size(self) -> int:
        """Number of vectors currently in the index."""
        return 0 if self.index is None else self.index.ntotal
|
|
File without changes
|