devscontext 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devscontext/__init__.py +3 -0
- devscontext/adapters/__init__.py +23 -0
- devscontext/adapters/base.py +105 -0
- devscontext/adapters/fireflies.py +585 -0
- devscontext/adapters/gmail.py +580 -0
- devscontext/adapters/jira.py +639 -0
- devscontext/adapters/local_docs.py +984 -0
- devscontext/adapters/slack.py +804 -0
- devscontext/agents/__init__.py +28 -0
- devscontext/agents/preprocessor.py +775 -0
- devscontext/agents/watcher.py +265 -0
- devscontext/cache.py +151 -0
- devscontext/cli.py +727 -0
- devscontext/config.py +264 -0
- devscontext/constants.py +107 -0
- devscontext/core.py +582 -0
- devscontext/exceptions.py +148 -0
- devscontext/logging.py +181 -0
- devscontext/models.py +504 -0
- devscontext/plugins/__init__.py +49 -0
- devscontext/plugins/base.py +321 -0
- devscontext/plugins/registry.py +544 -0
- devscontext/py.typed +0 -0
- devscontext/rag/__init__.py +113 -0
- devscontext/rag/embeddings.py +296 -0
- devscontext/rag/index.py +323 -0
- devscontext/server.py +374 -0
- devscontext/storage.py +321 -0
- devscontext/synthesis.py +1057 -0
- devscontext/utils.py +297 -0
- devscontext-0.1.0.dist-info/METADATA +253 -0
- devscontext-0.1.0.dist-info/RECORD +35 -0
- devscontext-0.1.0.dist-info/WHEEL +4 -0
- devscontext-0.1.0.dist-info/entry_points.txt +2 -0
- devscontext-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Embedding providers for RAG-based document search.
|
|
2
|
+
|
|
3
|
+
This module provides abstract and concrete embedding providers that can
|
|
4
|
+
generate vector embeddings for text. These embeddings are used for semantic
|
|
5
|
+
similarity search in the document index.
|
|
6
|
+
|
|
7
|
+
Supported providers:
|
|
8
|
+
- local: Uses sentence-transformers (all-MiniLM-L6-v2 by default)
|
|
9
|
+
- openai: Uses OpenAI's text-embedding-3-small
|
|
10
|
+
- ollama: Uses locally-hosted Ollama models (mxbai-embed-large, nomic-embed-text)
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
provider = LocalEmbeddingProvider("all-MiniLM-L6-v2")
|
|
14
|
+
embeddings = await provider.embed(["Hello world", "How are you?"])
|
|
15
|
+
query_emb = await provider.embed_query("greeting")
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import asyncio
|
|
21
|
+
import os
|
|
22
|
+
from abc import ABC, abstractmethod
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
from devscontext.logging import get_logger
|
|
26
|
+
|
|
27
|
+
logger = get_logger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class EmbeddingProvider(ABC):
    """Common interface for every embedding backend.

    A provider turns text into numeric vectors so that two pieces of text
    can be compared for semantic similarity. Concrete subclasses implement
    ``embed``; ``embed_query`` is derived from it.
    """

    def __init__(self, model: str) -> None:
        """Remember which model this provider should use.

        Args:
            model: Model identifier to use for embeddings.
        """
        # The vector size is filled in later by the concrete provider.
        self._dimension: int | None = None
        self.model = model

    @property
    def dimension(self) -> int:
        """Size of the vectors produced by this provider.

        Returns:
            Number of dimensions in the embedding vectors.

        Raises:
            RuntimeError: If the dimension has not been recorded yet.
        """
        known = self._dimension
        if known is not None:
            return known
        raise RuntimeError("Dimension unknown until first embedding is generated")

    @abstractmethod
    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, one per input text.
        """
        ...

    async def embed_query(self, query: str) -> list[float]:
        """Embed a single query string.

        Delegates to ``embed`` with a one-element batch and returns the
        first vector. Subclasses may override this when a backend offers a
        query-specific code path.

        Args:
            query: Query text to embed.

        Returns:
            Embedding vector for the query.
        """
        vectors = await self.embed([query])
        return vectors[0]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class LocalEmbeddingProvider(EmbeddingProvider):
    """Embedding provider using sentence-transformers.

    Uses locally-running models via the sentence-transformers library.
    The default model (all-MiniLM-L6-v2) is fast and produces 384-dimensional
    embeddings suitable for semantic similarity tasks.

    The model is loaded lazily on the first ``embed`` call; ``dimension``
    is only known once that load has happened.

    Requires: pip install sentence-transformers
    """

    def __init__(self, model: str = "all-MiniLM-L6-v2") -> None:
        """Initialize with a sentence-transformers model.

        Args:
            model: Model name from HuggingFace (default: all-MiniLM-L6-v2).
        """
        super().__init__(model)
        # Populated by _load_model() the first time embeddings are requested.
        self._model_instance: Any = None

    def _load_model(self) -> Any:  # Returns SentenceTransformer
        """Lazy-load the sentence-transformers model.

        The import is deferred so the package works without the optional
        ``rag`` extra until embeddings are actually needed.

        Returns:
            The cached ``SentenceTransformer`` instance.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if self._model_instance is None:
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as e:
                raise ImportError(
                    "sentence-transformers not installed. "
                    "Install with: pip install devscontext[rag]"
                ) from e

            logger.info(
                "Loading sentence-transformers model",
                extra={"model": self.model},
            )
            self._model_instance = SentenceTransformer(self.model)
            self._dimension = self._model_instance.get_sentence_embedding_dimension()

        return self._model_instance

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings using sentence-transformers.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, one per input text. An empty input
            returns an empty list without loading the model.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if not texts:
            return []

        # NOTE: the (first-time) model load runs synchronously on the calling
        # thread; only the encode step below is offloaded to the executor.
        model = self._load_model()

        # Run in thread pool to avoid blocking async event loop.
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine; get_event_loop() is discouraged here.
        loop = asyncio.get_running_loop()
        embeddings = await loop.run_in_executor(
            None,
            lambda: model.encode(texts, show_progress_bar=False, convert_to_numpy=True),
        )

        # Convert numpy array to list of lists
        return [emb.tolist() for emb in embeddings]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class OpenAIEmbeddingProvider(EmbeddingProvider):
    """Embedding backend that calls OpenAI's embeddings endpoint.

    Defaults to the text-embedding-3-small model, which produces
    1536-dimensional embeddings with excellent semantic quality.

    Requires: pip install openai
    Environment: OPENAI_API_KEY must be set.
    """

    def __init__(self, model: str = "text-embedding-3-small") -> None:
        """Set up the provider for a given OpenAI embedding model.

        Args:
            model: OpenAI embedding model name (default: text-embedding-3-small).
        """
        super().__init__(model)
        # Created on demand by _get_client().
        self._client: Any = None

    def _get_client(self) -> Any:  # Returns AsyncOpenAI
        """Return the shared AsyncOpenAI client, creating it on first use.

        Raises:
            ImportError: If the openai package is not installed.
            ValueError: If OPENAI_API_KEY is missing or empty.
        """
        if self._client is not None:
            return self._client

        try:
            from openai import AsyncOpenAI
        except ImportError as e:
            raise ImportError(
                "openai package not installed. Install with: pip install devscontext[openai]"
            ) from e

        key = os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError("OPENAI_API_KEY environment variable not set")

        self._client = AsyncOpenAI(api_key=key)
        return self._client

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts with a single OpenAI API request.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, in the order the API returned them.
            An empty input returns an empty list without any API call.
        """
        if not texts:
            return []

        api = self._get_client()

        # The whole batch goes out in one request.
        result = await api.embeddings.create(
            model=self.model,
            input=texts,
        )

        vectors = [entry.embedding for entry in result.data]

        # Record the vector size the first time we see a real embedding.
        if self._dimension is None and vectors:
            self._dimension = len(vectors[0])

        return vectors
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class OllamaEmbeddingProvider(EmbeddingProvider):
    """Embedding backend that talks to an Ollama server over HTTP.

    Works with Ollama embedding models such as mxbai-embed-large or
    nomic-embed-text. Requires Ollama to be running and reachable.

    Default model: mxbai-embed-large (1024 dimensions)
    Alternative: nomic-embed-text (768 dimensions)

    Requires: Ollama installed and running (https://ollama.ai)
    """

    def __init__(
        self, model: str = "mxbai-embed-large", base_url: str = "http://localhost:11434"
    ) -> None:
        """Set up the provider for a given Ollama model and server.

        Args:
            model: Ollama model name (default: mxbai-embed-large).
            base_url: Ollama API base URL (default: http://localhost:11434).
                When the OLLAMA_BASE_URL environment variable is set, its
                value is used instead of this argument.
        """
        super().__init__(model)
        # Created on demand by _get_client().
        self._client: Any = None
        self.base_url = os.environ.get("OLLAMA_BASE_URL", base_url)

    def _get_client(self) -> Any:  # Returns httpx.AsyncClient
        """Return the shared HTTP client, creating it on first use.

        Raises:
            ImportError: If httpx cannot be imported.
        """
        if self._client is not None:
            return self._client

        try:
            import httpx
        except ImportError as e:
            raise ImportError("httpx not installed (should be a core dependency)") from e

        self._client = httpx.AsyncClient(
            base_url=self.base_url,
            timeout=60.0,  # Embedding can take time for large batches
        )
        return self._client

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed texts by posting each one to Ollama's ``/api/embeddings``.

        One request is sent per text, sequentially, because this endpoint
        takes a single prompt.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, one per input text. A response
            without an "embedding" field contributes an empty list.
        """
        if not texts:
            return []

        http = self._get_client()
        vectors = []

        for prompt in texts:
            reply = await http.post(
                "/api/embeddings",
                json={"model": self.model, "prompt": prompt},
            )
            # Surface HTTP errors instead of parsing an error body.
            reply.raise_for_status()

            vector = reply.json().get("embedding", [])
            vectors.append(vector)

            # Record the vector size from the first non-empty embedding.
            if vector and self._dimension is None:
                self._dimension = len(vector)

        return vectors

    async def close(self) -> None:
        """Close the HTTP client, if one was created."""
        if self._client is None:
            return
        await self._client.aclose()
        self._client = None
|
devscontext/rag/index.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
"""Document index for RAG-based semantic search.
|
|
2
|
+
|
|
3
|
+
This module provides a document index that stores section metadata and
|
|
4
|
+
embeddings, enabling semantic similarity search using cosine similarity.
|
|
5
|
+
|
|
6
|
+
The index is stored as a JSON file with the following structure:
|
|
7
|
+
{
|
|
8
|
+
"model": "all-MiniLM-L6-v2",
|
|
9
|
+
"dimension": 384,
|
|
10
|
+
"indexed_at": "2024-03-20T12:00:00Z",
|
|
11
|
+
"sections": [
|
|
12
|
+
{"file_path": "...", "section_title": "...", "content": "...", "doc_type": "..."}
|
|
13
|
+
],
|
|
14
|
+
"embeddings": [[0.1, 0.2, ...], ...]
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
index = DocumentIndex(".devscontext/doc_index.json")
|
|
19
|
+
index.load()
|
|
20
|
+
|
|
21
|
+
# Search for similar sections
|
|
22
|
+
results = index.search(query_embedding, top_k=10, threshold=0.3)
|
|
23
|
+
for section, score in results:
|
|
24
|
+
print(f"{section.section_title}: {score:.3f}")
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import json
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from datetime import UTC, datetime
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Any
|
|
34
|
+
|
|
35
|
+
from devscontext.logging import get_logger
|
|
36
|
+
|
|
37
|
+
logger = get_logger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class IndexedSection:
    """One section of a document as it is stored in the index.

    This mirrors ParsedSection from local_docs but is independent to avoid
    circular imports and allow the index to work without the full adapter.
    """

    file_path: str
    section_title: str | None
    content: str
    doc_type: str  # "architecture", "standards", "adr", "other"

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict of the four fields, ready for JSON output."""
        payload: dict[str, Any] = {}
        payload["file_path"] = self.file_path
        payload["section_title"] = self.section_title
        payload["content"] = self.content
        payload["doc_type"] = self.doc_type
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> IndexedSection:
        """Build a section from a dict loaded from JSON.

        ``file_path`` and ``content`` are required (a missing key raises
        ``KeyError``); ``section_title`` defaults to ``None`` and
        ``doc_type`` defaults to ``"other"``.
        """
        path = data["file_path"]
        title = data.get("section_title")
        body = data["content"]
        kind = data.get("doc_type", "other")
        return cls(file_path=path, section_title=title, content=body, doc_type=kind)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class DocumentIndex:
|
|
74
|
+
"""Index for storing and searching document embeddings.
|
|
75
|
+
|
|
76
|
+
Uses NumPy for efficient cosine similarity computation and stores
|
|
77
|
+
the index as a JSON file for simplicity and portability.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
def __init__(self, index_path: str = ".devscontext/doc_index.json") -> None:
|
|
81
|
+
"""Initialize the document index.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
index_path: Path to the JSON index file.
|
|
85
|
+
"""
|
|
86
|
+
self._index_path = Path(index_path)
|
|
87
|
+
self._model: str | None = None
|
|
88
|
+
self._dimension: int | None = None
|
|
89
|
+
self._indexed_at: datetime | None = None
|
|
90
|
+
self._sections: list[IndexedSection] = []
|
|
91
|
+
self._embeddings: list[list[float]] = []
|
|
92
|
+
self._embeddings_array: Any = None # Cached numpy array
|
|
93
|
+
|
|
94
|
+
@property
|
|
95
|
+
def is_loaded(self) -> bool:
|
|
96
|
+
"""Check if index has been loaded or built."""
|
|
97
|
+
return len(self._sections) > 0
|
|
98
|
+
|
|
99
|
+
@property
|
|
100
|
+
def model(self) -> str | None:
|
|
101
|
+
"""Return the model used for embeddings."""
|
|
102
|
+
return self._model
|
|
103
|
+
|
|
104
|
+
@property
|
|
105
|
+
def dimension(self) -> int | None:
|
|
106
|
+
"""Return the embedding dimension."""
|
|
107
|
+
return self._dimension
|
|
108
|
+
|
|
109
|
+
@property
|
|
110
|
+
def section_count(self) -> int:
|
|
111
|
+
"""Return number of indexed sections."""
|
|
112
|
+
return len(self._sections)
|
|
113
|
+
|
|
114
|
+
def exists(self) -> bool:
|
|
115
|
+
"""Check if the index file exists."""
|
|
116
|
+
return self._index_path.exists()
|
|
117
|
+
|
|
118
|
+
def load(self) -> bool:
|
|
119
|
+
"""Load the index from disk.
|
|
120
|
+
|
|
121
|
+
Returns:
|
|
122
|
+
True if successfully loaded, False if file doesn't exist.
|
|
123
|
+
|
|
124
|
+
Raises:
|
|
125
|
+
ValueError: If the index file is corrupted or invalid.
|
|
126
|
+
"""
|
|
127
|
+
if not self._index_path.exists():
|
|
128
|
+
logger.debug("Index file not found", extra={"path": str(self._index_path)})
|
|
129
|
+
return False
|
|
130
|
+
|
|
131
|
+
try:
|
|
132
|
+
with open(self._index_path) as f:
|
|
133
|
+
data = json.load(f)
|
|
134
|
+
|
|
135
|
+
self._model = data.get("model")
|
|
136
|
+
self._dimension = data.get("dimension")
|
|
137
|
+
|
|
138
|
+
indexed_at_str = data.get("indexed_at")
|
|
139
|
+
if indexed_at_str:
|
|
140
|
+
self._indexed_at = datetime.fromisoformat(indexed_at_str)
|
|
141
|
+
|
|
142
|
+
self._sections = [IndexedSection.from_dict(s) for s in data.get("sections", [])]
|
|
143
|
+
self._embeddings = data.get("embeddings", [])
|
|
144
|
+
self._embeddings_array = None # Clear cached array
|
|
145
|
+
|
|
146
|
+
logger.info(
|
|
147
|
+
"Loaded document index",
|
|
148
|
+
extra={
|
|
149
|
+
"path": str(self._index_path),
|
|
150
|
+
"sections": len(self._sections),
|
|
151
|
+
"model": self._model,
|
|
152
|
+
},
|
|
153
|
+
)
|
|
154
|
+
return True
|
|
155
|
+
|
|
156
|
+
except json.JSONDecodeError as e:
|
|
157
|
+
raise ValueError(f"Invalid index file format: {e}") from e
|
|
158
|
+
except KeyError as e:
|
|
159
|
+
raise ValueError(f"Missing required field in index: {e}") from e
|
|
160
|
+
|
|
161
|
+
def save(self) -> None:
|
|
162
|
+
"""Save the index to disk.
|
|
163
|
+
|
|
164
|
+
Creates parent directories if needed.
|
|
165
|
+
"""
|
|
166
|
+
self._index_path.parent.mkdir(parents=True, exist_ok=True)
|
|
167
|
+
|
|
168
|
+
data = {
|
|
169
|
+
"model": self._model,
|
|
170
|
+
"dimension": self._dimension,
|
|
171
|
+
"indexed_at": (self._indexed_at.isoformat() if self._indexed_at else None),
|
|
172
|
+
"sections": [s.to_dict() for s in self._sections],
|
|
173
|
+
"embeddings": self._embeddings,
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
with open(self._index_path, "w") as f:
|
|
177
|
+
json.dump(data, f, indent=2)
|
|
178
|
+
|
|
179
|
+
logger.info(
|
|
180
|
+
"Saved document index",
|
|
181
|
+
extra={
|
|
182
|
+
"path": str(self._index_path),
|
|
183
|
+
"sections": len(self._sections),
|
|
184
|
+
},
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
def add_sections(
|
|
188
|
+
self,
|
|
189
|
+
sections: list[IndexedSection],
|
|
190
|
+
embeddings: list[list[float]],
|
|
191
|
+
model: str,
|
|
192
|
+
) -> None:
|
|
193
|
+
"""Add sections with their embeddings to the index.
|
|
194
|
+
|
|
195
|
+
This replaces any existing content in the index.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
sections: List of document sections.
|
|
199
|
+
embeddings: Corresponding embedding vectors.
|
|
200
|
+
model: Name of the model used for embeddings.
|
|
201
|
+
|
|
202
|
+
Raises:
|
|
203
|
+
ValueError: If sections and embeddings have different lengths.
|
|
204
|
+
"""
|
|
205
|
+
if len(sections) != len(embeddings):
|
|
206
|
+
raise ValueError(
|
|
207
|
+
f"Sections ({len(sections)}) and embeddings ({len(embeddings)}) "
|
|
208
|
+
"must have the same length"
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
self._sections = sections
|
|
212
|
+
self._embeddings = embeddings
|
|
213
|
+
self._model = model
|
|
214
|
+
self._dimension = len(embeddings[0]) if embeddings else None
|
|
215
|
+
self._indexed_at = datetime.now(UTC)
|
|
216
|
+
self._embeddings_array = None # Clear cached array
|
|
217
|
+
|
|
218
|
+
logger.info(
|
|
219
|
+
"Added sections to index",
|
|
220
|
+
extra={
|
|
221
|
+
"sections": len(sections),
|
|
222
|
+
"model": model,
|
|
223
|
+
"dimension": self._dimension,
|
|
224
|
+
},
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
def search(
|
|
228
|
+
self,
|
|
229
|
+
query_embedding: list[float],
|
|
230
|
+
top_k: int = 10,
|
|
231
|
+
threshold: float = 0.0,
|
|
232
|
+
) -> list[tuple[IndexedSection, float]]:
|
|
233
|
+
"""Search for similar sections using cosine similarity.
|
|
234
|
+
|
|
235
|
+
Args:
|
|
236
|
+
query_embedding: Query vector to search with.
|
|
237
|
+
top_k: Maximum number of results to return.
|
|
238
|
+
threshold: Minimum similarity score (0-1) to include.
|
|
239
|
+
|
|
240
|
+
Returns:
|
|
241
|
+
List of (section, similarity_score) tuples, sorted by score descending.
|
|
242
|
+
"""
|
|
243
|
+
if not self._sections or not self._embeddings:
|
|
244
|
+
return []
|
|
245
|
+
|
|
246
|
+
try:
|
|
247
|
+
import numpy as np
|
|
248
|
+
except ImportError as e:
|
|
249
|
+
raise ImportError(
|
|
250
|
+
"numpy not installed. Install with: pip install devscontext[rag]"
|
|
251
|
+
) from e
|
|
252
|
+
|
|
253
|
+
# Cache the embeddings array for repeated queries
|
|
254
|
+
if self._embeddings_array is None:
|
|
255
|
+
self._embeddings_array = np.array(self._embeddings)
|
|
256
|
+
|
|
257
|
+
query_vec = np.array(query_embedding)
|
|
258
|
+
|
|
259
|
+
# Compute cosine similarity
|
|
260
|
+
# cosine_sim = (A . B) / (||A|| * ||B||)
|
|
261
|
+
query_norm = np.linalg.norm(query_vec)
|
|
262
|
+
if query_norm == 0:
|
|
263
|
+
return []
|
|
264
|
+
|
|
265
|
+
doc_norms = np.linalg.norm(self._embeddings_array, axis=1)
|
|
266
|
+
# Avoid division by zero
|
|
267
|
+
doc_norms = np.where(doc_norms == 0, 1, doc_norms)
|
|
268
|
+
|
|
269
|
+
similarities = np.dot(self._embeddings_array, query_vec) / (doc_norms * query_norm)
|
|
270
|
+
|
|
271
|
+
# Filter by threshold and get top-k
|
|
272
|
+
results = []
|
|
273
|
+
for idx, score in enumerate(similarities):
|
|
274
|
+
if score >= threshold:
|
|
275
|
+
results.append((idx, float(score)))
|
|
276
|
+
|
|
277
|
+
# Sort by score descending
|
|
278
|
+
results.sort(key=lambda x: x[1], reverse=True)
|
|
279
|
+
|
|
280
|
+
# Return top-k with section objects
|
|
281
|
+
return [(self._sections[idx], score) for idx, score in results[:top_k]]
|
|
282
|
+
|
|
283
|
+
def clear(self) -> None:
|
|
284
|
+
"""Clear all data from the index."""
|
|
285
|
+
self._sections = []
|
|
286
|
+
self._embeddings = []
|
|
287
|
+
self._embeddings_array = None
|
|
288
|
+
self._indexed_at = None
|
|
289
|
+
logger.info("Cleared document index")
|
|
290
|
+
|
|
291
|
+
def get_stats(self) -> dict[str, Any]:
|
|
292
|
+
"""Get statistics about the index.
|
|
293
|
+
|
|
294
|
+
Returns:
|
|
295
|
+
Dictionary with index statistics.
|
|
296
|
+
"""
|
|
297
|
+
doc_types: dict[str, int] = {}
|
|
298
|
+
for section in self._sections:
|
|
299
|
+
doc_types[section.doc_type] = doc_types.get(section.doc_type, 0) + 1
|
|
300
|
+
|
|
301
|
+
return {
|
|
302
|
+
"exists": self.exists(),
|
|
303
|
+
"loaded": self.is_loaded,
|
|
304
|
+
"model": self._model,
|
|
305
|
+
"dimension": self._dimension,
|
|
306
|
+
"section_count": len(self._sections),
|
|
307
|
+
"indexed_at": (self._indexed_at.isoformat() if self._indexed_at else None),
|
|
308
|
+
"doc_types": doc_types,
|
|
309
|
+
"index_path": str(self._index_path),
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
def delete(self) -> bool:
|
|
313
|
+
"""Delete the index file from disk.
|
|
314
|
+
|
|
315
|
+
Returns:
|
|
316
|
+
True if deleted, False if file didn't exist.
|
|
317
|
+
"""
|
|
318
|
+
if self._index_path.exists():
|
|
319
|
+
self._index_path.unlink()
|
|
320
|
+
self.clear()
|
|
321
|
+
logger.info("Deleted index file", extra={"path": str(self._index_path)})
|
|
322
|
+
return True
|
|
323
|
+
return False
|