devscontext 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
1
+ """Embedding providers for RAG-based document search.
2
+
3
+ This module provides abstract and concrete embedding providers that can
4
+ generate vector embeddings for text. These embeddings are used for semantic
5
+ similarity search in the document index.
6
+
7
+ Supported providers:
8
+ - local: Uses sentence-transformers (all-MiniLM-L6-v2 by default)
9
+ - openai: Uses OpenAI's text-embedding-3-small
10
+ - ollama: Uses locally-hosted Ollama models (mxbai-embed-large, nomic-embed-text)
11
+
12
+ Example:
13
+ provider = LocalEmbeddingProvider("all-MiniLM-L6-v2")
14
+ embeddings = await provider.embed(["Hello world", "How are you?"])
15
+ query_emb = await provider.embed_query("greeting")
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import asyncio
21
+ import os
22
+ from abc import ABC, abstractmethod
23
+ from typing import Any
24
+
25
+ from devscontext.logging import get_logger
26
+
27
+ logger = get_logger(__name__)
28
+
29
+
30
class EmbeddingProvider(ABC):
    """Common interface implemented by every embedding backend.

    A provider turns text into numeric vectors so that pieces of text can
    be compared by semantic similarity.
    """

    def __init__(self, model: str) -> None:
        """Remember the model identifier; the dimension starts out unknown.

        Args:
            model: Model identifier to use for embeddings.
        """
        # Subclasses fill this in once they learn the vector size.
        self._dimension: int | None = None
        self.model = model

    @property
    def dimension(self) -> int:
        """Number of components in the vectors this provider produces.

        Returns:
            The recorded embedding dimension.

        Raises:
            RuntimeError: If the dimension has not been recorded yet.
        """
        known = self._dimension
        if known is not None:
            return known
        raise RuntimeError("Dimension unknown until first embedding is generated")

    @abstractmethod
    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts.

        Args:
            texts: Text strings to embed.

        Returns:
            One embedding vector per input text.
        """
        ...

    async def embed_query(self, query: str) -> list[float]:
        """Embed a single query string.

        Delegates to embed() with a one-element batch and returns the first
        vector. Subclasses may override this for query-specific handling.

        Args:
            query: Query text to embed.

        Returns:
            Embedding vector for the query.
        """
        return (await self.embed([query]))[0]
86
+
87
+
88
class LocalEmbeddingProvider(EmbeddingProvider):
    """Embedding provider using sentence-transformers.

    Uses locally-running models via the sentence-transformers library.
    The default model (all-MiniLM-L6-v2) is fast and produces 384-dimensional
    embeddings suitable for semantic similarity tasks.

    Requires: pip install devscontext[rag] (provides sentence-transformers)
    """

    def __init__(self, model: str = "all-MiniLM-L6-v2") -> None:
        """Initialize with a sentence-transformers model.

        The model itself is not loaded here; it is loaded lazily on first use.

        Args:
            model: Model name from HuggingFace (default: all-MiniLM-L6-v2).
        """
        # Function-scope import, matching the lazy-import style of this class.
        import threading

        super().__init__(model)
        self._model_instance: Any = None
        # embed() loads the model from worker threads, so guard the lazy
        # initialisation against two threads loading it at the same time.
        self._load_lock = threading.Lock()

    def _load_model(self) -> Any:  # Returns SentenceTransformer
        """Lazy-load the sentence-transformers model.

        Safe to call from several threads: the model is constructed once.

        Returns:
            The loaded SentenceTransformer instance.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if self._model_instance is None:
            with self._load_lock:
                # Re-check under the lock: another thread may have finished
                # loading while this one was waiting.
                if self._model_instance is None:
                    try:
                        from sentence_transformers import SentenceTransformer
                    except ImportError as e:
                        raise ImportError(
                            "sentence-transformers not installed. "
                            "Install with: pip install devscontext[rag]"
                        ) from e

                    logger.info(
                        "Loading sentence-transformers model",
                        extra={"model": self.model},
                    )
                    instance = SentenceTransformer(self.model)
                    self._dimension = instance.get_sentence_embedding_dimension()
                    # Publish last so other threads never see a model whose
                    # dimension has not been recorded yet.
                    self._model_instance = instance

        return self._model_instance

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings using sentence-transformers.

        Both model loading and encoding run in the default executor so that
        neither blocks the event loop.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, one per input text.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if not texts:
            return []

        def _encode() -> list[list[float]]:
            # Runs in a worker thread: load (first call only), then encode.
            model = self._load_model()
            vectors = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
            # Convert numpy array to list of lists
            return [vec.tolist() for vec in vectors]

        # get_running_loop() is the recommended accessor inside a coroutine.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _encode)
150
+
151
+
152
class OpenAIEmbeddingProvider(EmbeddingProvider):
    """Embedding provider using OpenAI's embedding API.

    Uses OpenAI's text-embedding-3-small model by default, which produces
    1536-dimensional embeddings with excellent semantic quality.

    Requires: pip install openai
    Environment: OPENAI_API_KEY must be set.
    """

    # The embeddings endpoint rejects input arrays longer than this, so
    # larger batches are split into several requests.
    _MAX_BATCH_SIZE = 2048

    def __init__(self, model: str = "text-embedding-3-small") -> None:
        """Initialize with an OpenAI embedding model.

        The API client is created lazily on first use.

        Args:
            model: OpenAI embedding model name (default: text-embedding-3-small).
        """
        super().__init__(model)
        self._client: Any = None

    def _get_client(self) -> Any:  # Returns AsyncOpenAI
        """Lazy-load the OpenAI client.

        Returns:
            A cached AsyncOpenAI client.

        Raises:
            ImportError: If the openai package is not installed.
            ValueError: If OPENAI_API_KEY is unset or empty.
        """
        if self._client is None:
            try:
                from openai import AsyncOpenAI
            except ImportError as e:
                raise ImportError(
                    "openai package not installed. Install with: pip install devscontext[openai]"
                ) from e

            api_key = os.environ.get("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OPENAI_API_KEY environment variable not set")

            self._client = AsyncOpenAI(api_key=api_key)

        return self._client

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings using OpenAI's API.

        Inputs are sent in chunks of at most ``_MAX_BATCH_SIZE`` texts; a
        list that fits in one chunk results in exactly one request.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, one per input text, in input order.
        """
        if not texts:
            return []

        client = self._get_client()
        embeddings: list[list[float]] = []

        for start in range(0, len(texts), self._MAX_BATCH_SIZE):
            batch = texts[start : start + self._MAX_BATCH_SIZE]
            response = await client.embeddings.create(
                model=self.model,
                input=batch,
            )
            embeddings.extend(item.embedding for item in response.data)

        # Record the dimension the first time a vector is seen.
        if embeddings and self._dimension is None:
            self._dimension = len(embeddings[0])

        return embeddings
215
+
216
+
217
class OllamaEmbeddingProvider(EmbeddingProvider):
    """Embedding provider using locally-hosted Ollama models.

    Uses Ollama's embedding API for models like mxbai-embed-large or
    nomic-embed-text. Requires Ollama to be running locally.

    Default model: mxbai-embed-large (1024 dimensions)
    Alternative: nomic-embed-text (768 dimensions)

    Requires: Ollama installed and running (https://ollama.ai)
    """

    def __init__(
        self, model: str = "mxbai-embed-large", base_url: str = "http://localhost:11434"
    ) -> None:
        """Initialize with an Ollama embedding model.

        Args:
            model: Ollama model name (default: mxbai-embed-large).
            base_url: Ollama API base URL (default: http://localhost:11434).
                Note that the OLLAMA_BASE_URL environment variable, when
                set, takes precedence over this argument.
        """
        super().__init__(model)
        self.base_url = os.environ.get("OLLAMA_BASE_URL", base_url)
        self._client: Any = None

    def _get_client(self) -> Any:  # Returns httpx.AsyncClient
        """Lazy-load the HTTP client.

        Returns:
            A cached httpx.AsyncClient bound to ``self.base_url``.

        Raises:
            ImportError: If httpx is not installed.
        """
        if self._client is None:
            try:
                import httpx
            except ImportError as e:
                raise ImportError("httpx not installed (should be a core dependency)") from e

            self._client = httpx.AsyncClient(
                base_url=self.base_url,
                timeout=60.0,  # Embedding can take time for large batches
            )

        return self._client

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings using Ollama's API.

        One request is made per text, sequentially, against the
        ``/api/embeddings`` endpoint.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors, one per input text.

        Raises:
            ValueError: If a response contains no (or an empty) embedding.
                Storing an empty vector would leave the batch with vectors
                of different lengths.
        """
        if not texts:
            return []

        client = self._get_client()
        embeddings: list[list[float]] = []

        for text in texts:
            response = await client.post(
                "/api/embeddings",
                json={"model": self.model, "prompt": text},
            )
            response.raise_for_status()

            embedding = response.json().get("embedding")
            if not embedding:
                # Fail loudly instead of silently recording an empty vector.
                raise ValueError(
                    f"Ollama returned no embedding for model {self.model!r}"
                )
            embeddings.append(embedding)

            # Set dimension from first response
            if self._dimension is None:
                self._dimension = len(embedding)

        return embeddings

    async def close(self) -> None:
        """Close the HTTP client, if one was created, and forget it."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None
@@ -0,0 +1,323 @@
1
+ """Document index for RAG-based semantic search.
2
+
3
+ This module provides a document index that stores section metadata and
4
+ embeddings, enabling semantic similarity search using cosine similarity.
5
+
6
+ The index is stored as a JSON file with the following structure:
7
+ {
8
+ "model": "all-MiniLM-L6-v2",
9
+ "dimension": 384,
10
+ "indexed_at": "2024-03-20T12:00:00Z",
11
+ "sections": [
12
+ {"file_path": "...", "section_title": "...", "content": "...", "doc_type": "..."}
13
+ ],
14
+ "embeddings": [[0.1, 0.2, ...], ...]
15
+ }
16
+
17
+ Example:
18
+ index = DocumentIndex(".devscontext/doc_index.json")
19
+ index.load()
20
+
21
+ # Search for similar sections
22
+ results = index.search(query_embedding, top_k=10, threshold=0.3)
23
+ for section, score in results:
24
+ print(f"{section.section_title}: {score:.3f}")
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import json
30
+ from dataclasses import dataclass
31
+ from datetime import UTC, datetime
32
+ from pathlib import Path
33
+ from typing import Any
34
+
35
+ from devscontext.logging import get_logger
36
+
37
+ logger = get_logger(__name__)
38
+
39
+
40
@dataclass
class IndexedSection:
    """One document section as stored in the index.

    Mirrors ParsedSection from local_docs but is kept separate, which avoids
    circular imports and lets the index be used without the full adapter.
    """

    file_path: str
    section_title: str | None
    content: str
    doc_type: str  # "architecture", "standards", "adr", "other"

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dictionary suitable for JSON serialization."""
        field_names = ("file_path", "section_title", "content", "doc_type")
        return {name: getattr(self, name) for name in field_names}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> IndexedSection:
        """Build a section from a dictionary loaded from JSON.

        ``file_path`` and ``content`` are required (a missing key raises
        KeyError); ``section_title`` defaults to None and ``doc_type`` to
        "other".
        """
        path = data["file_path"]
        title = data.get("section_title")
        body = data["content"]
        kind = data.get("doc_type", "other")
        return cls(file_path=path, section_title=title, content=body, doc_type=kind)
71
+
72
+
73
+ class DocumentIndex:
74
+ """Index for storing and searching document embeddings.
75
+
76
+ Uses NumPy for efficient cosine similarity computation and stores
77
+ the index as a JSON file for simplicity and portability.
78
+ """
79
+
80
+ def __init__(self, index_path: str = ".devscontext/doc_index.json") -> None:
81
+ """Initialize the document index.
82
+
83
+ Args:
84
+ index_path: Path to the JSON index file.
85
+ """
86
+ self._index_path = Path(index_path)
87
+ self._model: str | None = None
88
+ self._dimension: int | None = None
89
+ self._indexed_at: datetime | None = None
90
+ self._sections: list[IndexedSection] = []
91
+ self._embeddings: list[list[float]] = []
92
+ self._embeddings_array: Any = None # Cached numpy array
93
+
94
+ @property
95
+ def is_loaded(self) -> bool:
96
+ """Check if index has been loaded or built."""
97
+ return len(self._sections) > 0
98
+
99
+ @property
100
+ def model(self) -> str | None:
101
+ """Return the model used for embeddings."""
102
+ return self._model
103
+
104
+ @property
105
+ def dimension(self) -> int | None:
106
+ """Return the embedding dimension."""
107
+ return self._dimension
108
+
109
+ @property
110
+ def section_count(self) -> int:
111
+ """Return number of indexed sections."""
112
+ return len(self._sections)
113
+
114
+ def exists(self) -> bool:
115
+ """Check if the index file exists."""
116
+ return self._index_path.exists()
117
+
118
+ def load(self) -> bool:
119
+ """Load the index from disk.
120
+
121
+ Returns:
122
+ True if successfully loaded, False if file doesn't exist.
123
+
124
+ Raises:
125
+ ValueError: If the index file is corrupted or invalid.
126
+ """
127
+ if not self._index_path.exists():
128
+ logger.debug("Index file not found", extra={"path": str(self._index_path)})
129
+ return False
130
+
131
+ try:
132
+ with open(self._index_path) as f:
133
+ data = json.load(f)
134
+
135
+ self._model = data.get("model")
136
+ self._dimension = data.get("dimension")
137
+
138
+ indexed_at_str = data.get("indexed_at")
139
+ if indexed_at_str:
140
+ self._indexed_at = datetime.fromisoformat(indexed_at_str)
141
+
142
+ self._sections = [IndexedSection.from_dict(s) for s in data.get("sections", [])]
143
+ self._embeddings = data.get("embeddings", [])
144
+ self._embeddings_array = None # Clear cached array
145
+
146
+ logger.info(
147
+ "Loaded document index",
148
+ extra={
149
+ "path": str(self._index_path),
150
+ "sections": len(self._sections),
151
+ "model": self._model,
152
+ },
153
+ )
154
+ return True
155
+
156
+ except json.JSONDecodeError as e:
157
+ raise ValueError(f"Invalid index file format: {e}") from e
158
+ except KeyError as e:
159
+ raise ValueError(f"Missing required field in index: {e}") from e
160
+
161
+ def save(self) -> None:
162
+ """Save the index to disk.
163
+
164
+ Creates parent directories if needed.
165
+ """
166
+ self._index_path.parent.mkdir(parents=True, exist_ok=True)
167
+
168
+ data = {
169
+ "model": self._model,
170
+ "dimension": self._dimension,
171
+ "indexed_at": (self._indexed_at.isoformat() if self._indexed_at else None),
172
+ "sections": [s.to_dict() for s in self._sections],
173
+ "embeddings": self._embeddings,
174
+ }
175
+
176
+ with open(self._index_path, "w") as f:
177
+ json.dump(data, f, indent=2)
178
+
179
+ logger.info(
180
+ "Saved document index",
181
+ extra={
182
+ "path": str(self._index_path),
183
+ "sections": len(self._sections),
184
+ },
185
+ )
186
+
187
+ def add_sections(
188
+ self,
189
+ sections: list[IndexedSection],
190
+ embeddings: list[list[float]],
191
+ model: str,
192
+ ) -> None:
193
+ """Add sections with their embeddings to the index.
194
+
195
+ This replaces any existing content in the index.
196
+
197
+ Args:
198
+ sections: List of document sections.
199
+ embeddings: Corresponding embedding vectors.
200
+ model: Name of the model used for embeddings.
201
+
202
+ Raises:
203
+ ValueError: If sections and embeddings have different lengths.
204
+ """
205
+ if len(sections) != len(embeddings):
206
+ raise ValueError(
207
+ f"Sections ({len(sections)}) and embeddings ({len(embeddings)}) "
208
+ "must have the same length"
209
+ )
210
+
211
+ self._sections = sections
212
+ self._embeddings = embeddings
213
+ self._model = model
214
+ self._dimension = len(embeddings[0]) if embeddings else None
215
+ self._indexed_at = datetime.now(UTC)
216
+ self._embeddings_array = None # Clear cached array
217
+
218
+ logger.info(
219
+ "Added sections to index",
220
+ extra={
221
+ "sections": len(sections),
222
+ "model": model,
223
+ "dimension": self._dimension,
224
+ },
225
+ )
226
+
227
+ def search(
228
+ self,
229
+ query_embedding: list[float],
230
+ top_k: int = 10,
231
+ threshold: float = 0.0,
232
+ ) -> list[tuple[IndexedSection, float]]:
233
+ """Search for similar sections using cosine similarity.
234
+
235
+ Args:
236
+ query_embedding: Query vector to search with.
237
+ top_k: Maximum number of results to return.
238
+ threshold: Minimum similarity score (0-1) to include.
239
+
240
+ Returns:
241
+ List of (section, similarity_score) tuples, sorted by score descending.
242
+ """
243
+ if not self._sections or not self._embeddings:
244
+ return []
245
+
246
+ try:
247
+ import numpy as np
248
+ except ImportError as e:
249
+ raise ImportError(
250
+ "numpy not installed. Install with: pip install devscontext[rag]"
251
+ ) from e
252
+
253
+ # Cache the embeddings array for repeated queries
254
+ if self._embeddings_array is None:
255
+ self._embeddings_array = np.array(self._embeddings)
256
+
257
+ query_vec = np.array(query_embedding)
258
+
259
+ # Compute cosine similarity
260
+ # cosine_sim = (A . B) / (||A|| * ||B||)
261
+ query_norm = np.linalg.norm(query_vec)
262
+ if query_norm == 0:
263
+ return []
264
+
265
+ doc_norms = np.linalg.norm(self._embeddings_array, axis=1)
266
+ # Avoid division by zero
267
+ doc_norms = np.where(doc_norms == 0, 1, doc_norms)
268
+
269
+ similarities = np.dot(self._embeddings_array, query_vec) / (doc_norms * query_norm)
270
+
271
+ # Filter by threshold and get top-k
272
+ results = []
273
+ for idx, score in enumerate(similarities):
274
+ if score >= threshold:
275
+ results.append((idx, float(score)))
276
+
277
+ # Sort by score descending
278
+ results.sort(key=lambda x: x[1], reverse=True)
279
+
280
+ # Return top-k with section objects
281
+ return [(self._sections[idx], score) for idx, score in results[:top_k]]
282
+
283
+ def clear(self) -> None:
284
+ """Clear all data from the index."""
285
+ self._sections = []
286
+ self._embeddings = []
287
+ self._embeddings_array = None
288
+ self._indexed_at = None
289
+ logger.info("Cleared document index")
290
+
291
+ def get_stats(self) -> dict[str, Any]:
292
+ """Get statistics about the index.
293
+
294
+ Returns:
295
+ Dictionary with index statistics.
296
+ """
297
+ doc_types: dict[str, int] = {}
298
+ for section in self._sections:
299
+ doc_types[section.doc_type] = doc_types.get(section.doc_type, 0) + 1
300
+
301
+ return {
302
+ "exists": self.exists(),
303
+ "loaded": self.is_loaded,
304
+ "model": self._model,
305
+ "dimension": self._dimension,
306
+ "section_count": len(self._sections),
307
+ "indexed_at": (self._indexed_at.isoformat() if self._indexed_at else None),
308
+ "doc_types": doc_types,
309
+ "index_path": str(self._index_path),
310
+ }
311
+
312
+ def delete(self) -> bool:
313
+ """Delete the index file from disk.
314
+
315
+ Returns:
316
+ True if deleted, False if file didn't exist.
317
+ """
318
+ if self._index_path.exists():
319
+ self._index_path.unlink()
320
+ self.clear()
321
+ logger.info("Deleted index file", extra={"path": str(self._index_path)})
322
+ return True
323
+ return False