loom-agent 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of loom-agent might be problematic. Click here for more details.

@@ -0,0 +1,393 @@
1
+ """
2
+ Embedding-based Retriever
3
+
4
+ Core retrieval system using embeddings and vector search.
5
+ Provides semantic search with lazy loading and caching.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import Any, Dict, List, Optional
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+
15
+ from loom.interfaces.retriever import BaseRetriever, Document
16
+ from loom.interfaces.embedding import BaseEmbedding
17
+ from loom.interfaces.vector_store import BaseVectorStore
18
+ from loom.retrieval.domain_adapter import DomainAdapter
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class IndexStrategy(str, Enum):
    """Indexing strategy: selects when documents get indexed."""

    #: Index all documents at initialization
    EAGER = "eager"

    #: Index metadata at initialization, load full documents on demand
    LAZY = "lazy"

    #: Index documents incrementally as they are accessed
    INCREMENTAL = "incremental"
33
+
34
+
35
@dataclass
class RetrievalConfig:
    """Settings that control how documents are indexed and retrieved."""

    #: Number of documents to retrieve
    top_k: int = 5

    #: Minimum similarity score (0-1) for retrieved documents
    similarity_threshold: float = 0.7

    #: Indexing strategy
    index_strategy: IndexStrategy = IndexStrategy.LAZY

    #: Enable caching of embeddings and documents
    enable_cache: bool = True

    #: Cache time-to-live in seconds
    #: NOTE(review): nothing in this module reads this value; confirm
    #: where (or whether) the TTL is enforced.
    cache_ttl: int = 3600

    #: Batch size for embedding generation
    batch_size: int = 100
56
+
57
+
58
class EmbeddingRetriever(BaseRetriever):
    """
    Embedding-based retriever

    Provides semantic search using embeddings and vector stores.
    Supports lazy loading, caching, and domain adaptation.

    Key features:
    - Semantic search using embeddings
    - Multiple indexing strategies (eager/lazy/incremental)
    - Caching of embeddings and documents
    - Domain adaptation via DomainAdapter

    Example:
        # Create retriever
        retriever = EmbeddingRetriever(
            embedding=OpenAIEmbedding(model="text-embedding-3-small"),
            vector_store=FAISSVectorStore(dimension=1536),
            domain_adapter=my_adapter,
            config=RetrievalConfig(
                index_strategy=IndexStrategy.LAZY,
                top_k=5
            )
        )

        # Initialize
        await retriever.initialize()

        # Retrieve
        results = await retriever.retrieve("user query", top_k=5)
    """

    def __init__(
        self,
        embedding: BaseEmbedding,
        vector_store: BaseVectorStore,
        domain_adapter: Optional[DomainAdapter] = None,
        config: Optional[RetrievalConfig] = None
    ):
        """
        Args:
            embedding: Embedding model
            vector_store: Vector storage backend
            domain_adapter: Domain adapter for data extraction
            config: Retrieval configuration (a default RetrievalConfig is
                created when omitted)
        """
        self.embedding = embedding
        self.vector_store = vector_store
        self.domain_adapter = domain_adapter
        self.config = config or RetrievalConfig()

        # Caches: query text -> embedding, doc_id -> document
        self._embedding_cache: Dict[str, List[float]] = {}
        self._document_cache: Dict[str, Document] = {}

        # State
        self._initialized = False
        self._indexed_doc_count = 0

    async def initialize(self) -> None:
        """
        Initialize the retriever

        Runs the indexing step selected by ``config.index_strategy`` and
        marks the retriever as initialized. Calling it again afterwards is
        a no-op.

        Raises:
            Exception: Any error raised while indexing is logged and
                re-raised; the retriever then stays uninitialized.
        """
        if self._initialized:
            logger.debug("Retriever already initialized")
            return

        logger.info(f"Initializing retriever with strategy: {self.config.index_strategy}")

        try:
            if self.config.index_strategy == IndexStrategy.EAGER:
                await self._index_all_documents()
            elif self.config.index_strategy == IndexStrategy.LAZY:
                await self._index_metadata_only()
            # INCREMENTAL strategy indexes on-demand, no initialization needed

            self._initialized = True
            logger.info(f"Retriever initialized successfully. Indexed {self._indexed_doc_count} documents")

        except Exception as e:
            logger.error(f"Failed to initialize retriever: {e}", exc_info=True)
            raise

    async def retrieve(
        self,
        query: str,
        top_k: Optional[int] = None,
        filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        Retrieve relevant documents

        Args:
            query: Query text
            top_k: Number of documents to retrieve (overrides config)
            filters: Metadata filters passed through to the vector store

        Returns:
            At most ``top_k`` documents whose score meets
            ``config.similarity_threshold``, in the order returned by the
            vector store.

        Raises:
            Exception: Any error from embedding, search or initialization
                is logged and re-raised.

        Example:
            results = await retriever.retrieve(
                query="Find user-related tables",
                top_k=5,
                filters={"type": "table"}
            )
        """
        if not self._initialized:
            await self.initialize()

        k = top_k or self.config.top_k

        try:
            # 1. Generate query embedding
            query_embedding = await self._get_query_embedding(query)

            # 2. Vector search
            candidates = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=k * 2,  # Get more candidates for filtering
                filters=filters
            )

            logger.debug(f"Vector search returned {len(candidates)} candidates")

            # 3. Filter by similarity threshold
            filtered_candidates = [
                doc for doc in candidates
                if doc.score is not None and doc.score >= self.config.similarity_threshold
            ]

            logger.debug(f"After filtering: {len(filtered_candidates)} documents")

            # 4. Keep only the top-k BEFORE lazy loading. Loading is 1:1 and
            #    order-preserving, so the result is unchanged, but documents
            #    that would be discarded are never fetched.
            top_candidates = filtered_candidates[:k]

            # 5. Lazy load full documents if needed
            if self.config.index_strategy == IndexStrategy.LAZY:
                top_candidates = await self._lazy_load_documents(top_candidates)

            return top_candidates

        except Exception as e:
            logger.error(f"Retrieval failed: {e}", exc_info=True)
            raise

    async def add_documents(
        self,
        documents: List[Document]
    ) -> None:
        """
        Add documents to the retrieval system

        When a domain adapter is configured, documents it rejects via
        ``should_index`` are dropped before indexing.

        Args:
            documents: Documents to add

        Example:
            await retriever.add_documents([
                Document(doc_id="1", content="Document 1"),
                Document(doc_id="2", content="Document 2"),
            ])
        """
        if not self._initialized:
            await self.initialize()

        # Filter documents
        if self.domain_adapter:
            documents = [doc for doc in documents if self.domain_adapter.should_index(doc)]

        # Index documents
        await self._index_documents_batch(documents)

    async def _get_query_embedding(self, query: str) -> List[float]:
        """
        Get query embedding with caching

        Args:
            query: Query text

        Returns:
            Embedding vector
        """
        # Check cache
        if self.config.enable_cache and query in self._embedding_cache:
            logger.debug(f"Query embedding cache hit: {query[:50]}...")
            return self._embedding_cache[query]

        # Generate embedding
        logger.debug(f"Generating embedding for query: {query[:50]}...")
        embedding = await self.embedding.embed_query(query)

        # Cache
        if self.config.enable_cache:
            self._embedding_cache[query] = embedding

        return embedding

    async def _lazy_load_documents(
        self,
        document_refs: List[Document]
    ) -> List[Document]:
        """
        Lazy load full document details

        Each reference is resolved from the document cache or, failing
        that, through the domain adapter. The reference's score is carried
        over to the resolved document. A reference that cannot be loaded is
        returned unchanged.

        Args:
            document_refs: Document references (may contain only metadata)

        Returns:
            One document per reference, in the same order. The input list
            itself is returned when no domain adapter is configured.
        """
        if not self.domain_adapter:
            return document_refs

        import copy

        loaded_docs = []

        for doc_ref in document_refs:
            # Check cache
            cached_doc = self._document_cache.get(doc_ref.doc_id)
            if cached_doc is not None:
                # Hand out a shallow copy: writing the score onto the shared
                # cached object would retroactively change the score seen by
                # callers still holding results from an earlier query.
                full_doc = copy.copy(cached_doc)
                full_doc.score = doc_ref.score
                loaded_docs.append(full_doc)
                continue

            try:
                # Load from adapter
                full_doc = await self.domain_adapter.load_document_details(doc_ref.doc_id)
                full_doc.score = doc_ref.score

                # Cache
                if self.config.enable_cache:
                    self._document_cache[doc_ref.doc_id] = full_doc

                loaded_docs.append(full_doc)

            except Exception as e:
                logger.warning(f"Failed to load document {doc_ref.doc_id}: {e}")
                # Fallback to reference
                loaded_docs.append(doc_ref)

        return loaded_docs

    async def _index_all_documents(self) -> None:
        """Index all documents (EAGER strategy)"""
        if not self.domain_adapter:
            logger.warning("No domain adapter provided, skipping indexing")
            return

        logger.info("Indexing all documents (EAGER strategy)")

        # Extract all documents
        documents = await self.domain_adapter.extract_documents(
            metadata_only=False
        )

        # Filter documents
        documents = [doc for doc in documents if self.domain_adapter.should_index(doc)]

        logger.info(f"Extracted {len(documents)} documents to index")

        # Index in batches
        await self._index_documents_batch(documents)

    async def _index_metadata_only(self) -> None:
        """Index metadata only (LAZY strategy)"""
        if not self.domain_adapter:
            logger.warning("No domain adapter provided, skipping indexing")
            return

        logger.info("Indexing metadata only (LAZY strategy)")

        # Extract lightweight documents
        documents = await self.domain_adapter.extract_documents(
            metadata_only=True
        )

        # Filter documents
        documents = [doc for doc in documents if self.domain_adapter.should_index(doc)]

        logger.info(f"Extracted {len(documents)} lightweight documents to index")

        # Index in batches. The lightweight documents must NOT enter the
        # document cache: _lazy_load_documents consults that cache first and
        # would return the metadata-only version instead of loading the full
        # document through the adapter.
        await self._index_documents_batch(documents, cache_documents=False)

    async def _index_documents_batch(
        self,
        documents: List[Document],
        cache_documents: bool = True
    ) -> None:
        """
        Index documents in batches

        Each batch is formatted (through the domain adapter when present,
        otherwise ``doc.content`` is used), embedded, and written to the
        vector store.

        Args:
            documents: Documents to index
            cache_documents: Also store the documents in the document cache
                (only takes effect when ``config.enable_cache`` is set).
                Pass False for metadata-only documents.

        Raises:
            ValueError: If ``config.batch_size`` is smaller than 1.
        """
        batch_size = self.config.batch_size
        if batch_size < 1:
            # A negative step would make range() yield nothing and silently
            # skip indexing; zero fails with an opaque range() error.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        adapter = self.domain_adapter
        should_cache = cache_documents and self.config.enable_cache

        for i in range(0, len(documents), batch_size):
            batch = documents[i:i + batch_size]

            # Format documents for embedding
            texts = [
                adapter.format_for_embedding(doc) if adapter else doc.content
                for doc in batch
            ]

            # Generate embeddings
            logger.debug(f"Generating embeddings for batch {i//batch_size + 1}")
            embeddings = await self.embedding.embed_documents(texts)

            # Store in vector store
            await self.vector_store.add_documents(batch, embeddings)

            # Cache documents
            if should_cache:
                for doc in batch:
                    self._document_cache[doc.doc_id] = doc

            self._indexed_doc_count += len(batch)

        logger.info(f"Indexed {self._indexed_doc_count} documents")

    def clear_cache(self) -> None:
        """Clear all caches"""
        self._embedding_cache.clear()
        self._document_cache.clear()
        logger.info("Caches cleared")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get retriever statistics

        Returns:
            Statistics dictionary with the initialization flag, indexed
            document count, cache sizes and the main config values.
        """
        strategy = self.config.index_strategy
        return {
            "initialized": self._initialized,
            "indexed_documents": self._indexed_doc_count,
            "embedding_cache_size": len(self._embedding_cache),
            "document_cache_size": len(self._document_cache),
            # Falls back to the raw value when the strategy was supplied as a
            # plain string rather than an IndexStrategy member.
            "index_strategy": getattr(strategy, "value", strategy),
            "top_k": self.config.top_k,
            "similarity_threshold": self.config.similarity_threshold
        }