loom-agent 0.0.4__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of loom-agent might be problematic. Click here for more details.
- loom/api/__init__.py +19 -0
- loom/api/v0_0_3.py +1 -0
- loom/builtin/retriever/faiss_store.py +403 -0
- loom/core/agent_executor.py +212 -26
- loom/core/events.py +3 -0
- loom/core/recursion_control.py +298 -0
- loom/core/turn_state.py +58 -6
- loom/retrieval/__init__.py +61 -0
- loom/retrieval/domain_adapter.py +195 -0
- loom/retrieval/embedding_retriever.py +393 -0
- loom_agent-0.0.5.dist-info/METADATA +561 -0
- {loom_agent-0.0.4.dist-info → loom_agent-0.0.5.dist-info}/RECORD +14 -8
- loom_agent-0.0.4.dist-info/METADATA +0 -292
- {loom_agent-0.0.4.dist-info → loom_agent-0.0.5.dist-info}/WHEEL +0 -0
- {loom_agent-0.0.4.dist-info → loom_agent-0.0.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding-based Retriever
|
|
3
|
+
|
|
4
|
+
Core retrieval system using embeddings and vector search.
|
|
5
|
+
Provides semantic search with lazy loading and caching.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Any, Dict, List, Optional
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from enum import Enum
|
|
14
|
+
|
|
15
|
+
from loom.interfaces.retriever import BaseRetriever, Document
|
|
16
|
+
from loom.interfaces.embedding import BaseEmbedding
|
|
17
|
+
from loom.interfaces.vector_store import BaseVectorStore
|
|
18
|
+
from loom.retrieval.domain_adapter import DomainAdapter
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class IndexStrategy(str, Enum):
    """Indexing strategy selected through ``RetrievalConfig.index_strategy``."""

    # Index all documents at initialization.
    EAGER = "eager"

    # Index metadata at initialization, load full documents on demand.
    LAZY = "lazy"

    # Index documents incrementally as they are accessed.
    INCREMENTAL = "incremental"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class RetrievalConfig:
    """Tunable settings for a retriever.

    Every field has a default, so ``RetrievalConfig()`` yields a usable
    configuration. Field order is part of the positional constructor
    signature and must not be rearranged.
    """

    # Number of documents to retrieve.
    top_k: int = 5

    # Minimum similarity score (0-1) for retrieved documents.
    similarity_threshold: float = 0.7

    # Indexing strategy.
    index_strategy: IndexStrategy = IndexStrategy.LAZY

    # Enable caching of embeddings and documents.
    enable_cache: bool = True

    # Cache time-to-live in seconds.
    cache_ttl: int = 3600

    # Batch size for embedding generation.
    batch_size: int = 100
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class EmbeddingRetriever(BaseRetriever):
    """
    Embedding-based retriever

    Provides semantic search using embeddings and vector stores.
    Supports lazy loading, caching, and domain adaptation.

    Key features:
    - Semantic search using embeddings
    - Multiple indexing strategies (eager/lazy/incremental)
    - Caching of embeddings and documents
    - Domain adaptation via DomainAdapter

    Note:
        Both caches are plain in-memory dicts. Nothing in this class reads
        ``config.cache_ttl``, so entries stay until ``clear_cache()`` is
        called.

    Example:
        # Create retriever
        retriever = EmbeddingRetriever(
            embedding=OpenAIEmbedding(model="text-embedding-3-small"),
            vector_store=FAISSVectorStore(dimension=1536),
            domain_adapter=my_adapter,
            config=RetrievalConfig(
                index_strategy=IndexStrategy.LAZY,
                top_k=5
            )
        )

        # Initialize
        await retriever.initialize()

        # Retrieve
        results = await retriever.retrieve("user query", top_k=5)
    """

    def __init__(
        self,
        embedding: BaseEmbedding,
        vector_store: BaseVectorStore,
        domain_adapter: Optional[DomainAdapter] = None,
        config: Optional[RetrievalConfig] = None
    ):
        """
        Args:
            embedding: Embedding model
            vector_store: Vector storage backend
            domain_adapter: Domain adapter for data extraction
            config: Retrieval configuration (defaults to ``RetrievalConfig()``)
        """
        self.embedding = embedding
        self.vector_store = vector_store
        self.domain_adapter = domain_adapter
        self.config = config or RetrievalConfig()

        # Caches: query text -> embedding vector, doc_id -> document
        self._embedding_cache: Dict[str, List[float]] = {}
        self._document_cache: Dict[str, Document] = {}

        # State
        self._initialized = False
        self._indexed_doc_count = 0

    async def initialize(self) -> None:
        """
        Initialize the retriever

        Runs the indexing step that matches ``config.index_strategy`` and
        marks the retriever as initialized. Calling it again afterwards is a
        no-op.

        Raises:
            Exception: Any error raised while indexing is logged and
                re-raised; the retriever stays uninitialized in that case.
        """
        if self._initialized:
            logger.debug("Retriever already initialized")
            return

        logger.info(f"Initializing retriever with strategy: {self.config.index_strategy}")

        try:
            if self.config.index_strategy == IndexStrategy.EAGER:
                await self._index_all_documents()
            elif self.config.index_strategy == IndexStrategy.LAZY:
                await self._index_metadata_only()
            # INCREMENTAL strategy indexes on-demand, no initialization needed

            self._initialized = True
            logger.info(f"Retriever initialized successfully. Indexed {self._indexed_doc_count} documents")

        except Exception as e:
            logger.error(f"Failed to initialize retriever: {e}", exc_info=True)
            raise

    async def retrieve(
        self,
        query: str,
        top_k: Optional[int] = None,
        filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        Retrieve relevant documents

        Args:
            query: Query text
            top_k: Number of documents to retrieve (overrides config; a
                falsy value such as ``None`` or ``0`` falls back to
                ``config.top_k``)
            filters: Metadata filters, passed through to the vector store

        Returns:
            At most ``top_k`` documents whose score meets
            ``config.similarity_threshold``, in the order the vector store
            returned them (expected to be by descending similarity).

        Raises:
            Exception: Any error from embedding, search or loading is logged
                and re-raised.

        Example:
            results = await retriever.retrieve(
                query="Find user-related tables",
                top_k=5,
                filters={"type": "table"}
            )
        """
        if not self._initialized:
            await self.initialize()

        k = top_k or self.config.top_k

        try:
            # 1. Generate query embedding
            query_embedding = await self._get_query_embedding(query)

            # 2. Vector search
            candidates = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=k * 2,  # Get more candidates for filtering
                filters=filters
            )

            logger.debug(f"Vector search returned {len(candidates)} candidates")

            # 3. Filter by similarity threshold (unscored documents are dropped)
            threshold = self.config.similarity_threshold
            filtered_candidates = [
                doc for doc in candidates
                if doc.score is not None and doc.score >= threshold
            ]

            logger.debug(f"After filtering: {len(filtered_candidates)} documents")

            # 4. Keep only the top-k BEFORE loading details, so the adapter is
            #    not asked for documents that would be discarded anyway.
            top_candidates = filtered_candidates[:k]

            # 5. Lazy load full documents if needed
            if self.config.index_strategy == IndexStrategy.LAZY:
                top_candidates = await self._lazy_load_documents(top_candidates)

            return top_candidates

        except Exception as e:
            logger.error(f"Retrieval failed: {e}", exc_info=True)
            raise

    async def add_documents(
        self,
        documents: List[Document]
    ) -> None:
        """
        Add documents to the retrieval system

        Documents rejected by ``domain_adapter.should_index`` (when an
        adapter is configured) are skipped; the rest are embedded and stored.

        Args:
            documents: Documents to add

        Example:
            await retriever.add_documents([
                Document(doc_id="1", content="Document 1"),
                Document(doc_id="2", content="Document 2"),
            ])
        """
        if not self._initialized:
            await self.initialize()

        # Filter documents
        if self.domain_adapter:
            documents = [doc for doc in documents if self.domain_adapter.should_index(doc)]

        # Index documents
        await self._index_documents_batch(documents)

    async def _get_query_embedding(self, query: str) -> List[float]:
        """
        Get query embedding with caching

        Args:
            query: Query text (used verbatim as the cache key)

        Returns:
            Embedding vector
        """
        # Check cache
        if self.config.enable_cache and query in self._embedding_cache:
            logger.debug(f"Query embedding cache hit: {query[:50]}...")
            return self._embedding_cache[query]

        # Generate embedding
        logger.debug(f"Generating embedding for query: {query[:50]}...")
        embedding = await self.embedding.embed_query(query)

        # Cache
        if self.config.enable_cache:
            self._embedding_cache[query] = embedding

        return embedding

    async def _lazy_load_documents(
        self,
        document_refs: List[Document]
    ) -> List[Document]:
        """
        Lazy load full document details

        Each reference is resolved from the document cache when present,
        otherwise through ``domain_adapter.load_document_details``. The
        returned documents are shallow copies carrying the score of the
        corresponding reference, so cached objects (and results handed out
        by earlier queries) are never mutated.

        Args:
            document_refs: Document references (may contain only metadata)

        Returns:
            Full documents with details, in the same order as
            ``document_refs``. A reference whose details fail to load is
            returned as-is. Without a domain adapter the input list is
            returned unchanged.
        """
        # Local import keeps this method self-contained within the class.
        import copy

        if not self.domain_adapter:
            return document_refs

        loaded_docs = []

        for doc_ref in document_refs:
            # Check cache
            if doc_ref.doc_id in self._document_cache:
                full_doc = copy.copy(self._document_cache[doc_ref.doc_id])
                full_doc.score = doc_ref.score
                loaded_docs.append(full_doc)
                continue

            try:
                # Load from adapter
                source_doc = await self.domain_adapter.load_document_details(doc_ref.doc_id)

                # Copy before attaching the per-query score so the cached
                # object stays query-independent.
                full_doc = copy.copy(source_doc)
                full_doc.score = doc_ref.score

                # Cache
                if self.config.enable_cache:
                    self._document_cache[doc_ref.doc_id] = source_doc

                loaded_docs.append(full_doc)

            except Exception as e:
                logger.warning(f"Failed to load document {doc_ref.doc_id}: {e}")
                # Fallback to reference
                loaded_docs.append(doc_ref)

        return loaded_docs

    async def _index_all_documents(self) -> None:
        """Index all documents (EAGER strategy)"""
        if not self.domain_adapter:
            logger.warning("No domain adapter provided, skipping indexing")
            return

        logger.info("Indexing all documents (EAGER strategy)")

        # Extract all documents
        documents = await self.domain_adapter.extract_documents(
            metadata_only=False
        )

        # Filter documents
        documents = [doc for doc in documents if self.domain_adapter.should_index(doc)]

        logger.info(f"Extracted {len(documents)} documents to index")

        # Index in batches
        await self._index_documents_batch(documents)

    async def _index_metadata_only(self) -> None:
        """Index metadata only (LAZY strategy)"""
        if not self.domain_adapter:
            logger.warning("No domain adapter provided, skipping indexing")
            return

        logger.info("Indexing metadata only (LAZY strategy)")

        # Extract lightweight documents
        documents = await self.domain_adapter.extract_documents(
            metadata_only=True
        )

        # Filter documents
        documents = [doc for doc in documents if self.domain_adapter.should_index(doc)]

        logger.info(f"Extracted {len(documents)} lightweight documents to index")

        # Index in batches. The lightweight stubs must NOT enter the document
        # cache: _lazy_load_documents consults that cache first, so a cached
        # stub would prevent the full document from ever being loaded.
        await self._index_documents_batch(documents, cache_documents=False)

    async def _index_documents_batch(
        self,
        documents: List[Document],
        cache_documents: bool = True
    ) -> None:
        """
        Index documents in batches

        Each batch is formatted (via the domain adapter when present,
        otherwise ``doc.content``), embedded, and written to the vector
        store.

        Args:
            documents: Documents to index
            cache_documents: Whether indexed documents may also be placed in
                the document cache (still subject to
                ``config.enable_cache``). Pass ``False`` for metadata-only
                stubs.

        Raises:
            ValueError: If ``config.batch_size`` is less than 1.
        """
        batch_size = self.config.batch_size
        if batch_size < 1:
            # A zero step makes range() fail obscurely and a negative step
            # would silently index nothing, so reject both up front.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        should_cache = self.config.enable_cache and cache_documents

        for i in range(0, len(documents), batch_size):
            batch = documents[i:i + batch_size]

            # Format documents for embedding
            texts = [
                self.domain_adapter.format_for_embedding(doc)
                if self.domain_adapter else doc.content
                for doc in batch
            ]

            # Generate embeddings
            logger.debug(f"Generating embeddings for batch {i//batch_size + 1}")
            embeddings = await self.embedding.embed_documents(texts)

            # Store in vector store
            await self.vector_store.add_documents(batch, embeddings)

            # Cache documents
            if should_cache:
                for doc in batch:
                    self._document_cache[doc.doc_id] = doc

            self._indexed_doc_count += len(batch)

        logger.info(f"Indexed {self._indexed_doc_count} documents")

    def clear_cache(self) -> None:
        """Clear all caches"""
        self._embedding_cache.clear()
        self._document_cache.clear()
        logger.info("Caches cleared")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get retriever statistics

        Returns:
            Statistics dictionary with the initialization flag, the number
            of indexed documents, both cache sizes, and the configured
            strategy, ``top_k`` and similarity threshold.
        """
        return {
            "initialized": self._initialized,
            "indexed_documents": self._indexed_doc_count,
            "embedding_cache_size": len(self._embedding_cache),
            "document_cache_size": len(self._document_cache),
            "index_strategy": self.config.index_strategy.value,
            "top_k": self.config.top_k,
            "similarity_threshold": self.config.similarity_threshold
        }
|