solana-agent 28.3.3__py3-none-any.whl → 29.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solana_agent/adapters/mongodb_adapter.py +10 -0
- solana_agent/client/solana_agent.py +0 -42
- solana_agent/factories/agent_factory.py +24 -0
- solana_agent/interfaces/client/client.py +0 -21
- solana_agent/interfaces/services/knowledge_base.py +0 -25
- solana_agent/services/knowledge_base.py +247 -351
- {solana_agent-28.3.3.dist-info → solana_agent-29.1.0.dist-info}/METADATA +38 -5
- {solana_agent-28.3.3.dist-info → solana_agent-29.1.0.dist-info}/RECORD +11 -11
- {solana_agent-28.3.3.dist-info → solana_agent-29.1.0.dist-info}/LICENSE +0 -0
- {solana_agent-28.3.3.dist-info → solana_agent-29.1.0.dist-info}/WHEEL +0 -0
- {solana_agent-28.3.3.dist-info → solana_agent-29.1.0.dist-info}/entry_points.txt +0 -0
--- a/solana_agent/adapters/mongodb_adapter.py
+++ b/solana_agent/adapters/mongodb_adapter.py
@@ -32,6 +32,16 @@ class MongoDBAdapter(DataStorageProvider):
         self.db[collection].insert_one(document)
         return document["_id"]
 
+    def insert_many(self, collection: str, documents: List[Dict]) -> List[str]:
+        for document in documents:
+            if "_id" not in document:
+                document["_id"] = str(uuid.uuid4())
+        result = self.db[collection].insert_many(documents)
+        return [str(doc_id) for doc_id in result.inserted_ids]
+
+    def delete_many(self, collection: str, query: Dict):
+        return self.db[collection].delete_many(query)
+
     def find_one(self, collection: str, query: Dict) -> Optional[Dict]:
         return self.db[collection].find_one(query)
 
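The two new adapter methods mirror PyMongo's bulk operations: `insert_many` backfills a stringified UUID `_id` for any document that lacks one and returns the inserted IDs as strings, while `delete_many` passes the query straight through and returns PyMongo's `DeleteResult`. A minimal usage sketch follows; the constructor arguments, connection details, and collection name are hypothetical, since none of them appear in this diff:

```python
# Hypothetical usage of the new bulk methods; MongoDBAdapter's constructor
# arguments are an assumption and do not appear in this diff.
from solana_agent.adapters.mongodb_adapter import MongoDBAdapter

adapter = MongoDBAdapter(
    connection_string="mongodb://localhost:27017", database_name="solana_agent"
)

# Each document without an "_id" gets a stringified uuid4 before the insert.
ids = adapter.insert_many(
    "documents",
    [{"content": "chunk one"}, {"content": "chunk two"}],
)
print(ids)  # two UUID strings

# delete_many returns pymongo's DeleteResult, so deleted_count is available.
result = adapter.delete_many("documents", {"content": "chunk one"})
print(result.deleted_count)  # 1
```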
--- a/solana_agent/client/solana_agent.py
+++ b/solana_agent/client/solana_agent.py
@@ -227,48 +227,6 @@ class SolanaAgent(SolanaAgentInterface):
         kb = self._ensure_kb()
         return await kb.delete_document(document_id, namespace)
 
-    async def kb_update_document(
-        self,
-        document_id: str,
-        text: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        namespace: Optional[str] = None,
-    ) -> bool:
-        """
-        Update an existing document in the knowledge base.
-
-        Args:
-            document_id: ID of document to update.
-            text: Optional new text content.
-            metadata: Optional metadata to update.
-            namespace: Optional Pinecone namespace.
-
-        Returns:
-            True if successful.
-        """
-        kb = self._ensure_kb()
-        return await kb.update_document(document_id, text, metadata, namespace)
-
-    async def kb_add_documents_batch(
-        self,
-        documents: List[Dict[str, Any]],
-        namespace: Optional[str] = None,
-        batch_size: int = 50,
-    ) -> List[str]:
-        """
-        Add multiple documents to the knowledge base in batches.
-
-        Args:
-            documents: List of documents ({'text': ..., 'metadata': ...}).
-            namespace: Optional Pinecone namespace.
-            batch_size: Number of documents per batch.
-
-        Returns:
-            List of added document IDs.
-        """
-        kb = self._ensure_kb()
-        return await kb.add_documents_batch(documents, namespace, batch_size)
-
     async def kb_add_pdf_document(
         self,
         pdf_data: Union[bytes, str],
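These removals take `kb_update_document` and `kb_add_documents_batch` out of the public client in 29.1.0; only `kb_delete_document` and `kb_add_pdf_document` remain visible in this hunk. Callers of the removed batch API can fall back to looping over single-document adds, roughly as sketched below; this assumes a `kb_add_document` method with this signature remains on the client, which this diff does not show:

```python
# Hedged migration sketch for code that used the removed kb_add_documents_batch.
# kb_add_document and its signature are assumptions; check the 29.1.0 client.
from typing import Any, Dict, List, Optional


async def add_documents_one_by_one(
    agent, documents: List[Dict[str, Any]], namespace: Optional[str] = None
) -> List[str]:
    doc_ids = []
    for doc in documents:  # each doc: {"text": ..., "metadata": ...}
        doc_id = await agent.kb_add_document(
            text=doc["text"], metadata=doc["metadata"], namespace=namespace
        )
        doc_ids.append(doc_id)
    return doc_ids
```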
--- a/solana_agent/factories/agent_factory.py
+++ b/solana_agent/factories/agent_factory.py
@@ -168,6 +168,30 @@ class SolanaAgentFactory:
            output_guardrails=output_guardrails,
        )
 
+        if "gemini" in config and "api_key" in config["gemini"]:
+            # Create primary services
+            agent_service = AgentService(
+                llm_provider=llm_adapter,
+                business_mission=business_mission,
+                config=config,
+                api_key=config["gemini"]["api_key"],
+                base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+                model="gemini-2.5-pro-preview-03-25",
+                output_guardrails=output_guardrails,
+            )  # pragma: no cover
+
+        elif "grok" in config and "api_key" in config["grok"]:
+            # Create primary services
+            agent_service = AgentService(
+                llm_provider=llm_adapter,
+                business_mission=business_mission,
+                config=config,
+                api_key=config["grok"]["api_key"],
+                base_url="https://api.x.ai/v1",
+                model="grok-3-fast",
+                output_guardrails=output_guardrails,
+            )  # pragma: no cover
+
         # Create routing service
         routing_service = RoutingService(
             llm_provider=llm_adapter,
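Note the precedence this `if/elif` creates: when the config carries an `api_key` under both `gemini` and `grok`, the Gemini branch wins and rebuilds `agent_service` over the default OpenAI-backed one constructed just above; with neither key present, the default service is kept. Both vendors are reached through their OpenAI-compatible endpoints, so only `api_key`, `base_url`, and `model` change:

```python
# Vendor selection as implied by the factory's if/elif above.
config = {
    "openai": {"api_key": "your-openai-api-key"},  # default agent service
    "gemini": {"api_key": "your-gemini-api-key"},  # wins when present
    "grok": {"api_key": "your-grok-api-key"},      # used only if gemini is absent
}
```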
--- a/solana_agent/interfaces/client/client.py
+++ b/solana_agent/interfaces/client/client.py
@@ -91,27 +91,6 @@ class SolanaAgent(ABC):
         """Delete a document from the knowledge base."""
         pass
 
-    @abstractmethod
-    async def kb_update_document(
-        self,
-        document_id: str,
-        text: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        namespace: Optional[str] = None,
-    ) -> bool:
-        """Update an existing document in the knowledge base."""
-        pass
-
-    @abstractmethod
-    async def kb_add_documents_batch(
-        self,
-        documents: List[Dict[str, Any]],
-        namespace: Optional[str] = None,
-        batch_size: int = 50,
-    ) -> List[str]:
-        """Add multiple documents to the knowledge base in batches."""
-        pass
-
     @abstractmethod
     async def kb_add_pdf_document(
         self,
--- a/solana_agent/interfaces/services/knowledge_base.py
+++ b/solana_agent/interfaces/services/knowledge_base.py
@@ -44,31 +44,6 @@ class KnowledgeBaseService(ABC):
         """
         pass
 
-    @abstractmethod
-    async def update_document(
-        self,
-        document_id: str,
-        text: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        namespace: Optional[str] = None,
-    ) -> bool:
-        """
-        Update an existing document in the knowledge base.
-        """
-        pass
-
-    @abstractmethod
-    async def add_documents_batch(
-        self,
-        documents: List[Dict[str, Any]],
-        namespace: Optional[str] = None,
-        batch_size: int = 50,
-    ) -> List[str]:
-        """
-        Add multiple documents in batches.
-        """
-        pass
-
     @abstractmethod
     async def add_pdf_document(
         self,
--- a/solana_agent/services/knowledge_base.py
+++ b/solana_agent/services/knowledge_base.py
@@ -23,8 +23,9 @@ logger = logging.getLogger(__name__)
 
 class KnowledgeBaseService(KnowledgeBaseInterface):
     """
-    Knowledge Base service using Pinecone for vector search and MongoDB for metadata/
+    Knowledge Base service using Pinecone for vector search and MongoDB for metadata/chunk storage.
     Supports text documents and PDF semantic chunking using OpenAI embeddings via LlamaIndex.
+    PDF binary data is not stored. Chunks are stored individually in MongoDB.
     """
 
     def __init__(
@@ -48,7 +49,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
             mongodb_adapter: Configured MongoDBAdapter instance.
             openai_api_key: OpenAI API key for embedding.
             openai_model_name: OpenAI embedding model name.
-            collection_name: MongoDB collection for storing document metadata and
+            collection_name: MongoDB collection for storing document metadata and chunks.
             rerank_results: Whether PineconeAdapter should rerank results.
             rerank_top_k: Number of results to return after reranking (used by PineconeAdapter).
             splitter_buffer_size: Buffer size for SemanticSplitterNodeParser.
@@ -133,7 +134,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
         metadata: Dict[str, Any],
         document_id: Optional[str] = None,
         namespace: Optional[str] = None,
-    ) -> str:
+    ) -> str:  # pragma: no cover
         """
         Add a plain text document to the knowledge base. Embeds using OpenAI.
 
@@ -147,6 +148,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
             The document ID.
         """
         doc_id = document_id or str(uuid.uuid4())
+        now = dt.now(tz=dt.now().astimezone().tzinfo)
 
         # Store metadata and content in MongoDB
         mongo_doc = {
|
|
156
158
|
"parent_document_id": None,
|
157
159
|
**metadata,
|
158
160
|
# Use timezone aware datetime
|
159
|
-
"created_at": metadata.get(
|
160
|
-
|
161
|
-
),
|
162
|
-
"updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
|
161
|
+
"created_at": metadata.get("created_at", now),
|
162
|
+
"updated_at": now,
|
163
163
|
}
|
164
164
|
try:
|
165
165
|
self.mongo.insert_one(self.collection, mongo_doc)
|
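The refactor hoists the timezone-aware timestamp into a single `now` so that `created_at` and `updated_at` of a freshly inserted document are identical rather than microseconds apart. The hoisted expression itself just asks the local clock for its own timezone:

```python
from datetime import datetime as dt

# The expression hoisted into `now` above: current local time as a
# timezone-aware datetime (tzinfo populated, safe to serialize and compare).
now = dt.now(tz=dt.now().astimezone().tzinfo)
assert now.tzinfo is not None
```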
@@ -184,6 +184,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
         pinecone_metadata = {
             "document_id": doc_id,
             "is_chunk": False,
+            "parent_document_id": False,  # Explicitly set for clarity - Pinecone can't use None
             "source": metadata.get("source", "unknown"),
             "tags": metadata.get("tags", []),
         }
@@ -215,10 +216,11 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
         document_id: Optional[str] = None,
         namespace: Optional[str] = None,
         chunk_batch_size: int = 50,
-    ) -> str:
+    ) -> str:  # pragma: no cover
         """
         Add a PDF document, performs semantic chunking using OpenAI embeddings,
-        stores
+        stores parent metadata and individual chunks in Mongo, and chunk vectors in Pinecone.
+        Full PDF binary is NOT stored.
 
         Args:
             pdf_data: PDF content as bytes or a path to the PDF file.
@@ -232,6 +234,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
         """
         parent_doc_id = document_id or str(uuid.uuid4())
         pdf_bytes: bytes
+        now = dt.now(tz=dt.now().astimezone().tzinfo)
 
         # --- 1. Read PDF and Extract Text ---
         try:
@@ -249,42 +252,40 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
             logger.warning(
                 f"No text extracted from PDF {parent_doc_id}."
             )  # Use logger.warning
+            # Still store parent metadata even if no text
         except Exception as e:
             logger.error(
                 f"Error reading or extracting text from PDF {parent_doc_id}: {e}"
             )  # Use logger.error
             raise
 
-        # --- 2. Store
+        # --- 2. Store Parent PDF Metadata in MongoDB (NO BINARY) ---
         mongo_parent_doc = {
             "document_id": parent_doc_id,
-            "content":
-            "pdf_data": pdf_bytes,
+            "content": None,
             "is_chunk": False,
             "parent_document_id": None,
             **metadata,
-            "created_at": metadata.get(
-                "created_at", dt.now(tz=dt.now().astimezone().tzinfo)
-            ),
-            "updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
+            "created_at": metadata.get("created_at", now),
+            "updated_at": now,
         }
         try:
             self.mongo.insert_one(self.collection, mongo_parent_doc)
             logger.info(
-                f"Stored
+                f"Stored parent metadata for PDF {parent_doc_id} in MongoDB."
             )  # Use logger.info
-        except Exception as e:
-            logger.error(
-                f"Error inserting parent PDF {parent_doc_id} into MongoDB: {e}"
-            )
-            raise
+        except Exception as e:
+            logger.error(
+                f"Error inserting parent PDF metadata {parent_doc_id} into MongoDB: {e}"
+            )
+            raise
 
         # --- 3. Semantic Chunking ---
         if not extracted_text.strip():
             logger.info(  # Use logger.info
                 f"Skipping chunking for PDF {parent_doc_id} due to no extracted text."
             )
-            return parent_doc_id
+            return parent_doc_id  # Return parent ID even if no chunks
 
         try:
             llama_doc = LlamaDocument(text=extracted_text)
@@ -299,9 +300,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
             logger.error(
                 f"Error during semantic chunking for PDF {parent_doc_id}: {e}"
             )  # Use logger.error
+            # Parent metadata is already stored, decide how to proceed. Raising for now.
             raise
 
-        # --- 4. Embed Chunks and Batch Upsert to Pinecone ---
+        # --- 4. Embed Chunks and Batch Upsert to Pinecone AND Store Chunks in MongoDB ---
         if not nodes:
             return parent_doc_id  # No chunks generated
 
@@ -315,8 +317,6 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
         # Embed chunks in batches (using embed_model's internal batching)
         try:
             # Use aget_text_embedding_batch for async embedding
-            # Note: LlamaIndex OpenAIEmbedding might handle batch size internally.
-            # If large number of nodes, consider explicit batching here if needed.
             all_chunk_embeddings = await embed_model.aget_text_embedding_batch(
                 chunk_texts, show_progress=True
             )
@@ -327,34 +327,75 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
             raise  # Stop if embedding fails
 
         logger.info(
-            "Embedding complete. Preparing vectors for Pinecone."
+            "Embedding complete. Preparing vectors for Pinecone and documents for MongoDB."
         )  # Use logger.info
         pinecone_vectors = []
+        mongo_chunk_docs = []
+        chunk_now = dt.now(
+            tz=dt.now().astimezone().tzinfo
+        )  # Consistent timestamp for chunks
+
         for i, node in enumerate(nodes):
             chunk_id = f"{parent_doc_id}_chunk_{i}"
-
-
+            chunk_text = chunk_texts[i]
+
+            # Prepare Pinecone Vector Metadata
+            pinecone_chunk_metadata = {
+                "document_id": chunk_id,  # Pinecone ID is the chunk ID
                 "parent_document_id": parent_doc_id,
                 "chunk_index": i,
                 "is_chunk": True,
-                "source": metadata.get("source", "unknown"),
-                "tags": metadata.get("tags", []),
+                "source": metadata.get("source", "unknown"),  # Inherit from parent
+                "tags": metadata.get("tags", []),  # Inherit from parent
             }
             # Add chunk text itself if Pinecone adapter reranking is used
             if self.pinecone.use_reranking:
-
+                pinecone_chunk_metadata[self.pinecone.rerank_text_field] = chunk_text
 
             pinecone_vectors.append(
                 {
                     "id": chunk_id,
                     "values": all_chunk_embeddings[i],
-                    "metadata":
+                    "metadata": pinecone_chunk_metadata,
                 }
             )
 
-
+            # Prepare MongoDB Chunk Document
+            mongo_chunk_doc = {
+                "document_id": chunk_id,  # Mongo ID is the chunk ID
+                "parent_document_id": parent_doc_id,
+                "chunk_index": i,
+                "is_chunk": True,
+                "content": chunk_text,  # Store chunk text in Mongo
+                "source": metadata.get("source", "unknown"),  # Inherit from parent
+                "tags": metadata.get("tags", []),  # Inherit from parent
+                # Add other relevant parent metadata if needed, avoid duplication if possible
+                "created_at": chunk_now,  # Use consistent time for batch
+                "updated_at": chunk_now,
+            }
+            mongo_chunk_docs.append(mongo_chunk_doc)
+
+        # --- 5. Store Chunks in MongoDB ---
+        if mongo_chunk_docs:
+            try:
+                self.mongo.insert_many(self.collection, mongo_chunk_docs)
+                logger.info(
+                    f"Stored {len(mongo_chunk_docs)} chunks in MongoDB for parent {parent_doc_id}."
+                )
+            except Exception as e:
+                logger.error(
+                    f"Error inserting chunks into MongoDB for parent {parent_doc_id}: {e}"
+                )
+                # Decide how to handle: Pinecone upsert might still proceed or fail.
+                # For now, log the error and continue to Pinecone upsert attempt.
+
+        # --- 6. Upsert Chunk Vectors to Pinecone in Batches ---
+        if not pinecone_vectors:
+            logger.warning(f"No vectors generated to upsert for PDF {parent_doc_id}.")
+            return parent_doc_id
+
         logger.info(  # Use logger.info
-            f"Upserting {len(pinecone_vectors)} vectors to Pinecone in batches of {chunk_batch_size}..."
+            f"Upserting {len(pinecone_vectors)} chunk vectors to Pinecone in batches of {chunk_batch_size}..."
         )
         upsert_tasks = []
         for i in range(0, len(pinecone_vectors), chunk_batch_size):
@@ -370,12 +411,20 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
         results = await asyncio.gather(*upsert_tasks, return_exceptions=True)
 
         # Check for errors during upsert
+        upsert_errors = False
         for idx, result in enumerate(results):
             if isinstance(result, Exception):
+                upsert_errors = True
                 logger.error(
-                    f"Error upserting vector batch {idx + 1} to Pinecone: {result}"
+                    f"Error upserting vector batch {idx + 1} to Pinecone for parent {parent_doc_id}: {result}"
                 )  # Use logger.error
-                # Decide on error handling: log, raise, etc.
+                # Decide on error handling: log, raise, etc. Consider cleanup?
+
+        if upsert_errors:
+            logger.warning(
+                f"Some errors occurred during Pinecone vector upsert for {parent_doc_id}."
+            )
+            # Consider if partial success requires specific handling or cleanup
 
         logger.info(f"Finished processing PDF {parent_doc_id}.")  # Use logger.info
         return parent_doc_id
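After this change each PDF chunk lives in two stores under the same `{parent_id}_chunk_{i}` ID: a vector in Pinecone and a text-bearing document in MongoDB, while the parent record keeps only metadata (`content` is `None` and the PDF bytes are dropped). For orientation, a sketch of the record shapes the chunk-assembly loop above produces, with illustrative values:

```python
# Illustrative shapes only; field names are taken from the diff above.
parent_doc = {
    "document_id": "doc-123",
    "content": None,            # full PDF binary is no longer stored
    "is_chunk": False,
    "parent_document_id": None,
}

chunk_doc = {
    "document_id": "doc-123_chunk_0",  # same ID as the Pinecone vector
    "parent_document_id": "doc-123",
    "chunk_index": 0,
    "is_chunk": True,
    "content": "First semantic chunk of extracted text...",
    "source": "unknown",               # inherited from parent metadata
    "tags": [],
}
```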
@@ -388,9 +437,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
         namespace: Optional[str] = None,
         include_content: bool = True,
         include_metadata: bool = True,
-    ) -> List[Dict[str, Any]]:
+    ) -> List[Dict[str, Any]]:  # pragma: no cover
         """
         Query the knowledge base using semantic search with OpenAI embeddings.
+        Retrieves chunk or document content and metadata from MongoDB based on Pinecone results.
 
         Args:
             query_text: The query text.
@@ -439,29 +489,43 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
 
         # Extract IDs, scores, and metadata from Pinecone results
         # PineconeAdapter might have already reranked and truncated to final top_k
-        result_ids = [
+        result_ids = [
+            res["id"] for res in pinecone_results
+        ]  # These are chunk IDs or plain doc IDs
         scores = {res["id"]: res["score"] for res in pinecone_results}
         pinecone_metadatas = {
             res["id"]: res.get("metadata", {}) for res in pinecone_results
         }
 
         # --- Fetch corresponding data from MongoDB ---
-
+        # We need:
+        # 1. Chunk documents (using result_ids where is_chunk is True)
+        # 2. Parent documents (using parent_document_id from chunk metadata)
+        # 3. Plain documents (using result_ids where is_chunk is False)
+        chunk_ids_to_fetch = set()
         parent_ids_to_fetch = set()
+        plain_doc_ids_to_fetch = set()
+
         for res_id in result_ids:
             meta = pinecone_metadatas.get(res_id, {})
             if meta.get("is_chunk"):
+                chunk_ids_to_fetch.add(res_id)
                 parent_id = meta.get("parent_document_id")
                 if parent_id:
                     parent_ids_to_fetch.add(parent_id)
             else:
-
-                parent_ids_to_fetch.add(res_id)
+                plain_doc_ids_to_fetch.add(res_id)
 
-
+        # Fetch all required docs from Mongo in potentially fewer queries
+        mongo_docs_map = {}
+        ids_to_fetch_mongo = list(
+            chunk_ids_to_fetch | parent_ids_to_fetch | plain_doc_ids_to_fetch
+        )
+
+        if ids_to_fetch_mongo:
            try:
                mongo_docs = self.mongo.find(
-                    self.collection, {"document_id": {"$in":
+                    self.collection, {"document_id": {"$in": ids_to_fetch_mongo}}
                )
                mongo_docs_map = {doc["document_id"]: doc for doc in mongo_docs}
            except Exception as e:
@@ -477,43 +541,67 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
             is_chunk = pinecone_meta.get("is_chunk", False)
             parent_doc_id = pinecone_meta.get("parent_document_id")
 
-            # Determine which Mongo doc holds the relevant info
-            mongo_doc_for_meta = None
-            mongo_doc_for_content = None
-            if is_chunk and parent_doc_id:
-                mongo_doc_for_meta = mongo_docs_map.get(parent_doc_id)
-                mongo_doc_for_content = mongo_doc_for_meta  # Parent holds full content
-            else:  # Not a chunk
-                mongo_doc_for_meta = mongo_docs_map.get(res_id)
-                mongo_doc_for_content = mongo_doc_for_meta
-
             result = {
-                "document_id": res_id,
+                "document_id": res_id,  # This is the chunk_id if is_chunk, else the doc_id
                 "score": scores.get(res_id, 0.0),
                 "is_chunk": is_chunk,
-                "parent_document_id": parent_doc_id,
+                "parent_document_id": parent_doc_id,  # Null if not a chunk
             }
 
+            mongo_doc = mongo_docs_map.get(
+                res_id
+            )  # Get the specific chunk or plain doc
+
+            # --- FIX: Skip result if corresponding Mongo doc not found ---
+            if not mongo_doc:
+                logger.warning(
+                    f"Document/chunk {res_id} found in Pinecone but not in MongoDB. Skipping."
+                )
+                continue
+            # --- End FIX ---
+
             if include_content:
                 content = None
-                # Priority 1: Reranking field in Pinecone metadata (holds chunk text)
+                # Priority 1: Reranking field in Pinecone metadata (holds chunk text if reranking)
+                # Note: This might be redundant if we fetch from Mongo anyway, but keep for flexibility
                 if (
                     self.pinecone.use_reranking
                     and self.pinecone.rerank_text_field in pinecone_meta
                 ):
                     content = pinecone_meta[self.pinecone.rerank_text_field]
-                # Priority 2: Get content from the
-                elif
-                    content =
+                # Priority 2: Get content from the fetched Mongo doc (chunk or plain doc)
+                elif mongo_doc:
+                    content = mongo_doc.get("content")
                 result["content"] = content or ""
 
             if include_metadata:
                 combined_meta = {}
-                #
-                if
+                # If it's a chunk, fetch the parent document's metadata
+                if is_chunk and parent_doc_id:
+                    parent_mongo_doc = mongo_docs_map.get(parent_doc_id)
+                    if parent_mongo_doc:
+                        # Extract metadata from parent, excluding fields specific to parent/content
+                        combined_meta = {
+                            k: v
+                            for k, v in parent_mongo_doc.items()
+                            if k
+                            not in [
+                                "_id",
+                                "document_id",
+                                "content",
+                                "pdf_data",  # pdf_data removed anyway
+                                "is_chunk",
+                                "parent_document_id",
+                                "created_at",
+                                "updated_at",
+                                "chunk_index",
+                            ]
+                        }
+                # If it's a plain doc, fetch its own metadata
+                elif not is_chunk and mongo_doc:
                     combined_meta = {
                         k: v
-                        for k, v in
+                        for k, v in mongo_doc.items()
                         if k
                         not in [
                             "_id",
@@ -524,15 +612,24 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
                             "parent_document_id",
                             "created_at",
                             "updated_at",
+                            "chunk_index",
                         ]
                     }
-
+
+                # Add/overwrite with chunk-specific info from Pinecone meta (like chunk_index)
+                # or specific metadata stored directly on the plain doc in Pinecone
                 combined_meta.update(
                     {
                         k: v
                         for k, v in pinecone_meta.items()
-                        # Avoid redundancy
-                        if k
+                        # Avoid redundancy with already included fields or internal fields
+                        if k
+                        not in [
+                            "document_id",
+                            "parent_document_id",
+                            "is_chunk",
+                            self.pinecone.rerank_text_field,
+                        ]
                     }
                 )
                 result["metadata"] = combined_meta
@@ -543,31 +640,56 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
 
     async def delete_document(
         self, document_id: str, namespace: Optional[str] = None
-    ) -> bool:
+    ) -> bool:  # pragma: no cover
         """
-        Delete a document (plain text or PDF) and all its associated chunks
+        Delete a parent document (plain text or PDF) and all its associated chunks
+        from both MongoDB and Pinecone. Cannot delete a chunk directly.
 
         Args:
-            document_id: ID of the parent document
+            document_id: ID of the parent document to delete.
             namespace: Optional Pinecone namespace.
 
         Returns:
-            True if deletion was successful (
+            True if deletion was successful in both stores (if applicable), False otherwise.
         """
-        logger.info(
+        logger.info(
             f"Attempting to delete document and associated data for ID: {document_id}"
         )
-
-
+        mongo_delete_error = False
+        pinecone_delete_error = False
+        document_found = False  # Track if the initial ID exists
+
+        # --- 0. Check if the target ID is a chunk ---
+        try:
+            target_doc = self.mongo.find_one(
+                self.collection,
+                {"document_id": document_id},
+            )
+            if target_doc and target_doc.get("is_chunk"):
+                logger.warning(
+                    f"Cannot delete chunk {document_id} directly. Delete the parent document."
+                )
+                return False  # Prevent deleting chunks directly
+            if target_doc:
+                document_found = True
+        except Exception as e:  # pragma: no cover
+            logger.error(
+                f"Error checking document type for {document_id} in MongoDB: {e}"
+            )  # pragma: no cover
+            return False  # pragma: no cover  # Fail if we can't even check the type
+
+        if not document_found:
+            logger.warning(f"Document {document_id} not found for deletion.")
+            # Even if not found, attempt cleanup in Pinecone just in case of inconsistency
+            # but the overall result should be False as the primary doc wasn't there.
+            pass  # Continue to attempt Pinecone cleanup, but final result will be False
 
         # --- 1. Find all associated document IDs in MongoDB ---
-
-
-        # A more robust way might be to query Pinecone directly for vectors with parent_document_id == document_id
-        # For now, assume IDs in Mongo cover what needs deletion.
-        docs_to_delete_mongo = []
-        mongo_ids_to_delete = set([document_id])  # Start with the main ID
+        mongo_ids_to_delete = set()
+        pinecone_ids_to_delete = set()
         try:
+            # Find parent doc and all chunk docs linked to it
+            # Use the ID confirmed not to be a chunk
             docs_to_delete_mongo = list(
                 self.mongo.find(
                     self.collection,
@@ -579,294 +701,68 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
                 },
             )
         )
-
-
+            if docs_to_delete_mongo:
+                document_found = True  # Confirm something was found related to the ID
+                for doc in docs_to_delete_mongo:
+                    mongo_ids_to_delete.add(doc["document_id"])
+                    pinecone_ids_to_delete.add(doc["document_id"])
+            elif document_found:  # Parent existed but no chunks found (plain text doc)
+                mongo_ids_to_delete.add(document_id)
+                pinecone_ids_to_delete.add(document_id)
+            # If !document_found initially, sets remain empty unless fallback below happens
+
         except Exception as e:
-            logger.warning(
-                f"Error finding documents in MongoDB for deletion ({document_id}): {e}.
+            logger.warning(
+                f"Error finding associated documents in MongoDB for deletion ({document_id}): {e}. Attempting Pinecone/Mongo deletion with main ID only."
             )
-
-
+            # Fallback: try deleting the main ID from Pinecone/Mongo
+            if document_found:  # Only add if we confirmed the initial doc existed
+                pinecone_ids_to_delete.add(document_id)
+                mongo_ids_to_delete.add(document_id)
+
+        # Convert sets to lists for deletion methods
+        pinecone_ids_list = list(pinecone_ids_to_delete)
+        mongo_ids_list = list(mongo_ids_to_delete)
+
+        # If no IDs were found at all, and the initial doc wasn't found, return False
+        if not document_found and not mongo_ids_list and not pinecone_ids_list:
+            logger.info(f"No trace of document {document_id} found to delete.")
+            return False
 
         # --- 2. Delete from Pinecone ---
-        if
+        if pinecone_ids_list:
             try:
-                await self.pinecone.delete(
-
+                await self.pinecone.delete(ids=pinecone_ids_list, namespace=namespace)
+                logger.info(
+                    f"Attempted deletion of {len(pinecone_ids_list)} vectors from Pinecone for {document_id}."
                 )
-                logger.info(  # Use logger.info
-                    f"Deleted {len(pinecone_ids_to_delete)} vectors from Pinecone for parent {document_id}."
-                )
-                pinecone_deleted = True
             except Exception as e:
                 logger.error(
-                    f"Error deleting vectors from Pinecone for {document_id}: {e}"
-                )
+                    f"Error deleting vectors from Pinecone for {document_id} (IDs: {pinecone_ids_list}): {e}"
+                )
+                pinecone_delete_error = True  # Track error
 
         # --- 3. Delete from MongoDB ---
-
-
-        if mongo_ids_found_in_db:
+        mongo_deleted_count = 0
+        if mongo_ids_list:
             try:
                 delete_result = self.mongo.delete_many(
-                    self.collection, {"document_id": {"$in":
+                    self.collection, {"document_id": {"$in": mongo_ids_list}}
                 )
                 mongo_deleted_count = delete_result.deleted_count
-
-
-
-            except Exception as e:
-                logger.error(
-                    f"Error deleting documents from MongoDB for {document_id}: {e}"
-                )  # Use logger.error
-
-        return pinecone_deleted or mongo_deleted_count > 0
-
-    async def update_document(
-        self,
-        document_id: str,
-        text: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        namespace: Optional[str] = None,
-    ) -> bool:
-        """
-        Update an existing plain text document or metadata. Embeds using OpenAI.
-        Updating PDF content requires deleting and re-adding.
-
-        Args:
-            document_id: ID of document to update.
-            text: Optional new text content (for plain text docs only).
-            metadata: Optional metadata to update.
-            namespace: Optional Pinecone namespace.
-
-        Returns:
-            True if successful.
-        """
-        current_doc = self.mongo.find_one(self.collection, {"document_id": document_id})
-        if not current_doc:
-            logger.warning(
-                f"Document {document_id} not found for update."
-            )  # Use logger.warning
-            return False
-
-        if current_doc.get("is_chunk"):
-            logger.warning(
-                f"Cannot update chunk {document_id} directly."
-            )  # Use logger.warning
-            return False
-        if current_doc.get("pdf_data") and text is not None:
-            logger.warning(
-                "Cannot update PDF content via this method. Delete and re-add."
-            )  # Use logger.warning
-            return False
-
-        update_text = text is not None and not current_doc.get("pdf_data")
-        text_content = text if update_text else current_doc.get("content", "")
-
-        # --- Update MongoDB ---
-        mongo_update = {}
-        if metadata:
-            mongo_update.update(metadata)
-        if update_text:
-            mongo_update["content"] = text_content
-        mongo_update["updated_at"] = dt.now(tz=dt.now().astimezone().tzinfo)
-
-        mongo_updated = False
-        if mongo_update:  # Only update if there are changes
-            try:
-                update_result = self.mongo.update_one(
-                    self.collection,
-                    {"document_id": document_id},
-                    {"$set": mongo_update},
-                )
-                mongo_updated = update_result.modified_count > 0
-            except Exception as e:
-                logger.error(
-                    f"Error updating document {document_id} in MongoDB: {e}"
-                )  # Use logger.error
-                # Decide if we should proceed to Pinecone update if Mongo failed
-                return False  # Return False if Mongo update fails
-
-        # --- Update Pinecone (only if text changed) ---
-        pinecone_updated = False
-        if update_text:
-            # Embed updated text
-            embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
-            try:
-                embedding = await embed_model.aget_text_embedding(text_content)
-            except Exception as e:
-                logger.error(
-                    f"Error embedding updated text for {document_id}: {e}"
-                )  # Use logger.error
-                # Mongo update might have succeeded, but embedding failed
-                return mongo_updated  # Return based on Mongo success
-
-            # Prepare Pinecone metadata
-            final_metadata = {**current_doc, **mongo_update}  # Use updated data
-            pinecone_metadata = {"document_id": document_id, "is_chunk": False}
-            for key, value in final_metadata.items():
-                if key not in [
-                    "_id",
-                    "content",
-                    "pdf_data",
-                    "created_at",
-                    "updated_at",
-                    "document_id",
-                    "is_chunk",
-                    "parent_document_id",
-                ]:
-                    pinecone_metadata[key] = value
-            if self.pinecone.use_reranking:
-                pinecone_metadata[self.pinecone.rerank_text_field] = text_content
-
-            # Upsert vector to Pinecone
-            try:
-                await self.pinecone.upsert(
-                    vectors=[
-                        {
-                            "id": document_id,
-                            "values": embedding,
-                            "metadata": pinecone_metadata,
-                        }
-                    ],
-                    namespace=namespace,
-                )
-                pinecone_updated = True
-            except Exception as e:
-                logger.error(  # Use logger.error
-                    f"Error upserting updated vector in Pinecone for {document_id}: {e}"
-                )
-                # Mongo update succeeded, Pinecone failed
-
-        return mongo_updated or pinecone_updated
-
-    async def add_documents_batch(
-        self,
-        # Expects {'text': ..., 'metadata': ...}
-        documents: List[Dict[str, Any]],
-        namespace: Optional[str] = None,
-        batch_size: int = 50,
-    ) -> List[str]:
-        """
-        Add multiple plain text documents in batches using OpenAI embeddings.
-
-        Args:
-            documents: List of documents, each with 'text' and 'metadata'.
-            namespace: Optional Pinecone namespace.
-            batch_size: Number of documents per embedding/upsert batch.
-
-        Returns:
-            List of added document IDs.
-        """
-        all_doc_ids = []
-        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
-
-        for i in range(0, len(documents), batch_size):
-            batch_docs_input = documents[i : i + batch_size]
-            batch_texts = [doc["text"] for doc in batch_docs_input]
-            batch_metadatas = [doc["metadata"] for doc in batch_docs_input]
-            # Generate IDs if not provided in metadata
-            batch_doc_ids = [
-                doc["metadata"].get("document_id") or str(uuid.uuid4())
-                for doc in batch_docs_input
-            ]
-            all_doc_ids.extend(batch_doc_ids)
-
-            # Prepare MongoDB docs
-            mongo_batch = []
-            for idx, text in enumerate(batch_texts):
-                doc_id = batch_doc_ids[idx]
-                metadata = batch_metadatas[idx]
-                mongo_doc = {
-                    "document_id": doc_id,
-                    "content": text,
-                    "is_chunk": False,
-                    "parent_document_id": None,
-                    **metadata,
-                    "created_at": metadata.get(
-                        "created_at", dt.now(tz=dt.now().astimezone().tzinfo)
-                    ),
-                    "updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
-                }
-                # Ensure generated ID is in the doc for Mongo
-                if "document_id" not in metadata:
-                    mongo_doc["document_id"] = doc_id
-                mongo_batch.append(mongo_doc)
-
-            # Insert into MongoDB
-            if mongo_batch:
-                try:
-                    self.mongo.insert_many(self.collection, mongo_batch)
-                except Exception as e:
-                    logger.error(  # Use logger.error
-                        f"Error inserting batch {i // batch_size + 1} into MongoDB: {e}"
+            if mongo_deleted_count > 0:
+                logger.info(
+                    f"Deleted {mongo_deleted_count} documents from MongoDB for {document_id}."
                     )
-
-
+                # else: # No need to log if count is 0, covered by initial find log
+                #     logger.info(f"No documents found to delete in MongoDB for {document_id} with IDs: {mongo_ids_list}")
 
-            # Embed batch using OpenAIEmbedding
-            try:
-                batch_embeddings = await embed_model.aget_text_embedding_batch(
-                    batch_texts, show_progress=True
-                )
             except Exception as e:
-                logger.error(
-                    f"Error
-                )
-                continue  # Skip Pinecone upsert for this batch
-
-            # Prepare Pinecone vectors
-            pinecone_vectors = []
-            for idx, doc_id in enumerate(batch_doc_ids):
-                metadata = batch_metadatas[idx]
-                pinecone_meta = {
-                    "document_id": doc_id,
-                    "is_chunk": False,
-                    "source": metadata.get("source", "unknown"),
-                    "tags": metadata.get("tags", []),
-                }
-                if self.pinecone.use_reranking:
-                    pinecone_meta[self.pinecone.rerank_text_field] = batch_texts[idx]
-
-                pinecone_vectors.append(
-                    {
-                        "id": doc_id,
-                        "values": batch_embeddings[idx],
-                        "metadata": pinecone_meta,
-                    }
+                logger.error(
+                    f"Error deleting documents from MongoDB for {document_id} (IDs: {mongo_ids_list}): {e}"
                 )
+                mongo_delete_error = True  # Track error
 
-
-
-
-                await self.pinecone.upsert(
-                    vectors=pinecone_vectors, namespace=namespace
-                )
-            except Exception as e:
-                logger.error(  # Use logger.error
-                    f"Error upserting vector batch {i // batch_size + 1} to Pinecone: {e}"
-                )
-
-            # Optional delay
-            if i + batch_size < len(documents):
-                await asyncio.sleep(0.1)
-
-        return all_doc_ids
-
-    async def get_full_document(self, document_id: str) -> Optional[Dict[str, Any]]:
-        """
-        Retrieve the full document entry (including PDF data if applicable) from MongoDB.
-
-        Args:
-            document_id: The ID of the document (parent ID if it was a PDF).
-
-        Returns:
-            The document dictionary from MongoDB, or None if not found.
-        """
-        try:
-            return self.mongo.find_one(self.collection, {"document_id": document_id})
-        except Exception as e:
-            logger.error(
-                f"Error retrieving full document {document_id} from MongoDB: {e}"
-            )  # Use logger.error
-            return None
+        # Return True only if the document was initially found and no errors occurred during deletion attempts
+        # If the document wasn't found initially, return False even if cleanup attempts were made.
+        return document_found and not mongo_delete_error and not pinecone_delete_error
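The rewritten `delete_document` also tightens the return contract: chunk IDs are rejected outright, a missing parent yields `False` even though Pinecone cleanup is still attempted, and `True` now requires that the parent existed and that neither store raised during deletion (the old version returned `True` on any partial success). A hedged usage sketch:

```python
# Hedged sketch of the new delete semantics; `kb` is a configured
# KnowledgeBaseService instance (construction elided).
async def cleanup(kb) -> None:
    ok = await kb.delete_document("doc-123")
    # ok is True only if "doc-123" existed and neither the MongoDB nor the
    # Pinecone deletion raised.

    refused = await kb.delete_document("doc-123_chunk_0")
    # Always False: chunk IDs are rejected; delete the parent instead.
```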
--- a/solana_agent-28.3.3.dist-info/METADATA
+++ b/solana_agent-29.1.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: solana-agent
-Version: 28.3.3
+Version: 29.1.0
 Summary: AI Agents for Solana
 License: MIT
 Keywords: solana,solana ai,solana agent,ai,ai agent,ai agents
@@ -18,7 +18,7 @@ Requires-Dist: instructor (==1.7.9)
 Requires-Dist: llama-index-core (==0.12.32)
 Requires-Dist: llama-index-embeddings-openai (==0.3.1)
 Requires-Dist: logfire (==3.14.1)
-Requires-Dist: openai (==1.
+Requires-Dist: openai (==1.77.0)
 Requires-Dist: pillow (==11.2.1)
 Requires-Dist: pinecone (==6.0.2)
 Requires-Dist: pydantic (>=2)
@@ -27,7 +27,7 @@ Requires-Dist: pypdf (==5.4.0)
 Requires-Dist: rich (>=13,<14.0)
 Requires-Dist: scrubadub (==2.0.1)
 Requires-Dist: typer (==0.15.3)
-Requires-Dist: zep-cloud (==2.
+Requires-Dist: zep-cloud (==2.12.1)
 Project-URL: Documentation, https://docs.solana-agent.com
 Project-URL: Homepage, https://solana-agent.com
 Project-URL: Repository, https://github.com/truemagic-coder/solana-agent
@@ -55,6 +55,7 @@ Build your AI agents in three lines of code!
 * Three lines of code setup
 * Simple Agent Definition
 * Fast Responses
+* Multi-Vendor Support
 * Solana Ecosystem Integration
 * Multi-Agent Swarm
 * Multi-Modal (Images & Audio & Text)
@@ -79,6 +80,7 @@ Build your AI agents in three lines of code!
 * Easy three lines of code setup
 * Simple agent definition using JSON
 * Fast AI responses
+* Multi-vendor support including OpenAI, Grok, and Gemini AI services
 * Solana Ecosystem Integration via [AgentiPy](https://github.com/niceberginc/agentipy)
 * MCP tool usage with first-class support for [Zapier](https://zapier.com/mcp)
 * Integrated observability and tracing via [Pydantic Logfire](https://pydantic.dev/logfire)
@@ -112,7 +114,8 @@ Build your AI agents in three lines of code!
 
 ### AI Models Used
 
-
+**OpenAI**
+* [gpt-4.1](https://platform.openai.com/docs/models/gpt-4.1) (agent - can be overridden)
 * [gpt-4.1-nano](https://platform.openai.com/docs/models/gpt-4.1-nano) (router)
 * [text-embedding-3-large](https://platform.openai.com/docs/models/text-embedding-3-large) (embedding)
 * [tts-1](https://platform.openai.com/docs/models/tts-1) (audio TTS)
@@ -120,6 +123,12 @@ Build your AI agents in three lines of code!
 * [gpt-image-1](https://platform.openai.com/docs/models/gpt-image-1) (image generation)
 * [gpt-4o-mini-search-preview](https://platform.openai.com/docs/models/gpt-4o-mini-search-preview) (Internet search)
 
+**Grok**
+* [grok-3-fast](https://x.ai/api#pricing) (agent - optional)
+
+**Gemini**
+* [gemini-2.5-pro-preview-03-25](https://ai.google.dev/gemini-api/docs/models#gemini-2.5-pro-preview-03-25) (agent - optional)
+
 ## Installation
 
 You can install Solana Agent using pip:
@@ -421,9 +430,33 @@ config = {
 }
 ```
 
+### Grok
+
+`grok-3-fast` can be used instead of `gpt-4.1` for the agent model
+
+```python
+config = {
+    "grok": {
+        "api_key": "your-grok-api-key",
+    },
+}
+```
+
+### Gemini
+
+`gemini-2.5-pro-preview-03-25` can be used instead of `gpt-4.1` for the agent model
+
+```python
+config = {
+    "gemini": {
+        "api_key": "your-gemini-api-key",
+    },
+}
+```
+
 ### Knowledge Base
 
-The Knowledge Base (KB) is meant to store text values and/or
+The Knowledge Base (KB) is meant to store text values and/or PDFs (extracts text) - can handle very large PDFs.
 
 ```python
 config = {
--- a/solana_agent-28.3.3.dist-info/RECORD
+++ b/solana_agent-29.1.0.dist-info/RECORD
@@ -1,19 +1,19 @@
 solana_agent/__init__.py,sha256=g83qhMOCwcWL19V4CYbQwl0Ykpb0xn49OUh05i-pu3g,1001
 solana_agent/adapters/__init__.py,sha256=tiEEuuy0NF3ngc_tGEcRTt71zVI58v3dYY9RvMrF2Cg,204
-solana_agent/adapters/mongodb_adapter.py,sha256=
+solana_agent/adapters/mongodb_adapter.py,sha256=Hq3S8VzfLmnPjV40z8yJXGqUamOJcX5GbOMd-1nNWO4,3175
 solana_agent/adapters/openai_adapter.py,sha256=XnocNAV1nJGcjpRgOyMXnyDQSU8HvTx9zmb4pWtSb58,23432
 solana_agent/adapters/pinecone_adapter.py,sha256=XlfOpoKHwzpaU4KZnovO2TnEYbsw-3B53ZKQDtBeDgU,23847
 solana_agent/cli.py,sha256=FGvTIQmKLp6XsQdyKtuhIIfbBtMmcCCXfigNrj4bzMc,4704
 solana_agent/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-solana_agent/client/solana_agent.py,sha256
+solana_agent/client/solana_agent.py,sha256=z_8i_dDJOV5JcZvvRNsRulCd40RvIUuE_f231fRXBDo,8975
 solana_agent/domains/__init__.py,sha256=HiC94wVPRy-QDJSSRywCRrhrFfTBeHjfi5z-QfZv46U,168
 solana_agent/domains/agent.py,sha256=3Q1wg4eIul0CPpaYBOjEthKTfcdhf1SAiWc2R-IMGO8,2561
 solana_agent/domains/routing.py,sha256=1yR4IswGcmREGgbOOI6TKCfuM7gYGOhQjLkBqnZ-rNo,582
 solana_agent/factories/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-solana_agent/factories/agent_factory.py,sha256=
+solana_agent/factories/agent_factory.py,sha256=PUjno9bgbkkp0s1oDSR_NXc3b60eb4H44BJF32z9dls,14286
 solana_agent/guardrails/pii.py,sha256=FCz1IC3mmkr41QFFf5NaC0fwJrVkwFsxgyOCS2POO5I,4428
 solana_agent/interfaces/__init__.py,sha256=IQs1WIM1FeKP1-kY2FEfyhol_dB-I-VAe2rD6jrVF6k,355
-solana_agent/interfaces/client/client.py,sha256=
+solana_agent/interfaces/client/client.py,sha256=hFYe04lFGbp4BDlUMOnYQrp_SQXFPcktGOwsi0F87vU,3140
 solana_agent/interfaces/guardrails/guardrails.py,sha256=gZCQ1FrirW-mX6s7FoYrbRs6golsp-x269kk4kQiZzc,572
 solana_agent/interfaces/plugins/plugins.py,sha256=Rz52cWBLdotwf4kV-2mC79tRYlN29zHSu1z9-y1HVPk,3329
 solana_agent/interfaces/providers/data_storage.py,sha256=Y92Cq8BtC55VlsYLD7bo3ofqQabNnlg7Q4H1Q6CDsLU,1713
@@ -21,7 +21,7 @@ solana_agent/interfaces/providers/llm.py,sha256=FbK6HNMBOIONPE-ljPRElkO2fmFbkzWE
 solana_agent/interfaces/providers/memory.py,sha256=h3HEOwWCiFGIuFBX49XOv1jFaQW3NGjyKPOfmQloevk,1011
 solana_agent/interfaces/providers/vector_storage.py,sha256=XPYzvoWrlDVFCS9ItBmoqCFWXXWNYY-d9I7_pvP7YYk,1561
 solana_agent/interfaces/services/agent.py,sha256=MgLudTwzCzzzSR6PsVTB-w5rhGDHB5B81TGjo2z3G-A,2152
-solana_agent/interfaces/services/knowledge_base.py,sha256=
+solana_agent/interfaces/services/knowledge_base.py,sha256=Mu8lCGFXPmI_IW5LRGti7octLoWZIg4k5PmGwPfe7LQ,1479
 solana_agent/interfaces/services/query.py,sha256=eLMMwc8hwHHjxFxlvVvkZfoQi8cSgQycWJbYAVphl9E,1632
 solana_agent/interfaces/services/routing.py,sha256=Qbn3-DQGVSQKaegHDekSFmn_XCklA0H2f0XUx9-o3wA,367
 solana_agent/plugins/__init__.py,sha256=coZdgJKq1ExOaj6qB810i3rEhbjdVlrkN76ozt_Ojgo,193
@@ -33,11 +33,11 @@ solana_agent/repositories/__init__.py,sha256=fP83w83CGzXLnSdq-C5wbw9EhWTYtqE2lQT
 solana_agent/repositories/memory.py,sha256=e-27ju6wmurxSxULzr_uDHxxdnvw8KrJt9NWyvAz-i4,7684
 solana_agent/services/__init__.py,sha256=iko0c2MlF8b_SA_nuBGFllr2E3g_JowOrOzGcnU9tkA,162
 solana_agent/services/agent.py,sha256=QoeQq_OEWyLdBS0FPa-lXm5qiE0RnRfrCKiFTfOSGE0,42369
-solana_agent/services/knowledge_base.py,sha256=
+solana_agent/services/knowledge_base.py,sha256=ZvOPrSmcNDgUzz4bJIQ4LeRl9vMZiK9hOfs71IpB7Bk,32735
 solana_agent/services/query.py,sha256=ENUfs4WSTpODMRXppDVW-Y3li9jYn8pOfQIHIPerUdQ,18498
 solana_agent/services/routing.py,sha256=C5Ku4t9TqvY7S8wlUPMTC04HCrT4Ib3E8Q8yX0lVU_s,7137
-solana_agent-28.3.3.dist-info/LICENSE,sha256=
-solana_agent-28.3.3.dist-info/METADATA,sha256=
-solana_agent-28.3.3.dist-info/WHEEL,sha256=
-solana_agent-28.3.3.dist-info/entry_points.txt,sha256=
-solana_agent-28.3.3.dist-info/RECORD,,
+solana_agent-29.1.0.dist-info/LICENSE,sha256=BnSRc-NSFuyF2s496l_4EyrwAP6YimvxWcjPiJ0J7g4,1057
+solana_agent-29.1.0.dist-info/METADATA,sha256=bo0_BwDaDVd3URyQhkH-y6TD4e6Dfyt6VjjwvXYjjwU,33486
+solana_agent-29.1.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+solana_agent-29.1.0.dist-info/entry_points.txt,sha256=-AuT_mfqk8dlZ0pHuAjx1ouAWpTRjpqvEUa6YV3lmc0,53
+solana_agent-29.1.0.dist-info/RECORD,,
LICENSE, WHEEL, and entry_points.txt: file contents unchanged (only the dist-info directory name changes with the version).