solana-agent 20.1.2-py3-none-any.whl → 31.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. solana_agent/__init__.py +10 -5
  2. solana_agent/adapters/ffmpeg_transcoder.py +375 -0
  3. solana_agent/adapters/mongodb_adapter.py +15 -2
  4. solana_agent/adapters/openai_adapter.py +679 -0
  5. solana_agent/adapters/openai_realtime_ws.py +1813 -0
  6. solana_agent/adapters/pinecone_adapter.py +543 -0
  7. solana_agent/cli.py +128 -0
  8. solana_agent/client/solana_agent.py +180 -20
  9. solana_agent/domains/agent.py +13 -13
  10. solana_agent/domains/routing.py +18 -8
  11. solana_agent/factories/agent_factory.py +239 -38
  12. solana_agent/guardrails/pii.py +107 -0
  13. solana_agent/interfaces/client/client.py +95 -12
  14. solana_agent/interfaces/guardrails/guardrails.py +26 -0
  15. solana_agent/interfaces/plugins/plugins.py +2 -1
  16. solana_agent/interfaces/providers/__init__.py +0 -0
  17. solana_agent/interfaces/providers/audio.py +40 -0
  18. solana_agent/interfaces/providers/data_storage.py +9 -2
  19. solana_agent/interfaces/providers/llm.py +86 -9
  20. solana_agent/interfaces/providers/memory.py +13 -1
  21. solana_agent/interfaces/providers/realtime.py +212 -0
  22. solana_agent/interfaces/providers/vector_storage.py +53 -0
  23. solana_agent/interfaces/services/agent.py +27 -12
  24. solana_agent/interfaces/services/knowledge_base.py +59 -0
  25. solana_agent/interfaces/services/query.py +41 -8
  26. solana_agent/interfaces/services/routing.py +0 -1
  27. solana_agent/plugins/manager.py +37 -16
  28. solana_agent/plugins/registry.py +34 -19
  29. solana_agent/plugins/tools/__init__.py +0 -5
  30. solana_agent/plugins/tools/auto_tool.py +1 -0
  31. solana_agent/repositories/memory.py +332 -111
  32. solana_agent/services/__init__.py +1 -1
  33. solana_agent/services/agent.py +390 -241
  34. solana_agent/services/knowledge_base.py +768 -0
  35. solana_agent/services/query.py +1858 -153
  36. solana_agent/services/realtime.py +626 -0
  37. solana_agent/services/routing.py +104 -51
  38. solana_agent-31.4.0.dist-info/METADATA +1070 -0
  39. solana_agent-31.4.0.dist-info/RECORD +49 -0
  40. {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info}/WHEEL +1 -1
  41. solana_agent-31.4.0.dist-info/entry_points.txt +3 -0
  42. solana_agent/adapters/llm_adapter.py +0 -160
  43. solana_agent-20.1.2.dist-info/METADATA +0 -464
  44. solana_agent-20.1.2.dist-info/RECORD +0 -35
  45. {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info/licenses}/LICENSE +0 -0
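solana_agent/services/knowledge_base.py (new file, item 34 above)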
@@ -0,0 +1,768 @@
+ import logging
+ from datetime import datetime as dt
+ from typing import Dict, List, Any, Optional, Union
+ import uuid
+ import asyncio
+ import io
+
+ # PDF and Chunking Imports
+ import pypdf
+ from llama_index.core import Document as LlamaDocument
+ from llama_index.core.node_parser import SemanticSplitterNodeParser
+ from llama_index.embeddings.openai import OpenAIEmbedding
+
+ from solana_agent.adapters.pinecone_adapter import PineconeAdapter
+ from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
+ from solana_agent.interfaces.services.knowledge_base import (
+     KnowledgeBaseService as KnowledgeBaseInterface,
+ )
+
+ # Setup logger for this module
+ logger = logging.getLogger(__name__)
+
+
+ class KnowledgeBaseService(KnowledgeBaseInterface):
+     """
+     Knowledge Base service using Pinecone for vector search and MongoDB for metadata/chunk storage.
+     Supports text documents and PDF semantic chunking using OpenAI embeddings via LlamaIndex.
+     PDF binary data is not stored. Chunks are stored individually in MongoDB.
+     """
+
+     def __init__(
+         self,
+         pinecone_adapter: PineconeAdapter,
+         mongodb_adapter: MongoDBAdapter,
+         openai_api_key: str,
+         openai_model_name: str = "text-embedding-3-large",
+         collection_name: str = "knowledge_documents",
+         rerank_results: bool = False,
+         rerank_top_k: int = 3,
+         # Semantic Splitter Config
+         splitter_buffer_size: int = 1,
+         splitter_breakpoint_percentile: int = 95,
+     ):
+         """
+         Initialize the Knowledge Base service.
+
+         Args:
+             pinecone_adapter: Configured PineconeAdapter instance.
+             mongodb_adapter: Configured MongoDBAdapter instance.
+             openai_api_key: OpenAI API key for embedding.
+             openai_model_name: OpenAI embedding model name.
+             collection_name: MongoDB collection for storing document metadata and chunks.
+             rerank_results: Whether PineconeAdapter should rerank results.
+             rerank_top_k: Number of results to return after reranking (used by PineconeAdapter).
+             splitter_buffer_size: Buffer size for SemanticSplitterNodeParser.
+             splitter_breakpoint_percentile: Breakpoint percentile for SemanticSplitterNodeParser.
+         """
+         self.pinecone = pinecone_adapter
+         self.mongo = mongodb_adapter
+         self.collection = collection_name
+         # Store rerank config for reference, but PineconeAdapter applies it
+         self.rerank_results = rerank_results
+         self.rerank_top_k = rerank_top_k
+
+         # --- Initialize Semantic Splitter with OpenAIEmbedding ---
+         api_key = openai_api_key
+         if not api_key:
+             raise ValueError(
+                 "OpenAI API key not provided via argument or OPENAI_API_KEY environment variable."
+             )
+
+         # Determine expected embedding dimensions based on model name
+         if openai_model_name == "text-embedding-3-large":
+             openai_dimensions = 3072
+         elif openai_model_name == "text-embedding-3-small":
+             openai_dimensions = 1536
+         else:
+             # Attempt to get dimension from Pinecone config if available, otherwise raise error
+             openai_dimensions = getattr(self.pinecone, "embedding_dimensions", 0)
+             if openai_dimensions <= 0:
+                 raise ValueError(
+                     f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured."
+                 )
+             logger.warning(  # Use logger.warning
+                 f"Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct."
+             )
+
+         # Instantiate OpenAIEmbedding
+         # Note: LlamaIndex OpenAIEmbedding doesn't directly support reducing dimensions via 'dimensions' param during init
+         # like the raw OpenAI client does. It uses the model's default.
+         try:
+             llama_embed_model = OpenAIEmbedding(
+                 model=openai_model_name,
+                 api_key=api_key,
+                 # embed_batch_size=10 # Optional: Adjust batch size if needed
+             )
+         except Exception as e:
+             logger.error(f"Error initializing OpenAIEmbedding: {e}")  # Use logger.error
+             raise
+
+         self.semantic_splitter = SemanticSplitterNodeParser(
+             buffer_size=splitter_buffer_size,
+             breakpoint_percentile_threshold=splitter_breakpoint_percentile,
+             embed_model=llama_embed_model,  # Use the OpenAIEmbedding instance
+         )
+         # Store model name for logging/reference
+         self.openai_model_name = openai_model_name
+         # --- End Semantic Splitter Init ---
+
+         self._ensure_collection()
+
+     def _ensure_collection(self) -> None:
+         """Set up MongoDB collection with appropriate indexes."""
+         if not self.mongo.collection_exists(self.collection):
+             self.mongo.create_collection(self.collection)
+             logger.info(
+                 f"Created MongoDB collection: {self.collection}"
+             )  # Use logger.info
+
+         # Indexes for retrieval and filtering
+         self.mongo.create_index(self.collection, [("document_id", 1)], unique=True)
+         self.mongo.create_index(self.collection, [("parent_document_id", 1)])
+         self.mongo.create_index(self.collection, [("source", 1)])
+         self.mongo.create_index(self.collection, [("created_at", -1)])
+         self.mongo.create_index(self.collection, [("tags", 1)])
+         self.mongo.create_index(self.collection, [("is_chunk", 1)])
+         logger.info(
+             f"Ensured indexes exist for MongoDB collection: {self.collection}"
+         )  # Use logger.info
+
+     async def add_document(
+         self,
+         text: str,
+         metadata: Dict[str, Any],
+         document_id: Optional[str] = None,
+         namespace: Optional[str] = None,
+     ) -> str:  # pragma: no cover
+         """
+         Add a plain text document to the knowledge base. Embeds using OpenAI.
+
+         Args:
+             text: Document text content.
+             metadata: Document metadata.
+             document_id: Optional document ID.
+             namespace: Optional Pinecone namespace.
+
+         Returns:
+             The document ID.
+         """
+         doc_id = document_id or str(uuid.uuid4())
+         now = dt.now(tz=dt.now().astimezone().tzinfo)
+
+         # Store metadata and content in MongoDB
+         mongo_doc = {
+             "document_id": doc_id,
+             "content": text,
+             "is_chunk": False,
+             "parent_document_id": None,
+             **metadata,
+             # Use timezone aware datetime
+             "created_at": metadata.get("created_at", now),
+             "updated_at": now,
+         }
+         try:
+             self.mongo.insert_one(self.collection, mongo_doc)
+         except Exception as e:
+             logger.error(
+                 f"Error inserting document {doc_id} into MongoDB: {e}"
+             )  # Use logger.error
+             raise
+
+         # Embed text using OpenAIEmbedding
+         embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+         try:
+             embedding = await embed_model.aget_text_embedding(text)
+         except Exception as e:
+             logger.error(  # Use logger.error
+                 f"Error embedding document {doc_id} using {self.openai_model_name}: {e}"
+             )
+             # Decide how to handle - Mongo insert succeeded, embedding failed
+             raise  # Re-raise for now
+
+         # Prepare Pinecone metadata
+         pinecone_metadata = {
+             "document_id": doc_id,
+             "is_chunk": False,
+             "parent_document_id": False,  # Explicitly set for clarity - Pinecone can't use None
+             "source": metadata.get("source", "unknown"),
+             "tags": metadata.get("tags", []),
+         }
+         # Add text itself if Pinecone adapter reranking is used
+         if self.pinecone.use_reranking:
+             pinecone_metadata[self.pinecone.rerank_text_field] = text
+
+         # Upsert vector to Pinecone using the generic upsert method
+         try:
+             await self.pinecone.upsert(
+                 vectors=[
+                     {"id": doc_id, "values": embedding, "metadata": pinecone_metadata}
+                 ],
+                 namespace=namespace,
+             )
+         except Exception as e:
+             logger.error(
+                 f"Error upserting vector for {doc_id} to Pinecone: {e}"
+             )  # Use logger.error
+             # Decide how to handle - Mongo insert succeeded, Pinecone failed
+             raise  # Re-raise for now
+
+         return doc_id
+
+     async def add_pdf_document(
+         self,
+         pdf_data: Union[bytes, str],
+         metadata: Dict[str, Any],
+         document_id: Optional[str] = None,
+         namespace: Optional[str] = None,
+         chunk_batch_size: int = 50,
+     ) -> str:  # pragma: no cover
+         """
+         Add a PDF document, performs semantic chunking using OpenAI embeddings,
+         stores parent metadata and individual chunks in Mongo, and chunk vectors in Pinecone.
+         Full PDF binary is NOT stored.
+
+         Args:
+             pdf_data: PDF content as bytes or a path to the PDF file.
+             metadata: Document metadata (applies to the parent PDF).
+             document_id: Optional parent document ID.
+             namespace: Optional Pinecone namespace for chunks.
+             chunk_batch_size: Batch size for embedding and upserting chunks.
+
+         Returns:
+             The parent document ID.
+         """
+         parent_doc_id = document_id or str(uuid.uuid4())
+         pdf_bytes: bytes
+         now = dt.now(tz=dt.now().astimezone().tzinfo)
+
+         # --- 1. Read PDF and Extract Text ---
+         try:
+             if isinstance(pdf_data, str):
+                 with open(pdf_data, "rb") as f:
+                     pdf_bytes = f.read()
+             elif isinstance(pdf_data, bytes):
+                 pdf_bytes = pdf_data
+             else:
+                 raise ValueError("pdf_data must be bytes or a file path string.")
+
+             reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
+             extracted_text = "".join(page.extract_text() or "" for page in reader.pages)
+             if not extracted_text.strip():
+                 logger.warning(
+                     f"No text extracted from PDF {parent_doc_id}."
+                 )  # Use logger.warning
+                 # Still store parent metadata even if no text
+         except Exception as e:
+             logger.error(
+                 f"Error reading or extracting text from PDF {parent_doc_id}: {e}"
+             )  # Use logger.error
+             raise
+
+         # --- 2. Store Parent PDF Metadata in MongoDB (NO BINARY) ---
+         mongo_parent_doc = {
+             "document_id": parent_doc_id,
+             "content": None,
+             "is_chunk": False,
+             "parent_document_id": None,
+             **metadata,
+             "created_at": metadata.get("created_at", now),
+             "updated_at": now,
+         }
+         try:
+             self.mongo.insert_one(self.collection, mongo_parent_doc)
+             logger.info(
+                 f"Stored parent metadata for PDF {parent_doc_id} in MongoDB."
+             )  # Use logger.info
+         except Exception as e:
+             logger.error(
+                 f"Error inserting parent PDF metadata {parent_doc_id} into MongoDB: {e}"
+             )
+             raise
+
+         # --- 3. Semantic Chunking ---
+         if not extracted_text.strip():
+             logger.info(  # Use logger.info
+                 f"Skipping chunking for PDF {parent_doc_id} due to no extracted text."
+             )
+             return parent_doc_id  # Return parent ID even if no chunks
+
+         try:
+             llama_doc = LlamaDocument(text=extracted_text)
+             # Run synchronous splitter in thread pool
+             nodes = await asyncio.to_thread(
+                 self.semantic_splitter.get_nodes_from_documents, [llama_doc]
+             )
+             logger.info(
+                 f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}."
+             )  # Use logger.info
+         except Exception as e:
+             logger.error(
+                 f"Error during semantic chunking for PDF {parent_doc_id}: {e}"
+             )  # Use logger.error
+             # Parent metadata is already stored, decide how to proceed. Raising for now.
+             raise
+
+         # --- 4. Embed Chunks and Batch Upsert to Pinecone AND Store Chunks in MongoDB ---
+         if not nodes:
+             return parent_doc_id  # No chunks generated
+
+         logger.info(
+             f"Embedding {len(nodes)} chunks using {self.openai_model_name}..."
+         )  # Use logger.info
+         chunk_texts = [node.get_content() for node in nodes]
+         embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+         all_chunk_embeddings = []
+
+         # Embed chunks in batches (using embed_model's internal batching)
+         try:
+             # Use aget_text_embedding_batch for async embedding
+             all_chunk_embeddings = await embed_model.aget_text_embedding_batch(
+                 chunk_texts, show_progress=True
+             )
+         except Exception as e:
+             logger.error(
+                 f"Error embedding chunks for PDF {parent_doc_id}: {e}"
+             )  # Use logger.error
+             raise  # Stop if embedding fails
+
+         logger.info(
+             "Embedding complete. Preparing vectors for Pinecone and documents for MongoDB."
+         )  # Use logger.info
+         pinecone_vectors = []
+         mongo_chunk_docs = []
+         chunk_now = dt.now(
+             tz=dt.now().astimezone().tzinfo
+         )  # Consistent timestamp for chunks
+
+         for i, node in enumerate(nodes):
+             chunk_id = f"{parent_doc_id}_chunk_{i}"
+             chunk_text = chunk_texts[i]
+
+             # Prepare Pinecone Vector Metadata
+             pinecone_chunk_metadata = {
+                 "document_id": chunk_id,  # Pinecone ID is the chunk ID
+                 "parent_document_id": parent_doc_id,
+                 "chunk_index": i,
+                 "is_chunk": True,
+                 "source": metadata.get("source", "unknown"),  # Inherit from parent
+                 "tags": metadata.get("tags", []),  # Inherit from parent
+             }
+             # Add chunk text itself if Pinecone adapter reranking is used
+             if self.pinecone.use_reranking:
+                 pinecone_chunk_metadata[self.pinecone.rerank_text_field] = chunk_text
+
+             pinecone_vectors.append(
+                 {
+                     "id": chunk_id,
+                     "values": all_chunk_embeddings[i],
+                     "metadata": pinecone_chunk_metadata,
+                 }
+             )
+
+             # Prepare MongoDB Chunk Document
+             mongo_chunk_doc = {
+                 "document_id": chunk_id,  # Mongo ID is the chunk ID
+                 "parent_document_id": parent_doc_id,
+                 "chunk_index": i,
+                 "is_chunk": True,
+                 "content": chunk_text,  # Store chunk text in Mongo
+                 "source": metadata.get("source", "unknown"),  # Inherit from parent
+                 "tags": metadata.get("tags", []),  # Inherit from parent
+                 # Add other relevant parent metadata if needed, avoid duplication if possible
+                 "created_at": chunk_now,  # Use consistent time for batch
+                 "updated_at": chunk_now,
+             }
+             mongo_chunk_docs.append(mongo_chunk_doc)
+
+         # --- 5. Store Chunks in MongoDB ---
+         if mongo_chunk_docs:
+             try:
+                 self.mongo.insert_many(self.collection, mongo_chunk_docs)
+                 logger.info(
+                     f"Stored {len(mongo_chunk_docs)} chunks in MongoDB for parent {parent_doc_id}."
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Error inserting chunks into MongoDB for parent {parent_doc_id}: {e}"
+                 )
+                 # Decide how to handle: Pinecone upsert might still proceed or fail.
+                 # For now, log the error and continue to Pinecone upsert attempt.
+
+         # --- 6. Upsert Chunk Vectors to Pinecone in Batches ---
+         if not pinecone_vectors:
+             logger.warning(f"No vectors generated to upsert for PDF {parent_doc_id}.")
+             return parent_doc_id
+
+         logger.info(  # Use logger.info
+             f"Upserting {len(pinecone_vectors)} chunk vectors to Pinecone in batches of {chunk_batch_size}..."
+         )
+         upsert_tasks = []
+         for i in range(0, len(pinecone_vectors), chunk_batch_size):
+             batch_vectors = pinecone_vectors[i : i + chunk_batch_size]
+             # Create task for each batch upsert
+             upsert_tasks.append(
+                 self.pinecone.upsert(vectors=batch_vectors, namespace=namespace)
+             )
+             # Optional: Add a small delay between initiating tasks if rate limiting is a concern
+             # await asyncio.sleep(0.05)
+
+         # Run upsert tasks concurrently
+         results = await asyncio.gather(*upsert_tasks, return_exceptions=True)
+
+         # Check for errors during upsert
+         upsert_errors = False
+         for idx, result in enumerate(results):
+             if isinstance(result, Exception):
+                 upsert_errors = True
+                 logger.error(
+                     f"Error upserting vector batch {idx + 1} to Pinecone for parent {parent_doc_id}: {result}"
+                 )  # Use logger.error
+                 # Decide on error handling: log, raise, etc. Consider cleanup?
+
+         if upsert_errors:
+             logger.warning(
+                 f"Some errors occurred during Pinecone vector upsert for {parent_doc_id}."
+             )
+             # Consider if partial success requires specific handling or cleanup
+
+         logger.info(f"Finished processing PDF {parent_doc_id}.")  # Use logger.info
+         return parent_doc_id
+
+     async def query(
+         self,
+         query_text: str,
+         filter: Optional[Dict[str, Any]] = None,
+         top_k: int = 5,
+         namespace: Optional[str] = None,
+         include_content: bool = True,
+         include_metadata: bool = True,
+     ) -> List[Dict[str, Any]]:  # pragma: no cover
+         """
+         Query the knowledge base using semantic search with OpenAI embeddings.
+         Retrieves chunk or document content and metadata from MongoDB based on Pinecone results.
+
+         Args:
+             query_text: The query text.
+             filter: Optional Pinecone metadata filter.
+             top_k: Number of results to retrieve initially.
+             namespace: Optional Pinecone namespace.
+             include_content: Whether to include document/chunk content in results.
+             include_metadata: Whether to include document/chunk metadata in results.
+
+         Returns:
+             List of result dictionaries, potentially reranked by PineconeAdapter.
+         """
+         # Determine initial K for Pinecone query, considering reranking multiplier
+         initial_k = top_k
+         if self.pinecone.use_reranking:
+             initial_k = top_k * self.pinecone.initial_query_top_k_multiplier
+
+         # --- Embed Query using OpenAIEmbedding ---
+         embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+         try:
+             query_vector = await embed_model.aget_query_embedding(query_text)
+         except Exception as e:
+             logger.error(
+                 f"Error embedding query text '{query_text}': {e}"
+             )  # Use logger.error
+             return []
+
+         # --- Query Pinecone using the vector ---
+         try:
+             # Use the generic query method with the vector
+             # PineconeAdapter handles reranking internally if configured
+             pinecone_results = await self.pinecone.query(
+                 vector=query_vector,
+                 filter=filter,
+                 top_k=initial_k,  # Fetch more initially if reranking
+                 namespace=namespace,
+                 include_values=False,  # Don't need embeddings back
+                 include_metadata=True,  # Need metadata for linking
+             )
+         except Exception as e:
+             logger.error(f"Error querying Pinecone: {e}")  # Use logger.error
+             return []
+
+         if not pinecone_results:
+             return []
+
+         # Extract IDs, scores, and metadata from Pinecone results
+         # PineconeAdapter might have already reranked and truncated to final top_k
+         result_ids = [
+             res["id"] for res in pinecone_results
+         ]  # These are chunk IDs or plain doc IDs
+         scores = {res["id"]: res["score"] for res in pinecone_results}
+         pinecone_metadatas = {
+             res["id"]: res.get("metadata", {}) for res in pinecone_results
+         }
+
+         # --- Fetch corresponding data from MongoDB ---
+         # We need:
+         # 1. Chunk documents (using result_ids where is_chunk is True)
+         # 2. Parent documents (using parent_document_id from chunk metadata)
+         # 3. Plain documents (using result_ids where is_chunk is False)
+         chunk_ids_to_fetch = set()
+         parent_ids_to_fetch = set()
+         plain_doc_ids_to_fetch = set()
+
+         for res_id in result_ids:
+             meta = pinecone_metadatas.get(res_id, {})
+             if meta.get("is_chunk"):
+                 chunk_ids_to_fetch.add(res_id)
+                 parent_id = meta.get("parent_document_id")
+                 if parent_id:
+                     parent_ids_to_fetch.add(parent_id)
+             else:
+                 plain_doc_ids_to_fetch.add(res_id)
+
+         # Fetch all required docs from Mongo in potentially fewer queries
+         mongo_docs_map = {}
+         ids_to_fetch_mongo = list(
+             chunk_ids_to_fetch | parent_ids_to_fetch | plain_doc_ids_to_fetch
+         )
+
+         if ids_to_fetch_mongo:
+             try:
+                 mongo_docs = self.mongo.find(
+                     self.collection, {"document_id": {"$in": ids_to_fetch_mongo}}
+                 )
+                 mongo_docs_map = {doc["document_id"]: doc for doc in mongo_docs}
+             except Exception as e:
+                 logger.error(
+                     f"Error fetching documents from MongoDB: {e}"
+                 )  # Use logger.error
+                 # Proceed with potentially missing Mongo data
+
+         # --- Combine Results ---
+         results = []
+         for res_id in result_ids:  # Iterate in Pinecone's score order
+             pinecone_meta = pinecone_metadatas.get(res_id, {})
+             is_chunk = pinecone_meta.get("is_chunk", False)
+             parent_doc_id = pinecone_meta.get("parent_document_id")
+
+             result = {
+                 "document_id": res_id,  # This is the chunk_id if is_chunk, else the doc_id
+                 "score": scores.get(res_id, 0.0),
+                 "is_chunk": is_chunk,
+                 "parent_document_id": parent_doc_id,  # Null if not a chunk
+             }
+
+             mongo_doc = mongo_docs_map.get(
+                 res_id
+             )  # Get the specific chunk or plain doc
+
+             # --- FIX: Skip result if corresponding Mongo doc not found ---
+             if not mongo_doc:
+                 logger.warning(
+                     f"Document/chunk {res_id} found in Pinecone but not in MongoDB. Skipping."
+                 )
+                 continue
+             # --- End FIX ---
+
+             if include_content:
+                 content = None
+                 # Priority 1: Reranking field in Pinecone metadata (holds chunk text if reranking)
+                 # Note: This might be redundant if we fetch from Mongo anyway, but keep for flexibility
+                 if (
+                     self.pinecone.use_reranking
+                     and self.pinecone.rerank_text_field in pinecone_meta
+                 ):
+                     content = pinecone_meta[self.pinecone.rerank_text_field]
+                 # Priority 2: Get content from the fetched Mongo doc (chunk or plain doc)
+                 elif mongo_doc:
+                     content = mongo_doc.get("content")
+                 result["content"] = content or ""
+
+             if include_metadata:
+                 combined_meta = {}
+                 # If it's a chunk, fetch the parent document's metadata
+                 if is_chunk and parent_doc_id:
+                     parent_mongo_doc = mongo_docs_map.get(parent_doc_id)
+                     if parent_mongo_doc:
+                         # Extract metadata from parent, excluding fields specific to parent/content
+                         combined_meta = {
+                             k: v
+                             for k, v in parent_mongo_doc.items()
+                             if k
+                             not in [
+                                 "_id",
+                                 "document_id",
+                                 "content",
+                                 "pdf_data",  # pdf_data removed anyway
+                                 "is_chunk",
+                                 "parent_document_id",
+                                 "created_at",
+                                 "updated_at",
+                                 "chunk_index",
+                             ]
+                         }
+                 # If it's a plain doc, fetch its own metadata
+                 elif not is_chunk and mongo_doc:
+                     combined_meta = {
+                         k: v
+                         for k, v in mongo_doc.items()
+                         if k
+                         not in [
+                             "_id",
+                             "document_id",
+                             "content",
+                             "pdf_data",
+                             "is_chunk",
+                             "parent_document_id",
+                             "created_at",
+                             "updated_at",
+                             "chunk_index",
+                         ]
+                     }
+
+                 # Add/overwrite with chunk-specific info from Pinecone meta (like chunk_index)
+                 # or specific metadata stored directly on the plain doc in Pinecone
+                 combined_meta.update(
+                     {
+                         k: v
+                         for k, v in pinecone_meta.items()
+                         # Avoid redundancy with already included fields or internal fields
+                         if k
+                         not in [
+                             "document_id",
+                             "parent_document_id",
+                             "is_chunk",
+                             self.pinecone.rerank_text_field,
+                         ]
+                     }
+                 )
+                 result["metadata"] = combined_meta
+
+             results.append(result)
+
+         return results
+
+     async def delete_document(
+         self, document_id: str, namespace: Optional[str] = None
+     ) -> bool:  # pragma: no cover
+         """
+         Delete a parent document (plain text or PDF) and all its associated chunks
+         from both MongoDB and Pinecone. Cannot delete a chunk directly.
+
+         Args:
+             document_id: ID of the parent document to delete.
+             namespace: Optional Pinecone namespace.
+
+         Returns:
+             True if deletion was successful in both stores (if applicable), False otherwise.
+         """
+         logger.info(
+             f"Attempting to delete document and associated data for ID: {document_id}"
+         )
+         mongo_delete_error = False
+         pinecone_delete_error = False
+         document_found = False  # Track if the initial ID exists
+
+         # --- 0. Check if the target ID is a chunk ---
+         try:
+             target_doc = self.mongo.find_one(
+                 self.collection,
+                 {"document_id": document_id},
+             )
+             if target_doc and target_doc.get("is_chunk"):
+                 logger.warning(
+                     f"Cannot delete chunk {document_id} directly. Delete the parent document."
+                 )
+                 return False  # Prevent deleting chunks directly
+             if target_doc:
+                 document_found = True
+         except Exception as e:  # pragma: no cover
+             logger.error(
+                 f"Error checking document type for {document_id} in MongoDB: {e}"
+             )  # pragma: no cover
+             return False  # pragma: no cover # Fail if we can't even check the type
+
+         if not document_found:
+             logger.warning(f"Document {document_id} not found for deletion.")
+             # Even if not found, attempt cleanup in Pinecone just in case of inconsistency
+             # but the overall result should be False as the primary doc wasn't there.
+             pass  # Continue to attempt Pinecone cleanup, but final result will be False
+
+         # --- 1. Find all associated document IDs in MongoDB ---
+         mongo_ids_to_delete = set()
+         pinecone_ids_to_delete = set()
+         try:
+             # Find parent doc and all chunk docs linked to it
+             # Use the ID confirmed not to be a chunk
+             docs_to_delete_mongo = list(
+                 self.mongo.find(
+                     self.collection,
+                     {
+                         "$or": [
+                             {"document_id": document_id},
+                             {"parent_document_id": document_id},
+                         ]
+                     },
+                 )
+             )
+             if docs_to_delete_mongo:
+                 document_found = True  # Confirm something was found related to the ID
+                 for doc in docs_to_delete_mongo:
+                     mongo_ids_to_delete.add(doc["document_id"])
+                     pinecone_ids_to_delete.add(doc["document_id"])
+             elif document_found:  # Parent existed but no chunks found (plain text doc)
+                 mongo_ids_to_delete.add(document_id)
+                 pinecone_ids_to_delete.add(document_id)
+             # If !document_found initially, sets remain empty unless fallback below happens
+
+         except Exception as e:
+             logger.warning(
+                 f"Error finding associated documents in MongoDB for deletion ({document_id}): {e}. Attempting Pinecone/Mongo deletion with main ID only."
+             )
+             # Fallback: try deleting the main ID from Pinecone/Mongo
+             if document_found:  # Only add if we confirmed the initial doc existed
+                 pinecone_ids_to_delete.add(document_id)
+                 mongo_ids_to_delete.add(document_id)
+
+         # Convert sets to lists for deletion methods
+         pinecone_ids_list = list(pinecone_ids_to_delete)
+         mongo_ids_list = list(mongo_ids_to_delete)
+
+         # If no IDs were found at all, and the initial doc wasn't found, return False
+         if not document_found and not mongo_ids_list and not pinecone_ids_list:
+             logger.info(f"No trace of document {document_id} found to delete.")
+             return False
+
+         # --- 2. Delete from Pinecone ---
+         if pinecone_ids_list:
+             try:
+                 await self.pinecone.delete(ids=pinecone_ids_list, namespace=namespace)
+                 logger.info(
+                     f"Attempted deletion of {len(pinecone_ids_list)} vectors from Pinecone for {document_id}."
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Error deleting vectors from Pinecone for {document_id} (IDs: {pinecone_ids_list}): {e}"
+                 )
+                 pinecone_delete_error = True  # Track error
+
+         # --- 3. Delete from MongoDB ---
+         mongo_deleted_count = 0
+         if mongo_ids_list:
+             try:
+                 delete_result = self.mongo.delete_many(
+                     self.collection, {"document_id": {"$in": mongo_ids_list}}
+                 )
+                 mongo_deleted_count = delete_result.deleted_count
+                 if mongo_deleted_count > 0:
+                     logger.info(
+                         f"Deleted {mongo_deleted_count} documents from MongoDB for {document_id}."
+                     )
+                 # else:  # No need to log if count is 0, covered by initial find log
+                 #     logger.info(f"No documents found to delete in MongoDB for {document_id} with IDs: {mongo_ids_list}")
+
+             except Exception as e:
+                 logger.error(
+                     f"Error deleting documents from MongoDB for {document_id} (IDs: {mongo_ids_list}): {e}"
+                 )
+                 mongo_delete_error = True  # Track error
+
+         # Return True only if the document was initially found and no errors occurred during deletion attempts
+         # If the document wasn't found initially, return False even if cleanup attempts were made.
+         return document_found and not mongo_delete_error and not pinecone_delete_error
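
For orientation, below is a minimal usage sketch of the KnowledgeBaseService added in this hunk. The method calls follow the signatures visible above; the pre-configured adapter instances are assumptions, since PineconeAdapter and MongoDBAdapter constructors live in other files of this diff (items 6 and 3) and their arguments are not shown here.

    import asyncio

    from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
    from solana_agent.adapters.pinecone_adapter import PineconeAdapter
    from solana_agent.services.knowledge_base import KnowledgeBaseService

    # Placeholders: assumed to be configured elsewhere; constructor
    # arguments are defined in the adapter modules, not in this hunk.
    pinecone_adapter: PineconeAdapter = ...
    mongo_adapter: MongoDBAdapter = ...


    async def main():
        kb = KnowledgeBaseService(
            pinecone_adapter=pinecone_adapter,
            mongodb_adapter=mongo_adapter,
            openai_api_key="sk-...",  # placeholder key
            openai_model_name="text-embedding-3-large",
        )

        # Plain text: one MongoDB document, one Pinecone vector.
        doc_id = await kb.add_document(
            text="Solana Agent pairs agents with tools and memory.",
            metadata={"source": "docs", "tags": ["overview"]},
        )

        # PDF: parent metadata plus per-chunk documents in MongoDB,
        # and one vector per semantic chunk in Pinecone.
        pdf_id = await kb.add_pdf_document(
            pdf_data="whitepaper.pdf",
            metadata={"source": "whitepaper", "tags": ["pdf"]},
        )

        # Semantic search; chunk hits carry parent_document_id and
        # inherit the parent document's metadata.
        for r in await kb.query("What is Solana Agent?", top_k=3):
            print(r["score"], r["document_id"], r.get("content", "")[:80])

        # Deleting the parent removes its chunks from both stores;
        # passing a chunk ID directly returns False.
        await kb.delete_document(pdf_id)


    asyncio.run(main())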