solana-agent 27.2.0__py3-none-any.whl → 27.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,771 @@
1
+ from datetime import datetime as dt
2
+ from typing import Dict, List, Any, Optional, Union
3
+ import uuid
4
+ import asyncio
5
+ import io
6
+
7
+ # PDF and Chunking Imports
8
+ import pypdf
9
+ from llama_index.core import Document as LlamaDocument
10
+ from llama_index.core.node_parser import SemanticSplitterNodeParser
11
+ from llama_index.embeddings.openai import OpenAIEmbedding
12
+
13
+ from solana_agent.adapters.pinecone_adapter import PineconeAdapter
14
+ from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
15
+ from solana_agent.interfaces.services.knowledge_base import KnowledgeBaseService as KnowledgeBaseInterface
16
+
17
+
18
+ class KnowledgeBaseService(KnowledgeBaseInterface):
19
+ """
20
+ Knowledge Base service using Pinecone for vector search and MongoDB for metadata/full document storage.
21
+ Supports text documents and PDF semantic chunking using OpenAI embeddings via LlamaIndex.
22
+ """
23
+
24
def __init__(
    self,
    pinecone_adapter: PineconeAdapter,
    mongodb_adapter: MongoDBAdapter,
    openai_api_key: str,
    openai_model_name: str = "text-embedding-3-large",
    collection_name: str = "knowledge_documents",
    rerank_results: bool = False,
    rerank_top_k: int = 3,
    # Semantic Splitter Config
    splitter_buffer_size: int = 1,
    splitter_breakpoint_percentile: int = 95,
):
    """
    Initialize the Knowledge Base service.

    Args:
        pinecone_adapter: Configured PineconeAdapter instance.
        mongodb_adapter: Configured MongoDBAdapter instance.
        openai_api_key: OpenAI API key for embedding. When empty, the
            OPENAI_API_KEY environment variable is used instead.
        openai_model_name: OpenAI embedding model name.
        collection_name: MongoDB collection for storing document metadata and full PDFs.
        rerank_results: Whether PineconeAdapter should rerank results.
        rerank_top_k: Number of results to return after reranking (used by PineconeAdapter).
        splitter_buffer_size: Buffer size for SemanticSplitterNodeParser.
        splitter_breakpoint_percentile: Breakpoint percentile for SemanticSplitterNodeParser.

    Raises:
        ValueError: If no API key is available, or if the model name is not
            one of the two known models and the Pinecone adapter exposes no
            positive ``embedding_dimensions``.
    """
    # Imported locally so this method does not depend on a module-level import.
    import os

    self.pinecone = pinecone_adapter
    self.mongo = mongodb_adapter
    self.collection = collection_name
    # Store rerank config for reference, but PineconeAdapter applies it
    self.rerank_results = rerank_results
    self.rerank_top_k = rerank_top_k

    # --- Initialize Semantic Splitter with OpenAIEmbedding ---
    # Honour the fallback that the error message below has always advertised.
    api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError(
            "OpenAI API key not provided via argument or OPENAI_API_KEY environment variable.")

    # Determine expected embedding dimensions based on model name
    known_dimensions = {
        "text-embedding-3-large": 3072,
        "text-embedding-3-small": 1536,
    }
    openai_dimensions = known_dimensions.get(openai_model_name)
    if openai_dimensions is None:
        # Fall back to the adapter's configured dimension. `or 0` guards
        # against the attribute being present but set to None.
        openai_dimensions = getattr(
            self.pinecone, 'embedding_dimensions', 0) or 0
        if openai_dimensions <= 0:
            raise ValueError(
                f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured.")
        print(
            f"Warning: Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct.")
    # Kept for reference; nothing in this method passes it on to the embedder.
    self.embedding_dimensions = openai_dimensions

    # Instantiate OpenAIEmbedding
    # Note: LlamaIndex OpenAIEmbedding doesn't directly support reducing dimensions via 'dimensions' param during init
    # like the raw OpenAI client does. It uses the model's default.
    try:
        llama_embed_model = OpenAIEmbedding(
            model=openai_model_name,
            api_key=api_key,
        )
    except Exception as e:
        print(f"Error initializing OpenAIEmbedding: {e}")
        raise

    self.semantic_splitter = SemanticSplitterNodeParser(
        buffer_size=splitter_buffer_size,
        breakpoint_percentile_threshold=splitter_breakpoint_percentile,
        embed_model=llama_embed_model,  # Shared by every embedding call in this service
    )
    # Store model name for logging/reference
    self.openai_model_name = openai_model_name
    # --- End Semantic Splitter Init ---

    self._ensure_collection()
102
+
103
def _ensure_collection(self) -> None:
    """Create the MongoDB collection when it is missing, then issue its index definitions."""
    store = self.mongo
    name = self.collection

    if not store.collection_exists(name):
        store.create_collection(name)
        print(f"Created MongoDB collection: {name}")

    # document_id is the lookup key used throughout the service, hence unique.
    store.create_index(name, [("document_id", 1)], unique=True)

    # Secondary indexes for retrieval and filtering, issued in a fixed order.
    secondary_indexes = (
        ("parent_document_id", 1),
        ("source", 1),
        ("created_at", -1),
        ("tags", 1),
        ("is_chunk", 1),
    )
    for field_name, direction in secondary_indexes:
        store.create_index(name, [(field_name, direction)])

    print(
        f"Ensured indexes exist for MongoDB collection: {name}")
119
+
120
async def add_document(
    self,
    text: str,
    metadata: Dict[str, Any],
    document_id: Optional[str] = None,
    namespace: Optional[str] = None
) -> str:
    """
    Add a plain text document to the knowledge base. Embeds using OpenAI.

    The text is embedded before anything is written, so an embedding failure
    leaves both stores untouched. If the Pinecone upsert fails after the
    MongoDB insert, the MongoDB record is removed again (best effort) so the
    same document ID can be retried.

    Args:
        text: Document text content.
        metadata: Document metadata. Keys that collide with the service's own
            fields (document_id, content, is_chunk, parent_document_id) are
            ignored in favour of the service's values.
        document_id: Optional document ID.
        namespace: Optional Pinecone namespace.

    Returns:
        The document ID.

    Raises:
        Exception: Whatever the embedding model, MongoDB adapter or Pinecone
            adapter raised; the error is printed and re-raised.
    """
    metadata = metadata or {}
    doc_id = document_id or str(uuid.uuid4())

    # Embed first: this step has no side effects, so failing here is clean.
    embed_model = self.semantic_splitter.embed_model
    try:
        embedding = await embed_model.aget_text_embedding(text)
    except Exception as e:
        print(
            f"Error embedding document {doc_id} using {self.openai_model_name}: {e}")
        raise

    # One timezone-aware timestamp so created_at and updated_at agree.
    now = dt.now().astimezone()
    core_fields = {
        "document_id": doc_id,
        "content": text,
        "is_chunk": False,
        "parent_document_id": None,
    }
    # Core fields are spread twice on purpose: the first spread fixes their
    # position in the document, the second makes them win over user metadata.
    mongo_doc = {
        **core_fields,
        **metadata,
        **core_fields,
        "created_at": metadata.get("created_at", now),
        "updated_at": now,
    }
    try:
        self.mongo.insert_one(self.collection, mongo_doc)
    except Exception as e:
        print(f"Error inserting document {doc_id} into MongoDB: {e}")
        raise

    # Prepare Pinecone metadata
    pinecone_metadata = {
        "document_id": doc_id,
        "is_chunk": False,
        "source": metadata.get("source", "unknown"),
        "tags": metadata.get("tags", [])
    }
    # Add text itself if Pinecone adapter reranking is used
    if self.pinecone.use_reranking:
        pinecone_metadata[self.pinecone.rerank_text_field] = text

    # Upsert vector to Pinecone using the generic upsert method
    try:
        await self.pinecone.upsert(
            vectors=[{"id": doc_id, "values": embedding,
                      "metadata": pinecone_metadata}],
            namespace=namespace
        )
    except Exception as e:
        print(f"Error upserting vector for {doc_id} to Pinecone: {e}")
        # Undo the insert made above so MongoDB does not keep a record that
        # has no vector. Only reached after our own insert succeeded.
        try:
            self.mongo.delete_many(self.collection, {"document_id": doc_id})
        except Exception as rollback_error:
            print(
                f"Error rolling back MongoDB document {doc_id}: {rollback_error}")
        raise

    return doc_id
192
+
193
async def add_pdf_document(
    self,
    pdf_data: Union[bytes, str],
    metadata: Dict[str, Any],
    document_id: Optional[str] = None,
    namespace: Optional[str] = None,
    chunk_batch_size: int = 50
) -> str:
    """
    Add a PDF document, performs semantic chunking using OpenAI embeddings,
    stores full PDF in Mongo, and chunk vectors in Pinecone.

    Args:
        pdf_data: PDF content as bytes (or bytearray) or a path to the PDF file.
        metadata: Document metadata (applies to the parent PDF). Keys that
            collide with the service's own fields are ignored in favour of
            the service's values.
        document_id: Optional parent document ID.
        namespace: Optional Pinecone namespace for chunks.
        chunk_batch_size: Batch size for upserting chunk vectors; must be >= 1.

    Returns:
        The parent document ID.

    Raises:
        ValueError: If chunk_batch_size is not positive or pdf_data has an
            unsupported type.
        Exception: Errors from reading/parsing the PDF, the MongoDB insert,
            chunking or embedding are printed and re-raised. Failures of
            individual Pinecone upsert batches are printed but not raised.
    """
    # Reject a bad batch size up front: zero would fail only after the Mongo
    # insert and the embedding call, and a negative value would silently
    # upsert nothing.
    if chunk_batch_size < 1:
        raise ValueError("chunk_batch_size must be a positive integer.")

    metadata = metadata or {}
    parent_doc_id = document_id or str(uuid.uuid4())
    pdf_bytes: bytes

    # --- 1. Read PDF and Extract Text ---
    try:
        if isinstance(pdf_data, str):
            with open(pdf_data, "rb") as f:
                pdf_bytes = f.read()
        elif isinstance(pdf_data, (bytes, bytearray)):
            pdf_bytes = bytes(pdf_data)
        else:
            raise ValueError(
                "pdf_data must be bytes or a file path string.")

        reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
        # Join pages with a newline so the last word of one page does not
        # fuse with the first word of the next.
        extracted_text = "\n".join(
            page.extract_text() or "" for page in reader.pages)
        has_text = bool(extracted_text.strip())
        if not has_text:
            print(f"Warning: No text extracted from PDF {parent_doc_id}.")
    except Exception as e:
        print(
            f"Error reading or extracting text from PDF {parent_doc_id}: {e}")
        raise

    # --- 2. Store Full PDF and Metadata in MongoDB ---
    now = dt.now().astimezone()  # single timezone-aware timestamp
    core_fields = {
        "document_id": parent_doc_id,
        "content": extracted_text,
        "pdf_data": pdf_bytes,
        "is_chunk": False,
        "parent_document_id": None,
    }
    # Core fields are spread twice: once to fix their position, once so they
    # win over any colliding metadata keys.
    mongo_parent_doc = {
        **core_fields,
        **metadata,
        **core_fields,
        "created_at": metadata.get("created_at", now),
        "updated_at": now,
    }
    try:
        self.mongo.insert_one(self.collection, mongo_parent_doc)
        print(f"Stored full PDF {parent_doc_id} in MongoDB.")
    except Exception as e:
        print(
            f"Error inserting parent PDF {parent_doc_id} into MongoDB: {e}")
        raise

    # --- 3. Semantic Chunking ---
    if not has_text:
        print(
            f"Skipping chunking for PDF {parent_doc_id} due to no extracted text.")
        return parent_doc_id

    try:
        llama_doc = LlamaDocument(text=extracted_text)
        # Run synchronous splitter in thread pool
        nodes = await asyncio.to_thread(
            self.semantic_splitter.get_nodes_from_documents,
            [llama_doc]
        )
        print(
            f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}.")
    except Exception as e:
        print(
            f"Error during semantic chunking for PDF {parent_doc_id}: {e}")
        raise

    # --- 4. Embed Chunks and Batch Upsert to Pinecone ---
    if not nodes:
        return parent_doc_id  # No chunks generated

    print(
        f"Embedding {len(nodes)} chunks using {self.openai_model_name}...")
    chunk_texts = [node.get_content() for node in nodes]
    embed_model = self.semantic_splitter.embed_model

    try:
        all_chunk_embeddings = await embed_model.aget_text_embedding_batch(
            chunk_texts, show_progress=True
        )
    except Exception as e:
        print(f"Error embedding chunks for PDF {parent_doc_id}: {e}")
        raise  # Stop if embedding fails

    print("Embedding complete. Preparing vectors for Pinecone.")
    source = metadata.get("source", "unknown")
    tags = metadata.get("tags", [])
    pinecone_vectors = []
    for i, chunk_text in enumerate(chunk_texts):
        chunk_id = f"{parent_doc_id}_chunk_{i}"
        chunk_metadata = {
            "document_id": chunk_id,
            "parent_document_id": parent_doc_id,
            "chunk_index": i,
            "is_chunk": True,
            "source": source,
            "tags": tags
        }
        # Add chunk text itself if Pinecone adapter reranking is used
        if self.pinecone.use_reranking:
            chunk_metadata[self.pinecone.rerank_text_field] = chunk_text

        pinecone_vectors.append({
            "id": chunk_id,
            "values": all_chunk_embeddings[i],
            "metadata": chunk_metadata
        })

    # Upsert vectors in batches using the generic upsert method
    print(
        f"Upserting {len(pinecone_vectors)} vectors to Pinecone in batches of {chunk_batch_size}...")
    upsert_tasks = [
        self.pinecone.upsert(
            vectors=pinecone_vectors[start:start + chunk_batch_size],
            namespace=namespace
        )
        for start in range(0, len(pinecone_vectors), chunk_batch_size)
    ]

    # Run upsert tasks concurrently; collect failures instead of aborting
    results = await asyncio.gather(*upsert_tasks, return_exceptions=True)

    # Upsert failures are reported but deliberately not raised (best effort).
    for idx, result in enumerate(results):
        if isinstance(result, Exception):
            print(
                f"Error upserting vector batch {idx + 1} to Pinecone: {result}")

    print(f"Finished processing PDF {parent_doc_id}.")
    return parent_doc_id
350
+
351
async def query(
    self,
    query_text: str,
    filter: Optional[Dict[str, Any]] = None,
    top_k: int = 5,
    namespace: Optional[str] = None,
    include_content: bool = True,
    include_metadata: bool = True
) -> List[Dict[str, Any]]:
    """
    Run a semantic search: embed the query, search Pinecone, then enrich
    each match with data from MongoDB.

    Args:
        query_text: The query text.
        filter: Optional Pinecone metadata filter.
        top_k: Number of results to retrieve initially (multiplied by the
            adapter's multiplier when reranking is enabled).
        namespace: Optional Pinecone namespace.
        include_content: Whether to add a "content" entry to each result.
        include_metadata: Whether to add a "metadata" entry to each result.

    Returns:
        One dictionary per Pinecone match, in the order Pinecone returned
        them. An empty list is returned when embedding or the Pinecone query
        fails, or when there are no matches.
    """
    # Ask for more candidates when the adapter is going to rerank them.
    if self.pinecone.use_reranking:
        fetch_k = top_k * self.pinecone.initial_query_top_k_multiplier
    else:
        fetch_k = top_k

    # --- Embed the query ---
    embedder = self.semantic_splitter.embed_model
    try:
        vector = await embedder.aget_query_embedding(query_text)
    except Exception as e:
        print(f"Error embedding query text '{query_text}': {e}")
        return []

    # --- Vector search (the adapter applies reranking itself if configured) ---
    try:
        matches = await self.pinecone.query(
            vector=vector,
            filter=filter,
            top_k=fetch_k,
            namespace=namespace,
            include_values=False,   # embeddings are not needed in the response
            include_metadata=True   # metadata links matches to Mongo documents
        )
    except Exception as e:
        print(f"Error querying Pinecone: {e}")
        return []

    if not matches:
        return []

    # Keep Pinecone's ordering; index scores and metadata by match id.
    ordered_ids = [match['id'] for match in matches]
    score_by_id = {match['id']: match['score'] for match in matches}
    meta_by_id = {match['id']: match.get('metadata', {})
                  for match in matches}

    # --- Work out which Mongo documents back the matches ---
    lookup_ids = set()
    for match_id in ordered_ids:
        match_meta = meta_by_id.get(match_id, {})
        if not match_meta.get("is_chunk"):
            # A non-chunk match may have its own Mongo record.
            lookup_ids.add(match_id)
            continue
        owner_id = match_meta.get("parent_document_id")
        if owner_id:
            lookup_ids.add(owner_id)

    docs_by_id = {}
    if lookup_ids:
        try:
            found = self.mongo.find(
                self.collection,
                {"document_id": {"$in": list(lookup_ids)}}
            )
            docs_by_id = {record["document_id"]: record for record in found}
        except Exception as e:
            print(f"Error fetching documents from MongoDB: {e}")
            # Carry on; results simply lack Mongo-derived fields.

    # --- Assemble the response ---
    assembled = []
    for match_id in ordered_ids:
        match_meta = meta_by_id.get(match_id, {})
        chunk_flag = match_meta.get("is_chunk", False)
        owner_id = match_meta.get("parent_document_id")

        # Chunks borrow content and metadata from their parent's record;
        # everything else uses the record stored under its own id.
        if chunk_flag and owner_id:
            backing_doc = docs_by_id.get(owner_id)
        else:
            backing_doc = docs_by_id.get(match_id)

        entry = {
            "document_id": match_id,
            "score": score_by_id.get(match_id, 0.0),
            "is_chunk": chunk_flag,
            "parent_document_id": owner_id,
        }

        if include_content:
            # Prefer the rerank text carried in Pinecone metadata, then Mongo.
            if self.pinecone.use_reranking and self.pinecone.rerank_text_field in match_meta:
                body = match_meta[self.pinecone.rerank_text_field]
            elif backing_doc:
                body = backing_doc.get("content")
            else:
                body = None
            entry["content"] = body or ""

        if include_metadata:
            merged = {}
            if backing_doc:
                for key, value in backing_doc.items():
                    if key not in ("_id", "document_id", "content", "pdf_data", "is_chunk", "parent_document_id", "created_at", "updated_at"):
                        merged[key] = value
            # Pinecone metadata overrides Mongo values on key collisions.
            for key, value in match_meta.items():
                if key not in ("document_id", self.pinecone.rerank_text_field):
                    merged[key] = value
            entry["metadata"] = merged

        assembled.append(entry)

    return assembled
491
+
492
async def delete_document(
    self,
    document_id: str,
    namespace: Optional[str] = None
) -> bool:
    """
    Delete a document (plain text or PDF) and the records linked to it.

    MongoDB records are matched by ``document_id`` or ``parent_document_id``.
    Their IDs, plus ``document_id`` itself, are the vector IDs deleted from
    Pinecone. The MongoDB delete uses the same match filter directly, so it
    still runs when the preliminary lookup fails.

    Args:
        document_id: ID of the parent document (or plain text document).
        namespace: Optional Pinecone namespace.

    Returns:
        True if the Pinecone delete call succeeded or at least one MongoDB
        record was removed; False otherwise. Errors are printed, not raised.
    """
    print(
        f"Attempting to delete document and associated data for ID: {document_id}")

    # The document itself plus anything that names it as its parent.
    mongo_filter = {"$or": [{"document_id": document_id},
                            {"parent_document_id": document_id}]}

    # --- 1. Collect the vector IDs to remove ---
    # NOTE(review): add_pdf_document in this file writes chunk vectors
    # (ids "<parent>_chunk_<i>") to Pinecone only and stores no chunk records
    # in MongoDB, so those vectors are not discovered here. Removing them
    # needs an adapter capability not visible in this file - confirm.
    pinecone_ids_to_delete = {document_id}
    try:
        for doc in self.mongo.find(self.collection, mongo_filter):
            pinecone_ids_to_delete.add(doc["document_id"])
    except Exception as e:
        print(
            f"Warning: Error finding documents in MongoDB for deletion ({document_id}): {e}. Proceeding with main ID only.")

    # --- 2. Delete from Pinecone ---
    pinecone_deleted = False
    try:
        await self.pinecone.delete(ids=list(pinecone_ids_to_delete), namespace=namespace)
        print(
            f"Deleted {len(pinecone_ids_to_delete)} vectors from Pinecone for parent {document_id}.")
        pinecone_deleted = True
    except Exception as e:
        print(
            f"Error deleting vectors from Pinecone for {document_id}: {e}")

    # --- 3. Delete from MongoDB ---
    # Delete by filter, not by the IDs found above: a failed lookup must
    # not leave the MongoDB record behind once its vectors are gone.
    mongo_deleted_count = 0
    try:
        delete_result = self.mongo.delete_many(self.collection, mongo_filter)
        mongo_deleted_count = delete_result.deleted_count
        print(
            f"Deleted {mongo_deleted_count} documents from MongoDB for parent {document_id}.")
    except Exception as e:
        print(
            f"Error deleting documents from MongoDB for {document_id}: {e}")

    return pinecone_deleted or mongo_deleted_count > 0
562
+
563
async def update_document(
    self,
    document_id: str,
    text: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    namespace: Optional[str] = None
) -> bool:
    """
    Update an existing plain text document or metadata. Embeds using OpenAI.
    Updating PDF content requires deleting and re-adding.

    New text is embedded before MongoDB is touched, so an embedding failure
    leaves the stored document unchanged. The Pinecone vector is only
    re-upserted when the text changes.

    Args:
        document_id: ID of document to update.
        text: Optional new text content (for plain text docs only).
        metadata: Optional metadata to update. Keys owned by the service
            (_id, document_id, content, pdf_data, is_chunk,
            parent_document_id) are ignored.
        namespace: Optional Pinecone namespace.

    Returns:
        True if MongoDB reported a modification or the Pinecone upsert
        succeeded; False if the document is missing, is a chunk, is a PDF
        being given new text, or if embedding or the MongoDB update failed.
    """
    current_doc = self.mongo.find_one(
        self.collection, {"document_id": document_id})
    if not current_doc:
        print(f"Document {document_id} not found for update.")
        return False

    if current_doc.get("is_chunk"):
        print(f"Cannot update chunk {document_id} directly.")
        return False
    if current_doc.get("pdf_data") and text is not None:
        print(f"Cannot update PDF content via this method. Delete and re-add.")
        return False

    # PDFs with new text were rejected above, so any text given here applies.
    update_text = text is not None
    text_content = text if update_text else current_doc.get("content", "")

    # --- Embed first (no side effects) ---
    embedding = None
    if update_text:
        embed_model = self.semantic_splitter.embed_model
        try:
            embedding = await embed_model.aget_text_embedding(text_content)
        except Exception as e:
            print(f"Error embedding updated text for {document_id}: {e}")
            # Nothing has been written yet, so both stores still agree.
            return False

    # --- Update MongoDB ---
    # Identity and content fields may only change through dedicated
    # arguments; letting metadata overwrite them would break the link
    # between the MongoDB record and its vector.
    protected_keys = {"_id", "document_id", "content", "pdf_data",
                      "is_chunk", "parent_document_id"}
    mongo_update = {
        key: value for key, value in (metadata or {}).items()
        if key not in protected_keys
    }
    if update_text:
        mongo_update["content"] = text_content
    mongo_update["updated_at"] = dt.now().astimezone()

    try:
        update_result = self.mongo.update_one(
            self.collection, {"document_id": document_id}, {
                "$set": mongo_update}
        )
        mongo_updated = update_result.modified_count > 0
    except Exception as e:
        print(f"Error updating document {document_id} in MongoDB: {e}")
        return False  # Return False if Mongo update fails

    # --- Update Pinecone (only if text changed) ---
    pinecone_updated = False
    if update_text:
        def _storable(value: Any) -> bool:
            # Pinecone metadata accepts strings, numbers, booleans and lists
            # of strings; other values (None, dicts, datetimes, ...) are
            # left out so they cannot cause the upsert to be rejected.
            if isinstance(value, (str, bool, int, float)):
                return True
            return isinstance(value, list) and all(
                isinstance(item, str) for item in value)

        excluded_keys = {"_id", "content", "pdf_data", "created_at",
                         "updated_at", "document_id", "is_chunk",
                         "parent_document_id"}
        final_metadata = {**current_doc, **mongo_update}  # Use updated data
        pinecone_metadata = {"document_id": document_id, "is_chunk": False}
        for key, value in final_metadata.items():
            if key not in excluded_keys and _storable(value):
                pinecone_metadata[key] = value
        if self.pinecone.use_reranking:
            pinecone_metadata[self.pinecone.rerank_text_field] = text_content

        # Upsert vector to Pinecone
        try:
            await self.pinecone.upsert(
                vectors=[{"id": document_id, "values": embedding,
                          "metadata": pinecone_metadata}],
                namespace=namespace
            )
            pinecone_updated = True
        except Exception as e:
            print(
                f"Error upserting updated vector in Pinecone for {document_id}: {e}")
            # Mongo update succeeded, Pinecone failed

    return mongo_updated or pinecone_updated
656
+
657
async def add_documents_batch(
    self,
    # Expects {'text': ..., 'metadata': ...}
    documents: List[Dict[str, Any]],
    namespace: Optional[str] = None,
    batch_size: int = 50
) -> List[str]:
    """
    Add multiple plain text documents in batches using OpenAI embeddings.

    Each batch is embedded, inserted into MongoDB and upserted to Pinecone,
    in that order. A batch that fails at any step is reported and skipped;
    if the Pinecone upsert fails, the batch's MongoDB records are removed
    again (best effort). Only IDs of fully stored batches are returned.

    Args:
        documents: List of documents, each with 'text' and optionally
            'metadata'. A 'document_id' inside the metadata is used as the
            document's ID; otherwise one is generated.
        namespace: Optional Pinecone namespace.
        batch_size: Number of documents per embedding/upsert batch; must be >= 1.

    Returns:
        List of document IDs that were stored in both MongoDB and Pinecone.

    Raises:
        ValueError: If batch_size is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    added_doc_ids: List[str] = []
    embed_model = self.semantic_splitter.embed_model

    for start in range(0, len(documents), batch_size):
        batch_number = start // batch_size + 1
        batch_docs_input = documents[start:start + batch_size]
        batch_texts = [doc['text'] for doc in batch_docs_input]
        # Tolerate documents submitted without a metadata dictionary.
        batch_metadatas = [doc.get('metadata') or {}
                           for doc in batch_docs_input]
        # Generate IDs if not provided in metadata
        batch_doc_ids = [meta.get('document_id') or str(uuid.uuid4())
                         for meta in batch_metadatas]

        # Embed first: a failure here leaves nothing to clean up.
        try:
            batch_embeddings = await embed_model.aget_text_embedding_batch(batch_texts, show_progress=True)
        except Exception as e:
            print(
                f"Error embedding batch {batch_number} using {self.openai_model_name}: {e}")
            continue  # Skip to next batch

        # Prepare MongoDB docs
        now = dt.now().astimezone()  # one timezone-aware timestamp per batch
        mongo_batch = []
        for doc_id, text, metadata in zip(batch_doc_ids, batch_texts, batch_metadatas):
            core_fields = {
                "document_id": doc_id, "content": text, "is_chunk": False,
                "parent_document_id": None,
            }
            # Core fields are spread twice so they keep their position and
            # override colliding metadata keys (e.g. an empty document_id).
            mongo_batch.append({
                **core_fields, **metadata, **core_fields,
                "created_at": metadata.get("created_at", now),
                "updated_at": now,
            })

        # Insert into MongoDB
        try:
            self.mongo.insert_many(self.collection, mongo_batch)
        except Exception as e:
            print(
                f"Error inserting batch {batch_number} into MongoDB: {e}")
            continue  # Skip to next batch

        # Prepare Pinecone vectors
        pinecone_vectors = []
        for idx, doc_id in enumerate(batch_doc_ids):
            metadata = batch_metadatas[idx]
            pinecone_meta = {
                "document_id": doc_id, "is_chunk": False,
                "source": metadata.get("source", "unknown"),
                "tags": metadata.get("tags", [])
            }
            if self.pinecone.use_reranking:
                pinecone_meta[self.pinecone.rerank_text_field] = batch_texts[idx]

            pinecone_vectors.append({
                "id": doc_id,
                "values": batch_embeddings[idx],
                "metadata": pinecone_meta
            })

        # Upsert vectors to Pinecone
        try:
            await self.pinecone.upsert(
                vectors=pinecone_vectors,
                namespace=namespace
            )
        except Exception as e:
            print(
                f"Error upserting vector batch {batch_number} to Pinecone: {e}")
            # The whole batch was inserted by the call above, so removing
            # these IDs only undoes this batch's own writes.
            try:
                self.mongo.delete_many(
                    self.collection, {"document_id": {"$in": batch_doc_ids}})
            except Exception as rollback_error:
                print(
                    f"Error rolling back MongoDB batch {batch_number}: {rollback_error}")
        else:
            added_doc_ids.extend(batch_doc_ids)

        # Optional delay
        if start + batch_size < len(documents):
            await asyncio.sleep(0.1)

    return added_doc_ids
755
+
756
async def get_full_document(self, document_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up a single MongoDB record by its document ID.

    Args:
        document_id: The ID of the document (parent ID if it was a PDF).

    Returns:
        Whatever the MongoDB adapter's find_one returns for that ID, or None
        when the lookup raises (the error is printed).
    """
    lookup = {"document_id": document_id}
    try:
        record = self.mongo.find_one(self.collection, lookup)
    except Exception as e:
        print(
            f"Error retrieving full document {document_id} from MongoDB: {e}")
        return None
    return record