solana-agent 27.3.5__py3-none-any.whl → 27.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. solana_agent/__init__.py +1 -3
  2. solana_agent/adapters/mongodb_adapter.py +5 -2
  3. solana_agent/adapters/openai_adapter.py +32 -27
  4. solana_agent/adapters/pinecone_adapter.py +91 -63
  5. solana_agent/client/solana_agent.py +38 -23
  6. solana_agent/domains/agent.py +7 -13
  7. solana_agent/domains/routing.py +5 -5
  8. solana_agent/factories/agent_factory.py +49 -34
  9. solana_agent/interfaces/client/client.py +22 -13
  10. solana_agent/interfaces/plugins/plugins.py +2 -1
  11. solana_agent/interfaces/providers/data_storage.py +9 -2
  12. solana_agent/interfaces/providers/llm.py +26 -12
  13. solana_agent/interfaces/providers/memory.py +1 -1
  14. solana_agent/interfaces/providers/vector_storage.py +3 -9
  15. solana_agent/interfaces/services/agent.py +21 -6
  16. solana_agent/interfaces/services/knowledge_base.py +6 -8
  17. solana_agent/interfaces/services/query.py +16 -5
  18. solana_agent/interfaces/services/routing.py +0 -1
  19. solana_agent/plugins/manager.py +14 -9
  20. solana_agent/plugins/registry.py +13 -11
  21. solana_agent/plugins/tools/__init__.py +0 -5
  22. solana_agent/plugins/tools/auto_tool.py +1 -0
  23. solana_agent/repositories/memory.py +20 -22
  24. solana_agent/services/__init__.py +1 -1
  25. solana_agent/services/agent.py +119 -89
  26. solana_agent/services/knowledge_base.py +182 -131
  27. solana_agent/services/query.py +48 -24
  28. solana_agent/services/routing.py +30 -18
  29. {solana_agent-27.3.5.dist-info → solana_agent-27.3.7.dist-info}/METADATA +6 -3
  30. solana_agent-27.3.7.dist-info/RECORD +39 -0
  31. solana_agent-27.3.5.dist-info/RECORD +0 -39
  32. {solana_agent-27.3.5.dist-info → solana_agent-27.3.7.dist-info}/LICENSE +0 -0
  33. {solana_agent-27.3.5.dist-info → solana_agent-27.3.7.dist-info}/WHEEL +0 -0
@@ -12,7 +12,9 @@ from llama_index.embeddings.openai import OpenAIEmbedding
12
12
 
13
13
  from solana_agent.adapters.pinecone_adapter import PineconeAdapter
14
14
  from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
15
- from solana_agent.interfaces.services.knowledge_base import KnowledgeBaseService as KnowledgeBaseInterface
15
+ from solana_agent.interfaces.services.knowledge_base import (
16
+ KnowledgeBaseService as KnowledgeBaseInterface,
17
+ )
16
18
 
17
19
 
18
20
  class KnowledgeBaseService(KnowledgeBaseInterface):
@@ -59,7 +61,8 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
59
61
  api_key = openai_api_key
60
62
  if not api_key:
61
63
  raise ValueError(
62
- "OpenAI API key not provided via argument or OPENAI_API_KEY environment variable.")
64
+ "OpenAI API key not provided via argument or OPENAI_API_KEY environment variable."
65
+ )
63
66
 
64
67
  # Determine expected embedding dimensions based on model name
65
68
  if openai_model_name == "text-embedding-3-large":
@@ -68,13 +71,14 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
68
71
  openai_dimensions = 1536
69
72
  else:
70
73
  # Attempt to get dimension from Pinecone config if available, otherwise raise error
71
- openai_dimensions = getattr(
72
- self.pinecone, 'embedding_dimensions', 0)
74
+ openai_dimensions = getattr(self.pinecone, "embedding_dimensions", 0)
73
75
  if openai_dimensions <= 0:
74
76
  raise ValueError(
75
- f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured.")
77
+ f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured."
78
+ )
76
79
  print(
77
- f"Warning: Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct.")
80
+ f"Warning: Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct."
81
+ )
78
82
 
79
83
  # Instantiate OpenAIEmbedding
80
84
  # Note: LlamaIndex OpenAIEmbedding doesn't directly support reducing dimensions via 'dimensions' param during init
@@ -92,7 +96,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
92
96
  self.semantic_splitter = SemanticSplitterNodeParser(
93
97
  buffer_size=splitter_buffer_size,
94
98
  breakpoint_percentile_threshold=splitter_breakpoint_percentile,
95
- embed_model=llama_embed_model # Use the OpenAIEmbedding instance
99
+ embed_model=llama_embed_model, # Use the OpenAIEmbedding instance
96
100
  )
97
101
  # Store model name for logging/reference
98
102
  self.openai_model_name = openai_model_name
@@ -107,22 +111,20 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
107
111
  print(f"Created MongoDB collection: {self.collection}")
108
112
 
109
113
  # Indexes for retrieval and filtering
110
- self.mongo.create_index(
111
- self.collection, [("document_id", 1)], unique=True)
114
+ self.mongo.create_index(self.collection, [("document_id", 1)], unique=True)
112
115
  self.mongo.create_index(self.collection, [("parent_document_id", 1)])
113
116
  self.mongo.create_index(self.collection, [("source", 1)])
114
117
  self.mongo.create_index(self.collection, [("created_at", -1)])
115
118
  self.mongo.create_index(self.collection, [("tags", 1)])
116
119
  self.mongo.create_index(self.collection, [("is_chunk", 1)])
117
- print(
118
- f"Ensured indexes exist for MongoDB collection: {self.collection}")
120
+ print(f"Ensured indexes exist for MongoDB collection: {self.collection}")
119
121
 
120
122
  async def add_document(
121
123
  self,
122
124
  text: str,
123
125
  metadata: Dict[str, Any],
124
126
  document_id: Optional[str] = None,
125
- namespace: Optional[str] = None
127
+ namespace: Optional[str] = None,
126
128
  ) -> str:
127
129
  """
128
130
  Add a plain text document to the knowledge base. Embeds using OpenAI.
@@ -146,8 +148,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
146
148
  "parent_document_id": None,
147
149
  **metadata,
148
150
  # Use timezone aware datetime
149
- "created_at": metadata.get("created_at", dt.now(tz=dt.now().astimezone().tzinfo)),
150
- "updated_at": dt.now(tz=dt.now().astimezone().tzinfo)
151
+ "created_at": metadata.get(
152
+ "created_at", dt.now(tz=dt.now().astimezone().tzinfo)
153
+ ),
154
+ "updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
151
155
  }
152
156
  try:
153
157
  self.mongo.insert_one(self.collection, mongo_doc)
@@ -161,7 +165,8 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
161
165
  embedding = await embed_model.aget_text_embedding(text)
162
166
  except Exception as e:
163
167
  print(
164
- f"Error embedding document {doc_id} using {self.openai_model_name}: {e}")
168
+ f"Error embedding document {doc_id} using {self.openai_model_name}: {e}"
169
+ )
165
170
  # Decide how to handle - Mongo insert succeeded, embedding failed
166
171
  raise # Re-raise for now
167
172
 
@@ -170,7 +175,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
170
175
  "document_id": doc_id,
171
176
  "is_chunk": False,
172
177
  "source": metadata.get("source", "unknown"),
173
- "tags": metadata.get("tags", [])
178
+ "tags": metadata.get("tags", []),
174
179
  }
175
180
  # Add text itself if Pinecone adapter reranking is used
176
181
  if self.pinecone.use_reranking:
@@ -179,9 +184,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
179
184
  # Upsert vector to Pinecone using the generic upsert method
180
185
  try:
181
186
  await self.pinecone.upsert(
182
- vectors=[{"id": doc_id, "values": embedding,
183
- "metadata": pinecone_metadata}],
184
- namespace=namespace
187
+ vectors=[
188
+ {"id": doc_id, "values": embedding, "metadata": pinecone_metadata}
189
+ ],
190
+ namespace=namespace,
185
191
  )
186
192
  except Exception as e:
187
193
  print(f"Error upserting vector for {doc_id} to Pinecone: {e}")
@@ -196,7 +202,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
196
202
  metadata: Dict[str, Any],
197
203
  document_id: Optional[str] = None,
198
204
  namespace: Optional[str] = None,
199
- chunk_batch_size: int = 50
205
+ chunk_batch_size: int = 50,
200
206
  ) -> str:
201
207
  """
202
208
  Add a PDF document, performs semantic chunking using OpenAI embeddings,
@@ -223,17 +229,14 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
223
229
  elif isinstance(pdf_data, bytes):
224
230
  pdf_bytes = pdf_data
225
231
  else:
226
- raise ValueError(
227
- "pdf_data must be bytes or a file path string.")
232
+ raise ValueError("pdf_data must be bytes or a file path string.")
228
233
 
229
234
  reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
230
- extracted_text = "".join(
231
- page.extract_text() or "" for page in reader.pages)
235
+ extracted_text = "".join(page.extract_text() or "" for page in reader.pages)
232
236
  if not extracted_text.strip():
233
237
  print(f"Warning: No text extracted from PDF {parent_doc_id}.")
234
238
  except Exception as e:
235
- print(
236
- f"Error reading or extracting text from PDF {parent_doc_id}: {e}")
239
+ print(f"Error reading or extracting text from PDF {parent_doc_id}: {e}")
237
240
  raise
238
241
 
239
242
  # --- 2. Store Full PDF and Metadata in MongoDB ---
@@ -244,43 +247,43 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
244
247
  "is_chunk": False,
245
248
  "parent_document_id": None,
246
249
  **metadata,
247
- "created_at": metadata.get("created_at", dt.now(tz=dt.now().astimezone().tzinfo)),
248
- "updated_at": dt.now(tz=dt.now().astimezone().tzinfo)
250
+ "created_at": metadata.get(
251
+ "created_at", dt.now(tz=dt.now().astimezone().tzinfo)
252
+ ),
253
+ "updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
249
254
  }
250
255
  try:
251
256
  self.mongo.insert_one(self.collection, mongo_parent_doc)
252
257
  print(f"Stored full PDF {parent_doc_id} in MongoDB.")
253
258
  except Exception as e: # pragma: no cover
254
259
  print(
255
- f"Error inserting parent PDF {parent_doc_id} into MongoDB: {e}") # pragma: no cover
260
+ f"Error inserting parent PDF {parent_doc_id} into MongoDB: {e}"
261
+ ) # pragma: no cover
256
262
  raise # pragma: no cover
257
263
 
258
264
  # --- 3. Semantic Chunking ---
259
265
  if not extracted_text.strip():
260
266
  print(
261
- f"Skipping chunking for PDF {parent_doc_id} due to no extracted text.")
267
+ f"Skipping chunking for PDF {parent_doc_id} due to no extracted text."
268
+ )
262
269
  return parent_doc_id
263
270
 
264
271
  try:
265
272
  llama_doc = LlamaDocument(text=extracted_text)
266
273
  # Run synchronous splitter in thread pool
267
274
  nodes = await asyncio.to_thread(
268
- self.semantic_splitter.get_nodes_from_documents,
269
- [llama_doc]
275
+ self.semantic_splitter.get_nodes_from_documents, [llama_doc]
270
276
  )
271
- print(
272
- f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}.")
277
+ print(f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}.")
273
278
  except Exception as e:
274
- print(
275
- f"Error during semantic chunking for PDF {parent_doc_id}: {e}")
279
+ print(f"Error during semantic chunking for PDF {parent_doc_id}: {e}")
276
280
  raise
277
281
 
278
282
  # --- 4. Embed Chunks and Batch Upsert to Pinecone ---
279
283
  if not nodes:
280
284
  return parent_doc_id # No chunks generated
281
285
 
282
- print(
283
- f"Embedding {len(nodes)} chunks using {self.openai_model_name}...")
286
+ print(f"Embedding {len(nodes)} chunks using {self.openai_model_name}...")
284
287
  chunk_texts = [node.get_content() for node in nodes]
285
288
  embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
286
289
  all_chunk_embeddings = []
@@ -307,30 +310,30 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
307
310
  "chunk_index": i,
308
311
  "is_chunk": True,
309
312
  "source": metadata.get("source", "unknown"),
310
- "tags": metadata.get("tags", [])
313
+ "tags": metadata.get("tags", []),
311
314
  }
312
315
  # Add chunk text itself if Pinecone adapter reranking is used
313
316
  if self.pinecone.use_reranking:
314
317
  chunk_metadata[self.pinecone.rerank_text_field] = chunk_texts[i]
315
318
 
316
- pinecone_vectors.append({
317
- "id": chunk_id,
318
- "values": all_chunk_embeddings[i],
319
- "metadata": chunk_metadata
320
- })
319
+ pinecone_vectors.append(
320
+ {
321
+ "id": chunk_id,
322
+ "values": all_chunk_embeddings[i],
323
+ "metadata": chunk_metadata,
324
+ }
325
+ )
321
326
 
322
327
  # Upsert vectors in batches using the generic upsert method
323
328
  print(
324
- f"Upserting {len(pinecone_vectors)} vectors to Pinecone in batches of {chunk_batch_size}...")
329
+ f"Upserting {len(pinecone_vectors)} vectors to Pinecone in batches of {chunk_batch_size}..."
330
+ )
325
331
  upsert_tasks = []
326
332
  for i in range(0, len(pinecone_vectors), chunk_batch_size):
327
- batch_vectors = pinecone_vectors[i:i+chunk_batch_size]
333
+ batch_vectors = pinecone_vectors[i : i + chunk_batch_size]
328
334
  # Create task for each batch upsert
329
335
  upsert_tasks.append(
330
- self.pinecone.upsert(
331
- vectors=batch_vectors,
332
- namespace=namespace
333
- )
336
+ self.pinecone.upsert(vectors=batch_vectors, namespace=namespace)
334
337
  )
335
338
  # Optional: Add a small delay between initiating tasks if rate limiting is a concern
336
339
  # await asyncio.sleep(0.05)
@@ -341,8 +344,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
341
344
  # Check for errors during upsert
342
345
  for idx, result in enumerate(results):
343
346
  if isinstance(result, Exception):
344
- print(
345
- f"Error upserting vector batch {idx + 1} to Pinecone: {result}")
347
+ print(f"Error upserting vector batch {idx + 1} to Pinecone: {result}")
346
348
  # Decide on error handling: log, raise, etc.
347
349
 
348
350
  print(f"Finished processing PDF {parent_doc_id}.")
@@ -355,7 +357,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
355
357
  top_k: int = 5,
356
358
  namespace: Optional[str] = None,
357
359
  include_content: bool = True,
358
- include_metadata: bool = True
360
+ include_metadata: bool = True,
359
361
  ) -> List[Dict[str, Any]]:
360
362
  """
361
363
  Query the knowledge base using semantic search with OpenAI embeddings.
@@ -394,7 +396,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
394
396
  top_k=initial_k, # Fetch more initially if reranking
395
397
  namespace=namespace,
396
398
  include_values=False, # Don't need embeddings back
397
- include_metadata=True # Need metadata for linking
399
+ include_metadata=True, # Need metadata for linking
398
400
  )
399
401
  except Exception as e:
400
402
  print(f"Error querying Pinecone: {e}")
@@ -405,10 +407,11 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
405
407
 
406
408
  # Extract IDs, scores, and metadata from Pinecone results
407
409
  # PineconeAdapter might have already reranked and truncated to final top_k
408
- result_ids = [res['id'] for res in pinecone_results]
409
- scores = {res['id']: res['score'] for res in pinecone_results}
410
- pinecone_metadatas = {res['id']: res.get(
411
- 'metadata', {}) for res in pinecone_results}
410
+ result_ids = [res["id"] for res in pinecone_results]
411
+ scores = {res["id"]: res["score"] for res in pinecone_results}
412
+ pinecone_metadatas = {
413
+ res["id"]: res.get("metadata", {}) for res in pinecone_results
414
+ }
412
415
 
413
416
  # --- Fetch corresponding data from MongoDB ---
414
417
  mongo_docs_map = {}
@@ -426,11 +429,9 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
426
429
  if parent_ids_to_fetch:
427
430
  try:
428
431
  mongo_docs = self.mongo.find(
429
- self.collection,
430
- {"document_id": {"$in": list(parent_ids_to_fetch)}}
432
+ self.collection, {"document_id": {"$in": list(parent_ids_to_fetch)}}
431
433
  )
432
- mongo_docs_map = {doc["document_id"]
433
- : doc for doc in mongo_docs}
434
+ mongo_docs_map = {doc["document_id"]: doc for doc in mongo_docs}
434
435
  except Exception as e:
435
436
  print(f"Error fetching documents from MongoDB: {e}")
436
437
  # Proceed with potentially missing Mongo data
@@ -462,7 +463,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
462
463
  if include_content:
463
464
  content = None
464
465
  # Priority 1: Reranking field in Pinecone metadata (holds chunk text)
465
- if self.pinecone.use_reranking and self.pinecone.rerank_text_field in pinecone_meta:
466
+ if (
467
+ self.pinecone.use_reranking
468
+ and self.pinecone.rerank_text_field in pinecone_meta
469
+ ):
466
470
  content = pinecone_meta[self.pinecone.rerank_text_field]
467
471
  # Priority 2: Get content from the relevant Mongo doc
468
472
  elif mongo_doc_for_content:
@@ -474,15 +478,29 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
474
478
  # Merge metadata from the relevant Mongo doc (parent or self)
475
479
  if mongo_doc_for_meta:
476
480
  combined_meta = {
477
- k: v for k, v in mongo_doc_for_meta.items()
478
- if k not in ["_id", "document_id", "content", "pdf_data", "is_chunk", "parent_document_id", "created_at", "updated_at"]
481
+ k: v
482
+ for k, v in mongo_doc_for_meta.items()
483
+ if k
484
+ not in [
485
+ "_id",
486
+ "document_id",
487
+ "content",
488
+ "pdf_data",
489
+ "is_chunk",
490
+ "parent_document_id",
491
+ "created_at",
492
+ "updated_at",
493
+ ]
479
494
  }
480
495
  # Add/overwrite with chunk-specific info from Pinecone meta
481
- combined_meta.update({
482
- k: v for k, v in pinecone_meta.items()
483
- # Avoid redundancy
484
- if k not in ["document_id", self.pinecone.rerank_text_field]
485
- })
496
+ combined_meta.update(
497
+ {
498
+ k: v
499
+ for k, v in pinecone_meta.items()
500
+ # Avoid redundancy
501
+ if k not in ["document_id", self.pinecone.rerank_text_field]
502
+ }
503
+ )
486
504
  result["metadata"] = combined_meta
487
505
 
488
506
  results.append(result)
@@ -490,9 +508,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
490
508
  return results
491
509
 
492
510
  async def delete_document(
493
- self,
494
- document_id: str,
495
- namespace: Optional[str] = None
511
+ self, document_id: str, namespace: Optional[str] = None
496
512
  ) -> bool:
497
513
  """
498
514
  Delete a document (plain text or PDF) and all its associated chunks.
@@ -505,7 +521,8 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
505
521
  True if deletion was successful (or partially successful).
506
522
  """
507
523
  print(
508
- f"Attempting to delete document and associated data for ID: {document_id}")
524
+ f"Attempting to delete document and associated data for ID: {document_id}"
525
+ )
509
526
  mongo_deleted_count = 0
510
527
  pinecone_deleted = False
511
528
 
@@ -517,46 +534,53 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
517
534
  docs_to_delete_mongo = []
518
535
  mongo_ids_to_delete = set([document_id]) # Start with the main ID
519
536
  try:
520
- docs_to_delete_mongo = list(self.mongo.find(
521
- self.collection,
522
- {"$or": [{"document_id": document_id}, {
523
- "parent_document_id": document_id}]}
524
- ))
537
+ docs_to_delete_mongo = list(
538
+ self.mongo.find(
539
+ self.collection,
540
+ {
541
+ "$or": [
542
+ {"document_id": document_id},
543
+ {"parent_document_id": document_id},
544
+ ]
545
+ },
546
+ )
547
+ )
525
548
  for doc in docs_to_delete_mongo:
526
549
  mongo_ids_to_delete.add(doc["document_id"])
527
550
  except Exception as e:
528
551
  print(
529
- f"Warning: Error finding documents in MongoDB for deletion ({document_id}): {e}. Proceeding with main ID only.")
552
+ f"Warning: Error finding documents in MongoDB for deletion ({document_id}): {e}. Proceeding with main ID only."
553
+ )
530
554
 
531
555
  pinecone_ids_to_delete = list(mongo_ids_to_delete)
532
556
 
533
557
  # --- 2. Delete from Pinecone ---
534
558
  if pinecone_ids_to_delete:
535
559
  try:
536
- await self.pinecone.delete(ids=pinecone_ids_to_delete, namespace=namespace)
560
+ await self.pinecone.delete(
561
+ ids=pinecone_ids_to_delete, namespace=namespace
562
+ )
537
563
  print(
538
- f"Deleted {len(pinecone_ids_to_delete)} vectors from Pinecone for parent {document_id}.")
564
+ f"Deleted {len(pinecone_ids_to_delete)} vectors from Pinecone for parent {document_id}."
565
+ )
539
566
  pinecone_deleted = True
540
567
  except Exception as e:
541
- print(
542
- f"Error deleting vectors from Pinecone for {document_id}: {e}")
568
+ print(f"Error deleting vectors from Pinecone for {document_id}: {e}")
543
569
 
544
570
  # --- 3. Delete from MongoDB ---
545
571
  # Use the IDs confirmed to be in Mongo
546
- mongo_ids_found_in_db = [doc["document_id"]
547
- for doc in docs_to_delete_mongo]
572
+ mongo_ids_found_in_db = [doc["document_id"] for doc in docs_to_delete_mongo]
548
573
  if mongo_ids_found_in_db:
549
574
  try:
550
575
  delete_result = self.mongo.delete_many(
551
- self.collection,
552
- {"document_id": {"$in": mongo_ids_found_in_db}}
576
+ self.collection, {"document_id": {"$in": mongo_ids_found_in_db}}
553
577
  )
554
578
  mongo_deleted_count = delete_result.deleted_count
555
579
  print(
556
- f"Deleted {mongo_deleted_count} documents from MongoDB for parent {document_id}.")
580
+ f"Deleted {mongo_deleted_count} documents from MongoDB for parent {document_id}."
581
+ )
557
582
  except Exception as e:
558
- print(
559
- f"Error deleting documents from MongoDB for {document_id}: {e}")
583
+ print(f"Error deleting documents from MongoDB for {document_id}: {e}")
560
584
 
561
585
  return pinecone_deleted or mongo_deleted_count > 0
562
586
 
@@ -565,7 +589,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
565
589
  document_id: str,
566
590
  text: Optional[str] = None,
567
591
  metadata: Optional[Dict[str, Any]] = None,
568
- namespace: Optional[str] = None
592
+ namespace: Optional[str] = None,
569
593
  ) -> bool:
570
594
  """
571
595
  Update an existing plain text document or metadata. Embeds using OpenAI.
@@ -580,8 +604,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
580
604
  Returns:
581
605
  True if successful.
582
606
  """
583
- current_doc = self.mongo.find_one(
584
- self.collection, {"document_id": document_id})
607
+ current_doc = self.mongo.find_one(self.collection, {"document_id": document_id})
585
608
  if not current_doc:
586
609
  print(f"Document {document_id} not found for update.")
587
610
  return False
@@ -590,7 +613,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
590
613
  print(f"Cannot update chunk {document_id} directly.")
591
614
  return False
592
615
  if current_doc.get("pdf_data") and text is not None:
593
- print(f"Cannot update PDF content via this method. Delete and re-add.")
616
+ print("Cannot update PDF content via this method. Delete and re-add.")
594
617
  return False
595
618
 
596
619
  update_text = text is not None and not current_doc.get("pdf_data")
@@ -608,8 +631,9 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
608
631
  if mongo_update: # Only update if there are changes
609
632
  try:
610
633
  update_result = self.mongo.update_one(
611
- self.collection, {"document_id": document_id}, {
612
- "$set": mongo_update}
634
+ self.collection,
635
+ {"document_id": document_id},
636
+ {"$set": mongo_update},
613
637
  )
614
638
  mongo_updated = update_result.modified_count > 0
615
639
  except Exception as e:
@@ -630,11 +654,19 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
630
654
  return mongo_updated # Return based on Mongo success
631
655
 
632
656
  # Prepare Pinecone metadata
633
- final_metadata = {**current_doc, **
634
- mongo_update} # Use updated data
657
+ final_metadata = {**current_doc, **mongo_update} # Use updated data
635
658
  pinecone_metadata = {"document_id": document_id, "is_chunk": False}
636
659
  for key, value in final_metadata.items():
637
- if key not in ["_id", "content", "pdf_data", "created_at", "updated_at", "document_id", "is_chunk", "parent_document_id"]:
660
+ if key not in [
661
+ "_id",
662
+ "content",
663
+ "pdf_data",
664
+ "created_at",
665
+ "updated_at",
666
+ "document_id",
667
+ "is_chunk",
668
+ "parent_document_id",
669
+ ]:
638
670
  pinecone_metadata[key] = value
639
671
  if self.pinecone.use_reranking:
640
672
  pinecone_metadata[self.pinecone.rerank_text_field] = text_content
@@ -642,14 +674,20 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
642
674
  # Upsert vector to Pinecone
643
675
  try:
644
676
  await self.pinecone.upsert(
645
- vectors=[{"id": document_id, "values": embedding,
646
- "metadata": pinecone_metadata}],
647
- namespace=namespace
677
+ vectors=[
678
+ {
679
+ "id": document_id,
680
+ "values": embedding,
681
+ "metadata": pinecone_metadata,
682
+ }
683
+ ],
684
+ namespace=namespace,
648
685
  )
649
686
  pinecone_updated = True
650
687
  except Exception as e:
651
688
  print(
652
- f"Error upserting updated vector in Pinecone for {document_id}: {e}")
689
+ f"Error upserting updated vector in Pinecone for {document_id}: {e}"
690
+ )
653
691
  # Mongo update succeeded, Pinecone failed
654
692
 
655
693
  return mongo_updated or pinecone_updated
@@ -659,7 +697,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
659
697
  # Expects {'text': ..., 'metadata': ...}
660
698
  documents: List[Dict[str, Any]],
661
699
  namespace: Optional[str] = None,
662
- batch_size: int = 50
700
+ batch_size: int = 50,
663
701
  ) -> List[str]:
664
702
  """
665
703
  Add multiple plain text documents in batches using OpenAI embeddings.
@@ -676,12 +714,14 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
676
714
  embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
677
715
 
678
716
  for i in range(0, len(documents), batch_size):
679
- batch_docs_input = documents[i:i+batch_size]
680
- batch_texts = [doc['text'] for doc in batch_docs_input]
681
- batch_metadatas = [doc['metadata'] for doc in batch_docs_input]
717
+ batch_docs_input = documents[i : i + batch_size]
718
+ batch_texts = [doc["text"] for doc in batch_docs_input]
719
+ batch_metadatas = [doc["metadata"] for doc in batch_docs_input]
682
720
  # Generate IDs if not provided in metadata
683
- batch_doc_ids = [doc['metadata'].get('document_id') or str(
684
- uuid.uuid4()) for doc in batch_docs_input]
721
+ batch_doc_ids = [
722
+ doc["metadata"].get("document_id") or str(uuid.uuid4())
723
+ for doc in batch_docs_input
724
+ ]
685
725
  all_doc_ids.extend(batch_doc_ids)
686
726
 
687
727
  # Prepare MongoDB docs
@@ -690,14 +730,19 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
690
730
  doc_id = batch_doc_ids[idx]
691
731
  metadata = batch_metadatas[idx]
692
732
  mongo_doc = {
693
- "document_id": doc_id, "content": text, "is_chunk": False,
694
- "parent_document_id": None, **metadata,
695
- "created_at": metadata.get("created_at", dt.now(tz=dt.now().astimezone().tzinfo)),
696
- "updated_at": dt.now(tz=dt.now().astimezone().tzinfo)
733
+ "document_id": doc_id,
734
+ "content": text,
735
+ "is_chunk": False,
736
+ "parent_document_id": None,
737
+ **metadata,
738
+ "created_at": metadata.get(
739
+ "created_at", dt.now(tz=dt.now().astimezone().tzinfo)
740
+ ),
741
+ "updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
697
742
  }
698
743
  # Ensure generated ID is in the doc for Mongo
699
- if 'document_id' not in metadata:
700
- mongo_doc['document_id'] = doc_id
744
+ if "document_id" not in metadata:
745
+ mongo_doc["document_id"] = doc_id
701
746
  mongo_batch.append(mongo_doc)
702
747
 
703
748
  # Insert into MongoDB
@@ -706,16 +751,20 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
706
751
  self.mongo.insert_many(self.collection, mongo_batch)
707
752
  except Exception as e:
708
753
  print(
709
- f"Error inserting batch {i//batch_size + 1} into MongoDB: {e}")
754
+ f"Error inserting batch {i // batch_size + 1} into MongoDB: {e}"
755
+ )
710
756
  # Decide if we should skip Pinecone for this batch
711
757
  continue # Skip to next batch
712
758
 
713
759
  # Embed batch using OpenAIEmbedding
714
760
  try:
715
- batch_embeddings = await embed_model.aget_text_embedding_batch(batch_texts, show_progress=True)
761
+ batch_embeddings = await embed_model.aget_text_embedding_batch(
762
+ batch_texts, show_progress=True
763
+ )
716
764
  except Exception as e:
717
765
  print(
718
- f"Error embedding batch {i//batch_size + 1} using {self.openai_model_name}: {e}")
766
+ f"Error embedding batch {i // batch_size + 1} using {self.openai_model_name}: {e}"
767
+ )
719
768
  continue # Skip Pinecone upsert for this batch
720
769
 
721
770
  # Prepare Pinecone vectors
@@ -723,29 +772,32 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
723
772
  for idx, doc_id in enumerate(batch_doc_ids):
724
773
  metadata = batch_metadatas[idx]
725
774
  pinecone_meta = {
726
- "document_id": doc_id, "is_chunk": False,
775
+ "document_id": doc_id,
776
+ "is_chunk": False,
727
777
  "source": metadata.get("source", "unknown"),
728
- "tags": metadata.get("tags", [])
778
+ "tags": metadata.get("tags", []),
729
779
  }
730
780
  if self.pinecone.use_reranking:
731
781
  pinecone_meta[self.pinecone.rerank_text_field] = batch_texts[idx]
732
782
 
733
- pinecone_vectors.append({
734
- "id": doc_id,
735
- "values": batch_embeddings[idx],
736
- "metadata": pinecone_meta
737
- })
783
+ pinecone_vectors.append(
784
+ {
785
+ "id": doc_id,
786
+ "values": batch_embeddings[idx],
787
+ "metadata": pinecone_meta,
788
+ }
789
+ )
738
790
 
739
791
  # Upsert vectors to Pinecone
740
792
  if pinecone_vectors:
741
793
  try:
742
794
  await self.pinecone.upsert(
743
- vectors=pinecone_vectors,
744
- namespace=namespace
795
+ vectors=pinecone_vectors, namespace=namespace
745
796
  )
746
797
  except Exception as e:
747
798
  print(
748
- f"Error upserting vector batch {i//batch_size + 1} to Pinecone: {e}")
799
+ f"Error upserting vector batch {i // batch_size + 1} to Pinecone: {e}"
800
+ )
749
801
 
750
802
  # Optional delay
751
803
  if i + batch_size < len(documents):
@@ -766,6 +818,5 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
766
818
  try:
767
819
  return self.mongo.find_one(self.collection, {"document_id": document_id})
768
820
  except Exception as e:
769
- print(
770
- f"Error retrieving full document {document_id} from MongoDB: {e}")
821
+ print(f"Error retrieving full document {document_id} from MongoDB: {e}")
771
822
  return None