solana-agent 27.3.5__py3-none-any.whl → 27.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solana_agent/__init__.py +1 -3
- solana_agent/adapters/mongodb_adapter.py +5 -2
- solana_agent/adapters/openai_adapter.py +32 -27
- solana_agent/adapters/pinecone_adapter.py +91 -63
- solana_agent/client/solana_agent.py +38 -23
- solana_agent/domains/agent.py +7 -13
- solana_agent/domains/routing.py +5 -5
- solana_agent/factories/agent_factory.py +49 -34
- solana_agent/interfaces/client/client.py +22 -13
- solana_agent/interfaces/plugins/plugins.py +2 -1
- solana_agent/interfaces/providers/data_storage.py +9 -2
- solana_agent/interfaces/providers/llm.py +26 -12
- solana_agent/interfaces/providers/memory.py +1 -1
- solana_agent/interfaces/providers/vector_storage.py +3 -9
- solana_agent/interfaces/services/agent.py +21 -6
- solana_agent/interfaces/services/knowledge_base.py +6 -8
- solana_agent/interfaces/services/query.py +16 -5
- solana_agent/interfaces/services/routing.py +0 -1
- solana_agent/plugins/manager.py +14 -9
- solana_agent/plugins/registry.py +13 -11
- solana_agent/plugins/tools/__init__.py +0 -5
- solana_agent/plugins/tools/auto_tool.py +1 -0
- solana_agent/repositories/memory.py +20 -22
- solana_agent/services/__init__.py +1 -1
- solana_agent/services/agent.py +119 -89
- solana_agent/services/knowledge_base.py +182 -131
- solana_agent/services/query.py +48 -24
- solana_agent/services/routing.py +30 -18
- {solana_agent-27.3.5.dist-info → solana_agent-27.3.7.dist-info}/METADATA +6 -3
- solana_agent-27.3.7.dist-info/RECORD +39 -0
- solana_agent-27.3.5.dist-info/RECORD +0 -39
- {solana_agent-27.3.5.dist-info → solana_agent-27.3.7.dist-info}/LICENSE +0 -0
- {solana_agent-27.3.5.dist-info → solana_agent-27.3.7.dist-info}/WHEEL +0 -0
@@ -12,7 +12,9 @@ from llama_index.embeddings.openai import OpenAIEmbedding
|
|
12
12
|
|
13
13
|
from solana_agent.adapters.pinecone_adapter import PineconeAdapter
|
14
14
|
from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
|
15
|
-
from solana_agent.interfaces.services.knowledge_base import
|
15
|
+
from solana_agent.interfaces.services.knowledge_base import (
|
16
|
+
KnowledgeBaseService as KnowledgeBaseInterface,
|
17
|
+
)
|
16
18
|
|
17
19
|
|
18
20
|
class KnowledgeBaseService(KnowledgeBaseInterface):
|
@@ -59,7 +61,8 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
59
61
|
api_key = openai_api_key
|
60
62
|
if not api_key:
|
61
63
|
raise ValueError(
|
62
|
-
"OpenAI API key not provided via argument or OPENAI_API_KEY environment variable."
|
64
|
+
"OpenAI API key not provided via argument or OPENAI_API_KEY environment variable."
|
65
|
+
)
|
63
66
|
|
64
67
|
# Determine expected embedding dimensions based on model name
|
65
68
|
if openai_model_name == "text-embedding-3-large":
|
@@ -68,13 +71,14 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
68
71
|
openai_dimensions = 1536
|
69
72
|
else:
|
70
73
|
# Attempt to get dimension from Pinecone config if available, otherwise raise error
|
71
|
-
openai_dimensions = getattr(
|
72
|
-
self.pinecone, 'embedding_dimensions', 0)
|
74
|
+
openai_dimensions = getattr(self.pinecone, "embedding_dimensions", 0)
|
73
75
|
if openai_dimensions <= 0:
|
74
76
|
raise ValueError(
|
75
|
-
f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured."
|
77
|
+
f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured."
|
78
|
+
)
|
76
79
|
print(
|
77
|
-
f"Warning: Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct."
|
80
|
+
f"Warning: Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct."
|
81
|
+
)
|
78
82
|
|
79
83
|
# Instantiate OpenAIEmbedding
|
80
84
|
# Note: LlamaIndex OpenAIEmbedding doesn't directly support reducing dimensions via 'dimensions' param during init
|
@@ -92,7 +96,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
92
96
|
self.semantic_splitter = SemanticSplitterNodeParser(
|
93
97
|
buffer_size=splitter_buffer_size,
|
94
98
|
breakpoint_percentile_threshold=splitter_breakpoint_percentile,
|
95
|
-
embed_model=llama_embed_model # Use the OpenAIEmbedding instance
|
99
|
+
embed_model=llama_embed_model, # Use the OpenAIEmbedding instance
|
96
100
|
)
|
97
101
|
# Store model name for logging/reference
|
98
102
|
self.openai_model_name = openai_model_name
|
@@ -107,22 +111,20 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
107
111
|
print(f"Created MongoDB collection: {self.collection}")
|
108
112
|
|
109
113
|
# Indexes for retrieval and filtering
|
110
|
-
self.mongo.create_index(
|
111
|
-
self.collection, [("document_id", 1)], unique=True)
|
114
|
+
self.mongo.create_index(self.collection, [("document_id", 1)], unique=True)
|
112
115
|
self.mongo.create_index(self.collection, [("parent_document_id", 1)])
|
113
116
|
self.mongo.create_index(self.collection, [("source", 1)])
|
114
117
|
self.mongo.create_index(self.collection, [("created_at", -1)])
|
115
118
|
self.mongo.create_index(self.collection, [("tags", 1)])
|
116
119
|
self.mongo.create_index(self.collection, [("is_chunk", 1)])
|
117
|
-
print(
|
118
|
-
f"Ensured indexes exist for MongoDB collection: {self.collection}")
|
120
|
+
print(f"Ensured indexes exist for MongoDB collection: {self.collection}")
|
119
121
|
|
120
122
|
async def add_document(
|
121
123
|
self,
|
122
124
|
text: str,
|
123
125
|
metadata: Dict[str, Any],
|
124
126
|
document_id: Optional[str] = None,
|
125
|
-
namespace: Optional[str] = None
|
127
|
+
namespace: Optional[str] = None,
|
126
128
|
) -> str:
|
127
129
|
"""
|
128
130
|
Add a plain text document to the knowledge base. Embeds using OpenAI.
|
@@ -146,8 +148,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
146
148
|
"parent_document_id": None,
|
147
149
|
**metadata,
|
148
150
|
# Use timezone aware datetime
|
149
|
-
"created_at": metadata.get(
|
150
|
-
|
151
|
+
"created_at": metadata.get(
|
152
|
+
"created_at", dt.now(tz=dt.now().astimezone().tzinfo)
|
153
|
+
),
|
154
|
+
"updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
|
151
155
|
}
|
152
156
|
try:
|
153
157
|
self.mongo.insert_one(self.collection, mongo_doc)
|
@@ -161,7 +165,8 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
161
165
|
embedding = await embed_model.aget_text_embedding(text)
|
162
166
|
except Exception as e:
|
163
167
|
print(
|
164
|
-
f"Error embedding document {doc_id} using {self.openai_model_name}: {e}"
|
168
|
+
f"Error embedding document {doc_id} using {self.openai_model_name}: {e}"
|
169
|
+
)
|
165
170
|
# Decide how to handle - Mongo insert succeeded, embedding failed
|
166
171
|
raise # Re-raise for now
|
167
172
|
|
@@ -170,7 +175,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
170
175
|
"document_id": doc_id,
|
171
176
|
"is_chunk": False,
|
172
177
|
"source": metadata.get("source", "unknown"),
|
173
|
-
"tags": metadata.get("tags", [])
|
178
|
+
"tags": metadata.get("tags", []),
|
174
179
|
}
|
175
180
|
# Add text itself if Pinecone adapter reranking is used
|
176
181
|
if self.pinecone.use_reranking:
|
@@ -179,9 +184,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
179
184
|
# Upsert vector to Pinecone using the generic upsert method
|
180
185
|
try:
|
181
186
|
await self.pinecone.upsert(
|
182
|
-
vectors=[
|
183
|
-
|
184
|
-
|
187
|
+
vectors=[
|
188
|
+
{"id": doc_id, "values": embedding, "metadata": pinecone_metadata}
|
189
|
+
],
|
190
|
+
namespace=namespace,
|
185
191
|
)
|
186
192
|
except Exception as e:
|
187
193
|
print(f"Error upserting vector for {doc_id} to Pinecone: {e}")
|
@@ -196,7 +202,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
196
202
|
metadata: Dict[str, Any],
|
197
203
|
document_id: Optional[str] = None,
|
198
204
|
namespace: Optional[str] = None,
|
199
|
-
chunk_batch_size: int = 50
|
205
|
+
chunk_batch_size: int = 50,
|
200
206
|
) -> str:
|
201
207
|
"""
|
202
208
|
Add a PDF document, performs semantic chunking using OpenAI embeddings,
|
@@ -223,17 +229,14 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
223
229
|
elif isinstance(pdf_data, bytes):
|
224
230
|
pdf_bytes = pdf_data
|
225
231
|
else:
|
226
|
-
raise ValueError(
|
227
|
-
"pdf_data must be bytes or a file path string.")
|
232
|
+
raise ValueError("pdf_data must be bytes or a file path string.")
|
228
233
|
|
229
234
|
reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
|
230
|
-
extracted_text = "".join(
|
231
|
-
page.extract_text() or "" for page in reader.pages)
|
235
|
+
extracted_text = "".join(page.extract_text() or "" for page in reader.pages)
|
232
236
|
if not extracted_text.strip():
|
233
237
|
print(f"Warning: No text extracted from PDF {parent_doc_id}.")
|
234
238
|
except Exception as e:
|
235
|
-
print(
|
236
|
-
f"Error reading or extracting text from PDF {parent_doc_id}: {e}")
|
239
|
+
print(f"Error reading or extracting text from PDF {parent_doc_id}: {e}")
|
237
240
|
raise
|
238
241
|
|
239
242
|
# --- 2. Store Full PDF and Metadata in MongoDB ---
|
@@ -244,43 +247,43 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
244
247
|
"is_chunk": False,
|
245
248
|
"parent_document_id": None,
|
246
249
|
**metadata,
|
247
|
-
"created_at": metadata.get(
|
248
|
-
|
250
|
+
"created_at": metadata.get(
|
251
|
+
"created_at", dt.now(tz=dt.now().astimezone().tzinfo)
|
252
|
+
),
|
253
|
+
"updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
|
249
254
|
}
|
250
255
|
try:
|
251
256
|
self.mongo.insert_one(self.collection, mongo_parent_doc)
|
252
257
|
print(f"Stored full PDF {parent_doc_id} in MongoDB.")
|
253
258
|
except Exception as e: # pragma: no cover
|
254
259
|
print(
|
255
|
-
f"Error inserting parent PDF {parent_doc_id} into MongoDB: {e}"
|
260
|
+
f"Error inserting parent PDF {parent_doc_id} into MongoDB: {e}"
|
261
|
+
) # pragma: no cover
|
256
262
|
raise # pragma: no cover
|
257
263
|
|
258
264
|
# --- 3. Semantic Chunking ---
|
259
265
|
if not extracted_text.strip():
|
260
266
|
print(
|
261
|
-
f"Skipping chunking for PDF {parent_doc_id} due to no extracted text."
|
267
|
+
f"Skipping chunking for PDF {parent_doc_id} due to no extracted text."
|
268
|
+
)
|
262
269
|
return parent_doc_id
|
263
270
|
|
264
271
|
try:
|
265
272
|
llama_doc = LlamaDocument(text=extracted_text)
|
266
273
|
# Run synchronous splitter in thread pool
|
267
274
|
nodes = await asyncio.to_thread(
|
268
|
-
self.semantic_splitter.get_nodes_from_documents,
|
269
|
-
[llama_doc]
|
275
|
+
self.semantic_splitter.get_nodes_from_documents, [llama_doc]
|
270
276
|
)
|
271
|
-
print(
|
272
|
-
f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}.")
|
277
|
+
print(f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}.")
|
273
278
|
except Exception as e:
|
274
|
-
print(
|
275
|
-
f"Error during semantic chunking for PDF {parent_doc_id}: {e}")
|
279
|
+
print(f"Error during semantic chunking for PDF {parent_doc_id}: {e}")
|
276
280
|
raise
|
277
281
|
|
278
282
|
# --- 4. Embed Chunks and Batch Upsert to Pinecone ---
|
279
283
|
if not nodes:
|
280
284
|
return parent_doc_id # No chunks generated
|
281
285
|
|
282
|
-
print(
|
283
|
-
f"Embedding {len(nodes)} chunks using {self.openai_model_name}...")
|
286
|
+
print(f"Embedding {len(nodes)} chunks using {self.openai_model_name}...")
|
284
287
|
chunk_texts = [node.get_content() for node in nodes]
|
285
288
|
embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
|
286
289
|
all_chunk_embeddings = []
|
@@ -307,30 +310,30 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
307
310
|
"chunk_index": i,
|
308
311
|
"is_chunk": True,
|
309
312
|
"source": metadata.get("source", "unknown"),
|
310
|
-
"tags": metadata.get("tags", [])
|
313
|
+
"tags": metadata.get("tags", []),
|
311
314
|
}
|
312
315
|
# Add chunk text itself if Pinecone adapter reranking is used
|
313
316
|
if self.pinecone.use_reranking:
|
314
317
|
chunk_metadata[self.pinecone.rerank_text_field] = chunk_texts[i]
|
315
318
|
|
316
|
-
pinecone_vectors.append(
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
319
|
+
pinecone_vectors.append(
|
320
|
+
{
|
321
|
+
"id": chunk_id,
|
322
|
+
"values": all_chunk_embeddings[i],
|
323
|
+
"metadata": chunk_metadata,
|
324
|
+
}
|
325
|
+
)
|
321
326
|
|
322
327
|
# Upsert vectors in batches using the generic upsert method
|
323
328
|
print(
|
324
|
-
f"Upserting {len(pinecone_vectors)} vectors to Pinecone in batches of {chunk_batch_size}..."
|
329
|
+
f"Upserting {len(pinecone_vectors)} vectors to Pinecone in batches of {chunk_batch_size}..."
|
330
|
+
)
|
325
331
|
upsert_tasks = []
|
326
332
|
for i in range(0, len(pinecone_vectors), chunk_batch_size):
|
327
|
-
batch_vectors = pinecone_vectors[i:i+chunk_batch_size]
|
333
|
+
batch_vectors = pinecone_vectors[i : i + chunk_batch_size]
|
328
334
|
# Create task for each batch upsert
|
329
335
|
upsert_tasks.append(
|
330
|
-
self.pinecone.upsert(
|
331
|
-
vectors=batch_vectors,
|
332
|
-
namespace=namespace
|
333
|
-
)
|
336
|
+
self.pinecone.upsert(vectors=batch_vectors, namespace=namespace)
|
334
337
|
)
|
335
338
|
# Optional: Add a small delay between initiating tasks if rate limiting is a concern
|
336
339
|
# await asyncio.sleep(0.05)
|
@@ -341,8 +344,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
341
344
|
# Check for errors during upsert
|
342
345
|
for idx, result in enumerate(results):
|
343
346
|
if isinstance(result, Exception):
|
344
|
-
print(
|
345
|
-
f"Error upserting vector batch {idx + 1} to Pinecone: {result}")
|
347
|
+
print(f"Error upserting vector batch {idx + 1} to Pinecone: {result}")
|
346
348
|
# Decide on error handling: log, raise, etc.
|
347
349
|
|
348
350
|
print(f"Finished processing PDF {parent_doc_id}.")
|
@@ -355,7 +357,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
355
357
|
top_k: int = 5,
|
356
358
|
namespace: Optional[str] = None,
|
357
359
|
include_content: bool = True,
|
358
|
-
include_metadata: bool = True
|
360
|
+
include_metadata: bool = True,
|
359
361
|
) -> List[Dict[str, Any]]:
|
360
362
|
"""
|
361
363
|
Query the knowledge base using semantic search with OpenAI embeddings.
|
@@ -394,7 +396,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
394
396
|
top_k=initial_k, # Fetch more initially if reranking
|
395
397
|
namespace=namespace,
|
396
398
|
include_values=False, # Don't need embeddings back
|
397
|
-
include_metadata=True # Need metadata for linking
|
399
|
+
include_metadata=True, # Need metadata for linking
|
398
400
|
)
|
399
401
|
except Exception as e:
|
400
402
|
print(f"Error querying Pinecone: {e}")
|
@@ -405,10 +407,11 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
405
407
|
|
406
408
|
# Extract IDs, scores, and metadata from Pinecone results
|
407
409
|
# PineconeAdapter might have already reranked and truncated to final top_k
|
408
|
-
result_ids = [res[
|
409
|
-
scores = {res[
|
410
|
-
pinecone_metadatas = {
|
411
|
-
|
410
|
+
result_ids = [res["id"] for res in pinecone_results]
|
411
|
+
scores = {res["id"]: res["score"] for res in pinecone_results}
|
412
|
+
pinecone_metadatas = {
|
413
|
+
res["id"]: res.get("metadata", {}) for res in pinecone_results
|
414
|
+
}
|
412
415
|
|
413
416
|
# --- Fetch corresponding data from MongoDB ---
|
414
417
|
mongo_docs_map = {}
|
@@ -426,11 +429,9 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
426
429
|
if parent_ids_to_fetch:
|
427
430
|
try:
|
428
431
|
mongo_docs = self.mongo.find(
|
429
|
-
self.collection,
|
430
|
-
{"document_id": {"$in": list(parent_ids_to_fetch)}}
|
432
|
+
self.collection, {"document_id": {"$in": list(parent_ids_to_fetch)}}
|
431
433
|
)
|
432
|
-
mongo_docs_map = {doc["document_id"]
|
433
|
-
: doc for doc in mongo_docs}
|
434
|
+
mongo_docs_map = {doc["document_id"]: doc for doc in mongo_docs}
|
434
435
|
except Exception as e:
|
435
436
|
print(f"Error fetching documents from MongoDB: {e}")
|
436
437
|
# Proceed with potentially missing Mongo data
|
@@ -462,7 +463,10 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
462
463
|
if include_content:
|
463
464
|
content = None
|
464
465
|
# Priority 1: Reranking field in Pinecone metadata (holds chunk text)
|
465
|
-
if
|
466
|
+
if (
|
467
|
+
self.pinecone.use_reranking
|
468
|
+
and self.pinecone.rerank_text_field in pinecone_meta
|
469
|
+
):
|
466
470
|
content = pinecone_meta[self.pinecone.rerank_text_field]
|
467
471
|
# Priority 2: Get content from the relevant Mongo doc
|
468
472
|
elif mongo_doc_for_content:
|
@@ -474,15 +478,29 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
474
478
|
# Merge metadata from the relevant Mongo doc (parent or self)
|
475
479
|
if mongo_doc_for_meta:
|
476
480
|
combined_meta = {
|
477
|
-
k: v
|
478
|
-
|
481
|
+
k: v
|
482
|
+
for k, v in mongo_doc_for_meta.items()
|
483
|
+
if k
|
484
|
+
not in [
|
485
|
+
"_id",
|
486
|
+
"document_id",
|
487
|
+
"content",
|
488
|
+
"pdf_data",
|
489
|
+
"is_chunk",
|
490
|
+
"parent_document_id",
|
491
|
+
"created_at",
|
492
|
+
"updated_at",
|
493
|
+
]
|
479
494
|
}
|
480
495
|
# Add/overwrite with chunk-specific info from Pinecone meta
|
481
|
-
combined_meta.update(
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
496
|
+
combined_meta.update(
|
497
|
+
{
|
498
|
+
k: v
|
499
|
+
for k, v in pinecone_meta.items()
|
500
|
+
# Avoid redundancy
|
501
|
+
if k not in ["document_id", self.pinecone.rerank_text_field]
|
502
|
+
}
|
503
|
+
)
|
486
504
|
result["metadata"] = combined_meta
|
487
505
|
|
488
506
|
results.append(result)
|
@@ -490,9 +508,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
490
508
|
return results
|
491
509
|
|
492
510
|
async def delete_document(
|
493
|
-
self,
|
494
|
-
document_id: str,
|
495
|
-
namespace: Optional[str] = None
|
511
|
+
self, document_id: str, namespace: Optional[str] = None
|
496
512
|
) -> bool:
|
497
513
|
"""
|
498
514
|
Delete a document (plain text or PDF) and all its associated chunks.
|
@@ -505,7 +521,8 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
505
521
|
True if deletion was successful (or partially successful).
|
506
522
|
"""
|
507
523
|
print(
|
508
|
-
f"Attempting to delete document and associated data for ID: {document_id}"
|
524
|
+
f"Attempting to delete document and associated data for ID: {document_id}"
|
525
|
+
)
|
509
526
|
mongo_deleted_count = 0
|
510
527
|
pinecone_deleted = False
|
511
528
|
|
@@ -517,46 +534,53 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
517
534
|
docs_to_delete_mongo = []
|
518
535
|
mongo_ids_to_delete = set([document_id]) # Start with the main ID
|
519
536
|
try:
|
520
|
-
docs_to_delete_mongo = list(
|
521
|
-
self.
|
522
|
-
|
523
|
-
|
524
|
-
|
537
|
+
docs_to_delete_mongo = list(
|
538
|
+
self.mongo.find(
|
539
|
+
self.collection,
|
540
|
+
{
|
541
|
+
"$or": [
|
542
|
+
{"document_id": document_id},
|
543
|
+
{"parent_document_id": document_id},
|
544
|
+
]
|
545
|
+
},
|
546
|
+
)
|
547
|
+
)
|
525
548
|
for doc in docs_to_delete_mongo:
|
526
549
|
mongo_ids_to_delete.add(doc["document_id"])
|
527
550
|
except Exception as e:
|
528
551
|
print(
|
529
|
-
f"Warning: Error finding documents in MongoDB for deletion ({document_id}): {e}. Proceeding with main ID only."
|
552
|
+
f"Warning: Error finding documents in MongoDB for deletion ({document_id}): {e}. Proceeding with main ID only."
|
553
|
+
)
|
530
554
|
|
531
555
|
pinecone_ids_to_delete = list(mongo_ids_to_delete)
|
532
556
|
|
533
557
|
# --- 2. Delete from Pinecone ---
|
534
558
|
if pinecone_ids_to_delete:
|
535
559
|
try:
|
536
|
-
await self.pinecone.delete(
|
560
|
+
await self.pinecone.delete(
|
561
|
+
ids=pinecone_ids_to_delete, namespace=namespace
|
562
|
+
)
|
537
563
|
print(
|
538
|
-
f"Deleted {len(pinecone_ids_to_delete)} vectors from Pinecone for parent {document_id}."
|
564
|
+
f"Deleted {len(pinecone_ids_to_delete)} vectors from Pinecone for parent {document_id}."
|
565
|
+
)
|
539
566
|
pinecone_deleted = True
|
540
567
|
except Exception as e:
|
541
|
-
print(
|
542
|
-
f"Error deleting vectors from Pinecone for {document_id}: {e}")
|
568
|
+
print(f"Error deleting vectors from Pinecone for {document_id}: {e}")
|
543
569
|
|
544
570
|
# --- 3. Delete from MongoDB ---
|
545
571
|
# Use the IDs confirmed to be in Mongo
|
546
|
-
mongo_ids_found_in_db = [doc["document_id"]
|
547
|
-
for doc in docs_to_delete_mongo]
|
572
|
+
mongo_ids_found_in_db = [doc["document_id"] for doc in docs_to_delete_mongo]
|
548
573
|
if mongo_ids_found_in_db:
|
549
574
|
try:
|
550
575
|
delete_result = self.mongo.delete_many(
|
551
|
-
self.collection,
|
552
|
-
{"document_id": {"$in": mongo_ids_found_in_db}}
|
576
|
+
self.collection, {"document_id": {"$in": mongo_ids_found_in_db}}
|
553
577
|
)
|
554
578
|
mongo_deleted_count = delete_result.deleted_count
|
555
579
|
print(
|
556
|
-
f"Deleted {mongo_deleted_count} documents from MongoDB for parent {document_id}."
|
580
|
+
f"Deleted {mongo_deleted_count} documents from MongoDB for parent {document_id}."
|
581
|
+
)
|
557
582
|
except Exception as e:
|
558
|
-
print(
|
559
|
-
f"Error deleting documents from MongoDB for {document_id}: {e}")
|
583
|
+
print(f"Error deleting documents from MongoDB for {document_id}: {e}")
|
560
584
|
|
561
585
|
return pinecone_deleted or mongo_deleted_count > 0
|
562
586
|
|
@@ -565,7 +589,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
565
589
|
document_id: str,
|
566
590
|
text: Optional[str] = None,
|
567
591
|
metadata: Optional[Dict[str, Any]] = None,
|
568
|
-
namespace: Optional[str] = None
|
592
|
+
namespace: Optional[str] = None,
|
569
593
|
) -> bool:
|
570
594
|
"""
|
571
595
|
Update an existing plain text document or metadata. Embeds using OpenAI.
|
@@ -580,8 +604,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
580
604
|
Returns:
|
581
605
|
True if successful.
|
582
606
|
"""
|
583
|
-
current_doc = self.mongo.find_one(
|
584
|
-
self.collection, {"document_id": document_id})
|
607
|
+
current_doc = self.mongo.find_one(self.collection, {"document_id": document_id})
|
585
608
|
if not current_doc:
|
586
609
|
print(f"Document {document_id} not found for update.")
|
587
610
|
return False
|
@@ -590,7 +613,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
590
613
|
print(f"Cannot update chunk {document_id} directly.")
|
591
614
|
return False
|
592
615
|
if current_doc.get("pdf_data") and text is not None:
|
593
|
-
print(
|
616
|
+
print("Cannot update PDF content via this method. Delete and re-add.")
|
594
617
|
return False
|
595
618
|
|
596
619
|
update_text = text is not None and not current_doc.get("pdf_data")
|
@@ -608,8 +631,9 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
608
631
|
if mongo_update: # Only update if there are changes
|
609
632
|
try:
|
610
633
|
update_result = self.mongo.update_one(
|
611
|
-
self.collection,
|
612
|
-
|
634
|
+
self.collection,
|
635
|
+
{"document_id": document_id},
|
636
|
+
{"$set": mongo_update},
|
613
637
|
)
|
614
638
|
mongo_updated = update_result.modified_count > 0
|
615
639
|
except Exception as e:
|
@@ -630,11 +654,19 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
630
654
|
return mongo_updated # Return based on Mongo success
|
631
655
|
|
632
656
|
# Prepare Pinecone metadata
|
633
|
-
final_metadata = {**current_doc, **
|
634
|
-
mongo_update} # Use updated data
|
657
|
+
final_metadata = {**current_doc, **mongo_update} # Use updated data
|
635
658
|
pinecone_metadata = {"document_id": document_id, "is_chunk": False}
|
636
659
|
for key, value in final_metadata.items():
|
637
|
-
if key not in [
|
660
|
+
if key not in [
|
661
|
+
"_id",
|
662
|
+
"content",
|
663
|
+
"pdf_data",
|
664
|
+
"created_at",
|
665
|
+
"updated_at",
|
666
|
+
"document_id",
|
667
|
+
"is_chunk",
|
668
|
+
"parent_document_id",
|
669
|
+
]:
|
638
670
|
pinecone_metadata[key] = value
|
639
671
|
if self.pinecone.use_reranking:
|
640
672
|
pinecone_metadata[self.pinecone.rerank_text_field] = text_content
|
@@ -642,14 +674,20 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
642
674
|
# Upsert vector to Pinecone
|
643
675
|
try:
|
644
676
|
await self.pinecone.upsert(
|
645
|
-
vectors=[
|
646
|
-
|
647
|
-
|
677
|
+
vectors=[
|
678
|
+
{
|
679
|
+
"id": document_id,
|
680
|
+
"values": embedding,
|
681
|
+
"metadata": pinecone_metadata,
|
682
|
+
}
|
683
|
+
],
|
684
|
+
namespace=namespace,
|
648
685
|
)
|
649
686
|
pinecone_updated = True
|
650
687
|
except Exception as e:
|
651
688
|
print(
|
652
|
-
f"Error upserting updated vector in Pinecone for {document_id}: {e}"
|
689
|
+
f"Error upserting updated vector in Pinecone for {document_id}: {e}"
|
690
|
+
)
|
653
691
|
# Mongo update succeeded, Pinecone failed
|
654
692
|
|
655
693
|
return mongo_updated or pinecone_updated
|
@@ -659,7 +697,7 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
659
697
|
# Expects {'text': ..., 'metadata': ...}
|
660
698
|
documents: List[Dict[str, Any]],
|
661
699
|
namespace: Optional[str] = None,
|
662
|
-
batch_size: int = 50
|
700
|
+
batch_size: int = 50,
|
663
701
|
) -> List[str]:
|
664
702
|
"""
|
665
703
|
Add multiple plain text documents in batches using OpenAI embeddings.
|
@@ -676,12 +714,14 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
676
714
|
embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
|
677
715
|
|
678
716
|
for i in range(0, len(documents), batch_size):
|
679
|
-
batch_docs_input = documents[i:i+batch_size]
|
680
|
-
batch_texts = [doc[
|
681
|
-
batch_metadatas = [doc[
|
717
|
+
batch_docs_input = documents[i : i + batch_size]
|
718
|
+
batch_texts = [doc["text"] for doc in batch_docs_input]
|
719
|
+
batch_metadatas = [doc["metadata"] for doc in batch_docs_input]
|
682
720
|
# Generate IDs if not provided in metadata
|
683
|
-
batch_doc_ids = [
|
684
|
-
uuid.uuid4())
|
721
|
+
batch_doc_ids = [
|
722
|
+
doc["metadata"].get("document_id") or str(uuid.uuid4())
|
723
|
+
for doc in batch_docs_input
|
724
|
+
]
|
685
725
|
all_doc_ids.extend(batch_doc_ids)
|
686
726
|
|
687
727
|
# Prepare MongoDB docs
|
@@ -690,14 +730,19 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
690
730
|
doc_id = batch_doc_ids[idx]
|
691
731
|
metadata = batch_metadatas[idx]
|
692
732
|
mongo_doc = {
|
693
|
-
"document_id": doc_id,
|
694
|
-
"
|
695
|
-
"
|
696
|
-
"
|
733
|
+
"document_id": doc_id,
|
734
|
+
"content": text,
|
735
|
+
"is_chunk": False,
|
736
|
+
"parent_document_id": None,
|
737
|
+
**metadata,
|
738
|
+
"created_at": metadata.get(
|
739
|
+
"created_at", dt.now(tz=dt.now().astimezone().tzinfo)
|
740
|
+
),
|
741
|
+
"updated_at": dt.now(tz=dt.now().astimezone().tzinfo),
|
697
742
|
}
|
698
743
|
# Ensure generated ID is in the doc for Mongo
|
699
|
-
if
|
700
|
-
mongo_doc[
|
744
|
+
if "document_id" not in metadata:
|
745
|
+
mongo_doc["document_id"] = doc_id
|
701
746
|
mongo_batch.append(mongo_doc)
|
702
747
|
|
703
748
|
# Insert into MongoDB
|
@@ -706,16 +751,20 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
706
751
|
self.mongo.insert_many(self.collection, mongo_batch)
|
707
752
|
except Exception as e:
|
708
753
|
print(
|
709
|
-
f"Error inserting batch {i//batch_size + 1} into MongoDB: {e}"
|
754
|
+
f"Error inserting batch {i // batch_size + 1} into MongoDB: {e}"
|
755
|
+
)
|
710
756
|
# Decide if we should skip Pinecone for this batch
|
711
757
|
continue # Skip to next batch
|
712
758
|
|
713
759
|
# Embed batch using OpenAIEmbedding
|
714
760
|
try:
|
715
|
-
batch_embeddings = await embed_model.aget_text_embedding_batch(
|
761
|
+
batch_embeddings = await embed_model.aget_text_embedding_batch(
|
762
|
+
batch_texts, show_progress=True
|
763
|
+
)
|
716
764
|
except Exception as e:
|
717
765
|
print(
|
718
|
-
f"Error embedding batch {i//batch_size + 1} using {self.openai_model_name}: {e}"
|
766
|
+
f"Error embedding batch {i // batch_size + 1} using {self.openai_model_name}: {e}"
|
767
|
+
)
|
719
768
|
continue # Skip Pinecone upsert for this batch
|
720
769
|
|
721
770
|
# Prepare Pinecone vectors
|
@@ -723,29 +772,32 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
723
772
|
for idx, doc_id in enumerate(batch_doc_ids):
|
724
773
|
metadata = batch_metadatas[idx]
|
725
774
|
pinecone_meta = {
|
726
|
-
"document_id": doc_id,
|
775
|
+
"document_id": doc_id,
|
776
|
+
"is_chunk": False,
|
727
777
|
"source": metadata.get("source", "unknown"),
|
728
|
-
"tags": metadata.get("tags", [])
|
778
|
+
"tags": metadata.get("tags", []),
|
729
779
|
}
|
730
780
|
if self.pinecone.use_reranking:
|
731
781
|
pinecone_meta[self.pinecone.rerank_text_field] = batch_texts[idx]
|
732
782
|
|
733
|
-
pinecone_vectors.append(
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
783
|
+
pinecone_vectors.append(
|
784
|
+
{
|
785
|
+
"id": doc_id,
|
786
|
+
"values": batch_embeddings[idx],
|
787
|
+
"metadata": pinecone_meta,
|
788
|
+
}
|
789
|
+
)
|
738
790
|
|
739
791
|
# Upsert vectors to Pinecone
|
740
792
|
if pinecone_vectors:
|
741
793
|
try:
|
742
794
|
await self.pinecone.upsert(
|
743
|
-
vectors=pinecone_vectors,
|
744
|
-
namespace=namespace
|
795
|
+
vectors=pinecone_vectors, namespace=namespace
|
745
796
|
)
|
746
797
|
except Exception as e:
|
747
798
|
print(
|
748
|
-
f"Error upserting vector batch {i//batch_size + 1} to Pinecone: {e}"
|
799
|
+
f"Error upserting vector batch {i // batch_size + 1} to Pinecone: {e}"
|
800
|
+
)
|
749
801
|
|
750
802
|
# Optional delay
|
751
803
|
if i + batch_size < len(documents):
|
@@ -766,6 +818,5 @@ class KnowledgeBaseService(KnowledgeBaseInterface):
|
|
766
818
|
try:
|
767
819
|
return self.mongo.find_one(self.collection, {"document_id": document_id})
|
768
820
|
except Exception as e:
|
769
|
-
print(
|
770
|
-
f"Error retrieving full document {document_id} from MongoDB: {e}")
|
821
|
+
print(f"Error retrieving full document {document_id} from MongoDB: {e}")
|
771
822
|
return None
|