solana-agent 27.2.0__py3-none-any.whl → 27.3.1__py3-none-any.whl
This diff shows the changes between the publicly released 27.2.0 and 27.3.1 versions of the package, as they appear in their respective public registries.
- solana_agent/adapters/openai_adapter.py +59 -5
- solana_agent/adapters/pinecone_adapter.py +496 -0
- solana_agent/client/solana_agent.py +145 -1
- solana_agent/factories/agent_factory.py +73 -7
- solana_agent/interfaces/client/client.py +75 -5
- solana_agent/interfaces/providers/llm.py +20 -0
- solana_agent/interfaces/providers/vector_storage.py +59 -0
- solana_agent/interfaces/services/knowledge_base.py +86 -0
- solana_agent/services/knowledge_base.py +771 -0
- solana_agent/services/query.py +33 -2
- {solana_agent-27.2.0.dist-info → solana_agent-27.3.1.dist-info}/METADATA +177 -15
- {solana_agent-27.2.0.dist-info → solana_agent-27.3.1.dist-info}/RECORD +14 -10
- {solana_agent-27.2.0.dist-info → solana_agent-27.3.1.dist-info}/LICENSE +0 -0
- {solana_agent-27.2.0.dist-info → solana_agent-27.3.1.dist-info}/WHEEL +0 -0
solana_agent/services/knowledge_base.py
@@ -0,0 +1,771 @@
+from datetime import datetime as dt
+from typing import Dict, List, Any, Optional, Union
+import uuid
+import asyncio
+import io
+
+# PDF and Chunking Imports
+import pypdf
+from llama_index.core import Document as LlamaDocument
+from llama_index.core.node_parser import SemanticSplitterNodeParser
+from llama_index.embeddings.openai import OpenAIEmbedding
+
+from solana_agent.adapters.pinecone_adapter import PineconeAdapter
+from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
+from solana_agent.interfaces.services.knowledge_base import KnowledgeBaseService as KnowledgeBaseInterface
+
+
+class KnowledgeBaseService(KnowledgeBaseInterface):
+    """
+    Knowledge Base service using Pinecone for vector search and MongoDB for metadata/full document storage.
+    Supports text documents and PDF semantic chunking using OpenAI embeddings via LlamaIndex.
+    """
+
+    def __init__(
+        self,
+        pinecone_adapter: PineconeAdapter,
+        mongodb_adapter: MongoDBAdapter,
+        openai_api_key: str,
+        openai_model_name: str = "text-embedding-3-large",
+        collection_name: str = "knowledge_documents",
+        rerank_results: bool = False,
+        rerank_top_k: int = 3,
+        # Semantic Splitter Config
+        splitter_buffer_size: int = 1,
+        splitter_breakpoint_percentile: int = 95,
+    ):
+        """
+        Initialize the Knowledge Base service.
+
+        Args:
+            pinecone_adapter: Configured PineconeAdapter instance.
+            mongodb_adapter: Configured MongoDBAdapter instance.
+            openai_api_key: OpenAI API key for embedding.
+            openai_model_name: OpenAI embedding model name.
+            collection_name: MongoDB collection for storing document metadata and full PDFs.
+            rerank_results: Whether PineconeAdapter should rerank results.
+            rerank_top_k: Number of results to return after reranking (used by PineconeAdapter).
+            splitter_buffer_size: Buffer size for SemanticSplitterNodeParser.
+            splitter_breakpoint_percentile: Breakpoint percentile for SemanticSplitterNodeParser.
+        """
+        self.pinecone = pinecone_adapter
+        self.mongo = mongodb_adapter
+        self.collection = collection_name
+        # Store rerank config for reference, but PineconeAdapter applies it
+        self.rerank_results = rerank_results
+        self.rerank_top_k = rerank_top_k
+
+        # --- Initialize Semantic Splitter with OpenAIEmbedding ---
+        api_key = openai_api_key
+        if not api_key:
+            raise ValueError(
+                "OpenAI API key not provided via argument or OPENAI_API_KEY environment variable.")
+
+        # Determine expected embedding dimensions based on model name
+        if openai_model_name == "text-embedding-3-large":
+            openai_dimensions = 3072
+        elif openai_model_name == "text-embedding-3-small":
+            openai_dimensions = 1536
+        else:
+            # Attempt to get dimension from Pinecone config if available, otherwise raise error
+            openai_dimensions = getattr(
+                self.pinecone, 'embedding_dimensions', 0)
+            if openai_dimensions <= 0:
+                raise ValueError(
+                    f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured.")
+            print(
+                f"Warning: Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct.")
+
+        # Instantiate OpenAIEmbedding
+        # Note: LlamaIndex OpenAIEmbedding doesn't directly support reducing dimensions via 'dimensions' param during init
+        # like the raw OpenAI client does. It uses the model's default.
+        try:
+            llama_embed_model = OpenAIEmbedding(
+                model=openai_model_name,
+                api_key=api_key,
+                # embed_batch_size=10  # Optional: Adjust batch size if needed
+            )
+        except Exception as e:
+            print(f"Error initializing OpenAIEmbedding: {e}")
+            raise
+
+        self.semantic_splitter = SemanticSplitterNodeParser(
+            buffer_size=splitter_buffer_size,
+            breakpoint_percentile_threshold=splitter_breakpoint_percentile,
+            embed_model=llama_embed_model  # Use the OpenAIEmbedding instance
+        )
+        # Store model name for logging/reference
+        self.openai_model_name = openai_model_name
+        # --- End Semantic Splitter Init ---
+
+        self._ensure_collection()
+
+    def _ensure_collection(self) -> None:
+        """Set up MongoDB collection with appropriate indexes."""
+        if not self.mongo.collection_exists(self.collection):
+            self.mongo.create_collection(self.collection)
+            print(f"Created MongoDB collection: {self.collection}")
+
+        # Indexes for retrieval and filtering
+        self.mongo.create_index(
+            self.collection, [("document_id", 1)], unique=True)
+        self.mongo.create_index(self.collection, [("parent_document_id", 1)])
+        self.mongo.create_index(self.collection, [("source", 1)])
+        self.mongo.create_index(self.collection, [("created_at", -1)])
+        self.mongo.create_index(self.collection, [("tags", 1)])
+        self.mongo.create_index(self.collection, [("is_chunk", 1)])
+        print(
+            f"Ensured indexes exist for MongoDB collection: {self.collection}")
+
+    async def add_document(
+        self,
+        text: str,
+        metadata: Dict[str, Any],
+        document_id: Optional[str] = None,
+        namespace: Optional[str] = None
+    ) -> str:
+        """
+        Add a plain text document to the knowledge base. Embeds using OpenAI.
+
+        Args:
+            text: Document text content.
+            metadata: Document metadata.
+            document_id: Optional document ID.
+            namespace: Optional Pinecone namespace.
+
+        Returns:
+            The document ID.
+        """
+        doc_id = document_id or str(uuid.uuid4())
+
+        # Store metadata and content in MongoDB
+        mongo_doc = {
+            "document_id": doc_id,
+            "content": text,
+            "is_chunk": False,
+            "parent_document_id": None,
+            **metadata,
+            # Use timezone aware datetime
+            "created_at": metadata.get("created_at", dt.now(tz=dt.now().astimezone().tzinfo)),
+            "updated_at": dt.now(tz=dt.now().astimezone().tzinfo)
+        }
+        try:
+            self.mongo.insert_one(self.collection, mongo_doc)
+        except Exception as e:
+            print(f"Error inserting document {doc_id} into MongoDB: {e}")
+            raise
+
+        # Embed text using OpenAIEmbedding
+        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+        try:
+            embedding = await embed_model.aget_text_embedding(text)
+        except Exception as e:
+            print(
+                f"Error embedding document {doc_id} using {self.openai_model_name}: {e}")
+            # Decide how to handle - Mongo insert succeeded, embedding failed
+            raise  # Re-raise for now
+
+        # Prepare Pinecone metadata
+        pinecone_metadata = {
+            "document_id": doc_id,
+            "is_chunk": False,
+            "source": metadata.get("source", "unknown"),
+            "tags": metadata.get("tags", [])
+        }
+        # Add text itself if Pinecone adapter reranking is used
+        if self.pinecone.use_reranking:
+            pinecone_metadata[self.pinecone.rerank_text_field] = text
+
+        # Upsert vector to Pinecone using the generic upsert method
+        try:
+            await self.pinecone.upsert(
+                vectors=[{"id": doc_id, "values": embedding,
+                          "metadata": pinecone_metadata}],
+                namespace=namespace
+            )
+        except Exception as e:
+            print(f"Error upserting vector for {doc_id} to Pinecone: {e}")
+            # Decide how to handle - Mongo insert succeeded, Pinecone failed
+            raise  # Re-raise for now
+
+        return doc_id
+
+    async def add_pdf_document(
+        self,
+        pdf_data: Union[bytes, str],
+        metadata: Dict[str, Any],
+        document_id: Optional[str] = None,
+        namespace: Optional[str] = None,
+        chunk_batch_size: int = 50
+    ) -> str:
+        """
+        Add a PDF document: performs semantic chunking using OpenAI embeddings,
+        stores the full PDF in Mongo, and upserts chunk vectors to Pinecone.
+
+        Args:
+            pdf_data: PDF content as bytes or a path to the PDF file.
+            metadata: Document metadata (applies to the parent PDF).
+            document_id: Optional parent document ID.
+            namespace: Optional Pinecone namespace for chunks.
+            chunk_batch_size: Batch size for embedding and upserting chunks.
+
+        Returns:
+            The parent document ID.
+        """
+        parent_doc_id = document_id or str(uuid.uuid4())
+        pdf_bytes: bytes
+
+        # --- 1. Read PDF and Extract Text ---
+        try:
+            if isinstance(pdf_data, str):
+                with open(pdf_data, "rb") as f:
+                    pdf_bytes = f.read()
+            elif isinstance(pdf_data, bytes):
+                pdf_bytes = pdf_data
+            else:
+                raise ValueError(
+                    "pdf_data must be bytes or a file path string.")
+
+            reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
+            extracted_text = "".join(
+                page.extract_text() or "" for page in reader.pages)
+            if not extracted_text.strip():
+                print(f"Warning: No text extracted from PDF {parent_doc_id}.")
+        except Exception as e:
+            print(
+                f"Error reading or extracting text from PDF {parent_doc_id}: {e}")
+            raise
+
+        # --- 2. Store Full PDF and Metadata in MongoDB ---
+        mongo_parent_doc = {
+            "document_id": parent_doc_id,
+            "content": extracted_text,
+            "pdf_data": pdf_bytes,
+            "is_chunk": False,
+            "parent_document_id": None,
+            **metadata,
+            "created_at": metadata.get("created_at", dt.now(tz=dt.now().astimezone().tzinfo)),
+            "updated_at": dt.now(tz=dt.now().astimezone().tzinfo)
+        }
+        try:
+            self.mongo.insert_one(self.collection, mongo_parent_doc)
+            print(f"Stored full PDF {parent_doc_id} in MongoDB.")
+        except Exception as e:  # pragma: no cover
+            print(
+                f"Error inserting parent PDF {parent_doc_id} into MongoDB: {e}")  # pragma: no cover
+            raise  # pragma: no cover
+
+        # --- 3. Semantic Chunking ---
+        if not extracted_text.strip():
+            print(
+                f"Skipping chunking for PDF {parent_doc_id} due to no extracted text.")
+            return parent_doc_id
+
+        try:
+            llama_doc = LlamaDocument(text=extracted_text)
+            # Run synchronous splitter in thread pool
+            nodes = await asyncio.to_thread(
+                self.semantic_splitter.get_nodes_from_documents,
+                [llama_doc]
+            )
+            print(
+                f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}.")
+        except Exception as e:
+            print(
+                f"Error during semantic chunking for PDF {parent_doc_id}: {e}")
+            raise
+
+        # --- 4. Embed Chunks and Batch Upsert to Pinecone ---
+        if not nodes:
+            return parent_doc_id  # No chunks generated
+
+        print(
+            f"Embedding {len(nodes)} chunks using {self.openai_model_name}...")
+        chunk_texts = [node.get_content() for node in nodes]
+        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+        all_chunk_embeddings = []
+
+        # Embed chunks in batches (using embed_model's internal batching)
+        try:
+            # Use aget_text_embedding_batch for async embedding
+            # Note: LlamaIndex OpenAIEmbedding might handle batch size internally.
+            # For a large number of nodes, consider explicit batching here if needed.
+            all_chunk_embeddings = await embed_model.aget_text_embedding_batch(
+                chunk_texts, show_progress=True
+            )
+        except Exception as e:
+            print(f"Error embedding chunks for PDF {parent_doc_id}: {e}")
+            raise  # Stop if embedding fails
+
+        print("Embedding complete. Preparing vectors for Pinecone.")
+        pinecone_vectors = []
+        for i, node in enumerate(nodes):
+            chunk_id = f"{parent_doc_id}_chunk_{i}"
+            chunk_metadata = {
+                "document_id": chunk_id,
+                "parent_document_id": parent_doc_id,
+                "chunk_index": i,
+                "is_chunk": True,
+                "source": metadata.get("source", "unknown"),
+                "tags": metadata.get("tags", [])
+            }
+            # Add chunk text itself if Pinecone adapter reranking is used
+            if self.pinecone.use_reranking:
+                chunk_metadata[self.pinecone.rerank_text_field] = chunk_texts[i]
+
+            pinecone_vectors.append({
+                "id": chunk_id,
+                "values": all_chunk_embeddings[i],
+                "metadata": chunk_metadata
+            })
+
+        # Upsert vectors in batches using the generic upsert method
+        print(
+            f"Upserting {len(pinecone_vectors)} vectors to Pinecone in batches of {chunk_batch_size}...")
+        upsert_tasks = []
+        for i in range(0, len(pinecone_vectors), chunk_batch_size):
+            batch_vectors = pinecone_vectors[i:i+chunk_batch_size]
+            # Create task for each batch upsert
+            upsert_tasks.append(
+                self.pinecone.upsert(
+                    vectors=batch_vectors,
+                    namespace=namespace
+                )
+            )
+            # Optional: Add a small delay between initiating tasks if rate limiting is a concern
+            # await asyncio.sleep(0.05)
+
+        # Run upsert tasks concurrently
+        results = await asyncio.gather(*upsert_tasks, return_exceptions=True)
+
+        # Check for errors during upsert
+        for idx, result in enumerate(results):
+            if isinstance(result, Exception):
+                print(
+                    f"Error upserting vector batch {idx + 1} to Pinecone: {result}")
+                # Decide on error handling: log, raise, etc.
+
+        print(f"Finished processing PDF {parent_doc_id}.")
+        return parent_doc_id
+
+    async def query(
+        self,
+        query_text: str,
+        filter: Optional[Dict[str, Any]] = None,
+        top_k: int = 5,
+        namespace: Optional[str] = None,
+        include_content: bool = True,
+        include_metadata: bool = True
+    ) -> List[Dict[str, Any]]:
+        """
+        Query the knowledge base using semantic search with OpenAI embeddings.
+
+        Args:
+            query_text: The query text.
+            filter: Optional Pinecone metadata filter.
+            top_k: Number of results to retrieve initially.
+            namespace: Optional Pinecone namespace.
+            include_content: Whether to include document/chunk content in results.
+            include_metadata: Whether to include document/chunk metadata in results.
+
+        Returns:
+            List of result dictionaries, potentially reranked by PineconeAdapter.
+        """
+        # Determine initial K for Pinecone query, considering reranking multiplier
+        initial_k = top_k
+        if self.pinecone.use_reranking:
+            initial_k = top_k * self.pinecone.initial_query_top_k_multiplier
+
+        # --- Embed Query using OpenAIEmbedding ---
+        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+        try:
+            query_vector = await embed_model.aget_query_embedding(query_text)
+        except Exception as e:
+            print(f"Error embedding query text '{query_text}': {e}")
+            return []
+
+        # --- Query Pinecone using the vector ---
+        try:
+            # Use the generic query method with the vector
+            # PineconeAdapter handles reranking internally if configured
+            pinecone_results = await self.pinecone.query(
+                vector=query_vector,
+                filter=filter,
+                top_k=initial_k,  # Fetch more initially if reranking
+                namespace=namespace,
+                include_values=False,  # Don't need embeddings back
+                include_metadata=True  # Need metadata for linking
+            )
+        except Exception as e:
+            print(f"Error querying Pinecone: {e}")
+            return []
+
+        if not pinecone_results:
+            return []
+
+        # Extract IDs, scores, and metadata from Pinecone results
+        # PineconeAdapter might have already reranked and truncated to final top_k
+        result_ids = [res['id'] for res in pinecone_results]
+        scores = {res['id']: res['score'] for res in pinecone_results}
+        pinecone_metadatas = {res['id']: res.get(
+            'metadata', {}) for res in pinecone_results}
+
+        # --- Fetch corresponding data from MongoDB ---
+        mongo_docs_map = {}
+        parent_ids_to_fetch = set()
+        for res_id in result_ids:
+            meta = pinecone_metadatas.get(res_id, {})
+            if meta.get("is_chunk"):
+                parent_id = meta.get("parent_document_id")
+                if parent_id:
+                    parent_ids_to_fetch.add(parent_id)
+            else:
+                # If it's not a chunk, its own ID might be in Mongo
+                parent_ids_to_fetch.add(res_id)
+
+        if parent_ids_to_fetch:
+            try:
+                mongo_docs = self.mongo.find(
+                    self.collection,
+                    {"document_id": {"$in": list(parent_ids_to_fetch)}}
+                )
+                mongo_docs_map = {doc["document_id"]: doc for doc in mongo_docs}
+            except Exception as e:
+                print(f"Error fetching documents from MongoDB: {e}")
+                # Proceed with potentially missing Mongo data
+
+        # --- Combine Results ---
+        results = []
+        for res_id in result_ids:  # Iterate in Pinecone's score order
+            pinecone_meta = pinecone_metadatas.get(res_id, {})
+            is_chunk = pinecone_meta.get("is_chunk", False)
+            parent_doc_id = pinecone_meta.get("parent_document_id")
+
+            # Determine which Mongo doc holds the relevant info
+            mongo_doc_for_meta = None
+            mongo_doc_for_content = None
+            if is_chunk and parent_doc_id:
+                mongo_doc_for_meta = mongo_docs_map.get(parent_doc_id)
+                mongo_doc_for_content = mongo_doc_for_meta  # Parent holds full content
+            else:  # Not a chunk
+                mongo_doc_for_meta = mongo_docs_map.get(res_id)
+                mongo_doc_for_content = mongo_doc_for_meta
+
+            result = {
+                "document_id": res_id,
+                "score": scores.get(res_id, 0.0),
+                "is_chunk": is_chunk,
+                "parent_document_id": parent_doc_id,
+            }
+
+            if include_content:
+                content = None
+                # Priority 1: Reranking field in Pinecone metadata (holds chunk text)
+                if self.pinecone.use_reranking and self.pinecone.rerank_text_field in pinecone_meta:
+                    content = pinecone_meta[self.pinecone.rerank_text_field]
+                # Priority 2: Get content from the relevant Mongo doc
+                elif mongo_doc_for_content:
+                    content = mongo_doc_for_content.get("content")
+                result["content"] = content or ""
+
+            if include_metadata:
+                combined_meta = {}
+                # Merge metadata from the relevant Mongo doc (parent or self)
+                if mongo_doc_for_meta:
+                    combined_meta = {
+                        k: v for k, v in mongo_doc_for_meta.items()
+                        if k not in ["_id", "document_id", "content", "pdf_data", "is_chunk", "parent_document_id", "created_at", "updated_at"]
+                    }
+                # Add/overwrite with chunk-specific info from Pinecone meta
+                combined_meta.update({
+                    k: v for k, v in pinecone_meta.items()
+                    # Avoid redundancy
+                    if k not in ["document_id", self.pinecone.rerank_text_field]
+                })
+                result["metadata"] = combined_meta
+
+            results.append(result)
+
+        return results
+
+    async def delete_document(
+        self,
+        document_id: str,
+        namespace: Optional[str] = None
+    ) -> bool:
+        """
+        Delete a document (plain text or PDF) and all its associated chunks.
+
+        Args:
+            document_id: ID of the parent document (or plain text document).
+            namespace: Optional Pinecone namespace.
+
+        Returns:
+            True if deletion was successful (or partially successful).
+        """
+        print(
+            f"Attempting to delete document and associated data for ID: {document_id}")
+        mongo_deleted_count = 0
+        pinecone_deleted = False
+
+        # --- 1. Find all associated document IDs in MongoDB ---
+        # This includes the parent doc and potentially chunk metadata if we stored it
+        # We primarily need the IDs to delete from Pinecone.
+        # A more robust way might be to query Pinecone directly for vectors with parent_document_id == document_id
+        # For now, assume IDs in Mongo cover what needs deletion.
+        docs_to_delete_mongo = []
+        mongo_ids_to_delete = set([document_id])  # Start with the main ID
+        try:
+            docs_to_delete_mongo = list(self.mongo.find(
+                self.collection,
+                {"$or": [{"document_id": document_id}, {
+                    "parent_document_id": document_id}]}
+            ))
+            for doc in docs_to_delete_mongo:
+                mongo_ids_to_delete.add(doc["document_id"])
+        except Exception as e:
+            print(
+                f"Warning: Error finding documents in MongoDB for deletion ({document_id}): {e}. Proceeding with main ID only.")
+
+        pinecone_ids_to_delete = list(mongo_ids_to_delete)
+
+        # --- 2. Delete from Pinecone ---
+        if pinecone_ids_to_delete:
+            try:
+                await self.pinecone.delete(ids=pinecone_ids_to_delete, namespace=namespace)
+                print(
+                    f"Deleted {len(pinecone_ids_to_delete)} vectors from Pinecone for parent {document_id}.")
+                pinecone_deleted = True
+            except Exception as e:
+                print(
+                    f"Error deleting vectors from Pinecone for {document_id}: {e}")
+
+        # --- 3. Delete from MongoDB ---
+        # Use the IDs confirmed to be in Mongo
+        mongo_ids_found_in_db = [doc["document_id"]
+                                 for doc in docs_to_delete_mongo]
+        if mongo_ids_found_in_db:
+            try:
+                delete_result = self.mongo.delete_many(
+                    self.collection,
+                    {"document_id": {"$in": mongo_ids_found_in_db}}
+                )
+                mongo_deleted_count = delete_result.deleted_count
+                print(
+                    f"Deleted {mongo_deleted_count} documents from MongoDB for parent {document_id}.")
+            except Exception as e:
+                print(
+                    f"Error deleting documents from MongoDB for {document_id}: {e}")
+
+        return pinecone_deleted or mongo_deleted_count > 0
+
+    async def update_document(
+        self,
+        document_id: str,
+        text: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        namespace: Optional[str] = None
+    ) -> bool:
+        """
+        Update an existing plain text document or metadata. Embeds using OpenAI.
+        Updating PDF content requires deleting and re-adding.
+
+        Args:
+            document_id: ID of document to update.
+            text: Optional new text content (for plain text docs only).
+            metadata: Optional metadata to update.
+            namespace: Optional Pinecone namespace.
+
+        Returns:
+            True if successful.
+        """
+        current_doc = self.mongo.find_one(
+            self.collection, {"document_id": document_id})
+        if not current_doc:
+            print(f"Document {document_id} not found for update.")
+            return False
+
+        if current_doc.get("is_chunk"):
+            print(f"Cannot update chunk {document_id} directly.")
+            return False
+        if current_doc.get("pdf_data") and text is not None:
+            print("Cannot update PDF content via this method. Delete and re-add.")
+            return False
+
+        update_text = text is not None and not current_doc.get("pdf_data")
+        text_content = text if update_text else current_doc.get("content", "")
+
+        # --- Update MongoDB ---
+        mongo_update = {}
+        if metadata:
+            mongo_update.update(metadata)
+        if update_text:
+            mongo_update["content"] = text_content
+        mongo_update["updated_at"] = dt.now(tz=dt.now().astimezone().tzinfo)
+
+        mongo_updated = False
+        if mongo_update:  # Only update if there are changes
+            try:
+                update_result = self.mongo.update_one(
+                    self.collection, {"document_id": document_id}, {
+                        "$set": mongo_update}
+                )
+                mongo_updated = update_result.modified_count > 0
+            except Exception as e:
+                print(f"Error updating document {document_id} in MongoDB: {e}")
+                # Decide if we should proceed to Pinecone update if Mongo failed
+                return False  # Return False if Mongo update fails
+
+        # --- Update Pinecone (only if text changed) ---
+        pinecone_updated = False
+        if update_text:
+            # Embed updated text
+            embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+            try:
+                embedding = await embed_model.aget_text_embedding(text_content)
+            except Exception as e:
+                print(f"Error embedding updated text for {document_id}: {e}")
+                # Mongo update might have succeeded, but embedding failed
+                return mongo_updated  # Return based on Mongo success
+
+            # Prepare Pinecone metadata
+            final_metadata = {**current_doc, **mongo_update}  # Use updated data
+            pinecone_metadata = {"document_id": document_id, "is_chunk": False}
+            for key, value in final_metadata.items():
+                if key not in ["_id", "content", "pdf_data", "created_at", "updated_at", "document_id", "is_chunk", "parent_document_id"]:
+                    pinecone_metadata[key] = value
+            if self.pinecone.use_reranking:
+                pinecone_metadata[self.pinecone.rerank_text_field] = text_content
+
+            # Upsert vector to Pinecone
+            try:
+                await self.pinecone.upsert(
+                    vectors=[{"id": document_id, "values": embedding,
+                              "metadata": pinecone_metadata}],
+                    namespace=namespace
+                )
+                pinecone_updated = True
+            except Exception as e:
+                print(
+                    f"Error upserting updated vector in Pinecone for {document_id}: {e}")
+                # Mongo update succeeded, Pinecone failed
+
+        return mongo_updated or pinecone_updated
+
+    async def add_documents_batch(
+        self,
+        # Expects {'text': ..., 'metadata': ...}
+        documents: List[Dict[str, Any]],
+        namespace: Optional[str] = None,
+        batch_size: int = 50
+    ) -> List[str]:
+        """
+        Add multiple plain text documents in batches using OpenAI embeddings.
+
+        Args:
+            documents: List of documents, each with 'text' and 'metadata'.
+            namespace: Optional Pinecone namespace.
+            batch_size: Number of documents per embedding/upsert batch.
+
+        Returns:
+            List of added document IDs.
+        """
+        all_doc_ids = []
+        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
+
+        for i in range(0, len(documents), batch_size):
+            batch_docs_input = documents[i:i+batch_size]
+            batch_texts = [doc['text'] for doc in batch_docs_input]
+            batch_metadatas = [doc['metadata'] for doc in batch_docs_input]
+            # Generate IDs if not provided in metadata
+            batch_doc_ids = [doc['metadata'].get('document_id') or str(
+                uuid.uuid4()) for doc in batch_docs_input]
+            all_doc_ids.extend(batch_doc_ids)
+
+            # Prepare MongoDB docs
+            mongo_batch = []
+            for idx, text in enumerate(batch_texts):
+                doc_id = batch_doc_ids[idx]
+                metadata = batch_metadatas[idx]
+                mongo_doc = {
+                    "document_id": doc_id, "content": text, "is_chunk": False,
+                    "parent_document_id": None, **metadata,
+                    "created_at": metadata.get("created_at", dt.now(tz=dt.now().astimezone().tzinfo)),
+                    "updated_at": dt.now(tz=dt.now().astimezone().tzinfo)
+                }
+                # Ensure generated ID is in the doc for Mongo
+                if 'document_id' not in metadata:
+                    mongo_doc['document_id'] = doc_id
+                mongo_batch.append(mongo_doc)
+
+            # Insert into MongoDB
+            if mongo_batch:
+                try:
+                    self.mongo.insert_many(self.collection, mongo_batch)
+                except Exception as e:
+                    print(
+                        f"Error inserting batch {i//batch_size + 1} into MongoDB: {e}")
+                    # Decide if we should skip Pinecone for this batch
+                    continue  # Skip to next batch
+
+            # Embed batch using OpenAIEmbedding
+            try:
+                batch_embeddings = await embed_model.aget_text_embedding_batch(batch_texts, show_progress=True)
+            except Exception as e:
+                print(
+                    f"Error embedding batch {i//batch_size + 1} using {self.openai_model_name}: {e}")
+                continue  # Skip Pinecone upsert for this batch
+
+            # Prepare Pinecone vectors
+            pinecone_vectors = []
+            for idx, doc_id in enumerate(batch_doc_ids):
+                metadata = batch_metadatas[idx]
+                pinecone_meta = {
+                    "document_id": doc_id, "is_chunk": False,
+                    "source": metadata.get("source", "unknown"),
+                    "tags": metadata.get("tags", [])
+                }
+                if self.pinecone.use_reranking:
+                    pinecone_meta[self.pinecone.rerank_text_field] = batch_texts[idx]
+
+                pinecone_vectors.append({
+                    "id": doc_id,
+                    "values": batch_embeddings[idx],
+                    "metadata": pinecone_meta
+                })
+
+            # Upsert vectors to Pinecone
+            if pinecone_vectors:
+                try:
+                    await self.pinecone.upsert(
+                        vectors=pinecone_vectors,
+                        namespace=namespace
+                    )
+                except Exception as e:
+                    print(
+                        f"Error upserting vector batch {i//batch_size + 1} to Pinecone: {e}")
+
+            # Optional delay
+            if i + batch_size < len(documents):
+                await asyncio.sleep(0.1)
+
+        return all_doc_ids
+
+    async def get_full_document(self, document_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve the full document entry (including PDF data if applicable) from MongoDB.
+
+        Args:
+            document_id: The ID of the document (parent ID if it was a PDF).
+
+        Returns:
+            The document dictionary from MongoDB, or None if not found.
+        """
+        try:
+            return self.mongo.find_one(self.collection, {"document_id": document_id})
+        except Exception as e:
+            print(
+                f"Error retrieving full document {document_id} from MongoDB: {e}")
+            return None
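
To make the new service concrete, here is a minimal usage sketch based on the method signatures in the file above. It is illustrative only: the PineconeAdapter and MongoDBAdapter constructor arguments are elided with `...` because their configuration is not part of this hunk, and the API key is a placeholder.

import asyncio

from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
from solana_agent.adapters.pinecone_adapter import PineconeAdapter
from solana_agent.services.knowledge_base import KnowledgeBaseService


async def main():
    # Adapter construction elided -- their arguments are not shown in this diff.
    pinecone = PineconeAdapter(...)  # hypothetical arguments
    mongo = MongoDBAdapter(...)      # hypothetical arguments

    kb = KnowledgeBaseService(
        pinecone_adapter=pinecone,
        mongodb_adapter=mongo,
        openai_api_key="sk-...",  # placeholder
        openai_model_name="text-embedding-3-large",
    )

    # Plain text document: content stored in MongoDB, one vector upserted to Pinecone.
    doc_id = await kb.add_document(
        text="Solana Agent ships a knowledge base backed by Pinecone and MongoDB.",
        metadata={"source": "docs", "tags": ["kb"]},
    )

    # PDF document: full bytes stored in MongoDB, semantic chunks vectorized in Pinecone.
    pdf_id = await kb.add_pdf_document(
        pdf_data="whitepaper.pdf",  # bytes or a file path
        metadata={"source": "whitepaper"},
    )

    # Semantic search across plain documents and PDF chunks.
    for hit in await kb.query("What backs the knowledge base?", top_k=3):
        print(hit["document_id"], hit["score"], hit["content"][:80])

    # Deleting a parent document also removes its chunk vectors.
    await kb.delete_document(doc_id)
    await kb.delete_document(pdf_id)


asyncio.run(main())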