solana-agent 20.1.2__py3-none-any.whl → 31.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solana_agent/__init__.py +10 -5
- solana_agent/adapters/ffmpeg_transcoder.py +375 -0
- solana_agent/adapters/mongodb_adapter.py +15 -2
- solana_agent/adapters/openai_adapter.py +679 -0
- solana_agent/adapters/openai_realtime_ws.py +1813 -0
- solana_agent/adapters/pinecone_adapter.py +543 -0
- solana_agent/cli.py +128 -0
- solana_agent/client/solana_agent.py +180 -20
- solana_agent/domains/agent.py +13 -13
- solana_agent/domains/routing.py +18 -8
- solana_agent/factories/agent_factory.py +239 -38
- solana_agent/guardrails/pii.py +107 -0
- solana_agent/interfaces/client/client.py +95 -12
- solana_agent/interfaces/guardrails/guardrails.py +26 -0
- solana_agent/interfaces/plugins/plugins.py +2 -1
- solana_agent/interfaces/providers/__init__.py +0 -0
- solana_agent/interfaces/providers/audio.py +40 -0
- solana_agent/interfaces/providers/data_storage.py +9 -2
- solana_agent/interfaces/providers/llm.py +86 -9
- solana_agent/interfaces/providers/memory.py +13 -1
- solana_agent/interfaces/providers/realtime.py +212 -0
- solana_agent/interfaces/providers/vector_storage.py +53 -0
- solana_agent/interfaces/services/agent.py +27 -12
- solana_agent/interfaces/services/knowledge_base.py +59 -0
- solana_agent/interfaces/services/query.py +41 -8
- solana_agent/interfaces/services/routing.py +0 -1
- solana_agent/plugins/manager.py +37 -16
- solana_agent/plugins/registry.py +34 -19
- solana_agent/plugins/tools/__init__.py +0 -5
- solana_agent/plugins/tools/auto_tool.py +1 -0
- solana_agent/repositories/memory.py +332 -111
- solana_agent/services/__init__.py +1 -1
- solana_agent/services/agent.py +390 -241
- solana_agent/services/knowledge_base.py +768 -0
- solana_agent/services/query.py +1858 -153
- solana_agent/services/realtime.py +626 -0
- solana_agent/services/routing.py +104 -51
- solana_agent-31.4.0.dist-info/METADATA +1070 -0
- solana_agent-31.4.0.dist-info/RECORD +49 -0
- {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info}/WHEEL +1 -1
- solana_agent-31.4.0.dist-info/entry_points.txt +3 -0
- solana_agent/adapters/llm_adapter.py +0 -160
- solana_agent-20.1.2.dist-info/METADATA +0 -464
- solana_agent-20.1.2.dist-info/RECORD +0 -35
- {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info/licenses}/LICENSE +0 -0
solana_agent/services/knowledge_base.py (new file)
@@ -0,0 +1,768 @@
import logging
from datetime import datetime as dt
from typing import Dict, List, Any, Optional, Union
import uuid
import asyncio
import io

# PDF and Chunking Imports
import pypdf
from llama_index.core import Document as LlamaDocument
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding

from solana_agent.adapters.pinecone_adapter import PineconeAdapter
from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
from solana_agent.interfaces.services.knowledge_base import (
    KnowledgeBaseService as KnowledgeBaseInterface,
)

# Setup logger for this module
logger = logging.getLogger(__name__)


class KnowledgeBaseService(KnowledgeBaseInterface):
    """
    Knowledge Base service using Pinecone for vector search and MongoDB for metadata/chunk storage.
    Supports text documents and PDF semantic chunking using OpenAI embeddings via LlamaIndex.
    PDF binary data is not stored. Chunks are stored individually in MongoDB.
    """

    def __init__(
        self,
        pinecone_adapter: PineconeAdapter,
        mongodb_adapter: MongoDBAdapter,
        openai_api_key: str,
        openai_model_name: str = "text-embedding-3-large",
        collection_name: str = "knowledge_documents",
        rerank_results: bool = False,
        rerank_top_k: int = 3,
        # Semantic Splitter Config
        splitter_buffer_size: int = 1,
        splitter_breakpoint_percentile: int = 95,
    ):
        """
        Initialize the Knowledge Base service.

        Args:
            pinecone_adapter: Configured PineconeAdapter instance.
            mongodb_adapter: Configured MongoDBAdapter instance.
            openai_api_key: OpenAI API key for embedding.
            openai_model_name: OpenAI embedding model name.
            collection_name: MongoDB collection for storing document metadata and chunks.
            rerank_results: Whether PineconeAdapter should rerank results.
            rerank_top_k: Number of results to return after reranking (used by PineconeAdapter).
            splitter_buffer_size: Buffer size for SemanticSplitterNodeParser.
            splitter_breakpoint_percentile: Breakpoint percentile for SemanticSplitterNodeParser.
        """
        self.pinecone = pinecone_adapter
        self.mongo = mongodb_adapter
        self.collection = collection_name
        # Store rerank config for reference, but PineconeAdapter applies it
        self.rerank_results = rerank_results
        self.rerank_top_k = rerank_top_k

        # --- Initialize Semantic Splitter with OpenAIEmbedding ---
        api_key = openai_api_key
        if not api_key:
            raise ValueError(
                "OpenAI API key not provided via argument or OPENAI_API_KEY environment variable."
            )

        # Determine expected embedding dimensions based on model name
        if openai_model_name == "text-embedding-3-large":
            openai_dimensions = 3072
        elif openai_model_name == "text-embedding-3-small":
            openai_dimensions = 1536
        else:
            # Attempt to get dimension from Pinecone config if available, otherwise raise error
            openai_dimensions = getattr(self.pinecone, "embedding_dimensions", 0)
            if openai_dimensions <= 0:
                raise ValueError(
                    f"Cannot determine dimension for unknown OpenAI model '{openai_model_name}' and Pinecone dimension not configured."
                )
            logger.warning(  # Use logger.warning
                f"Unknown OpenAI model '{openai_model_name}'. Using dimension {openai_dimensions} from Pinecone config. Ensure this is correct."
            )

        # Instantiate OpenAIEmbedding
        # Note: LlamaIndex OpenAIEmbedding doesn't directly support reducing dimensions via 'dimensions' param during init
        # like the raw OpenAI client does. It uses the model's default.
        try:
            llama_embed_model = OpenAIEmbedding(
                model=openai_model_name,
                api_key=api_key,
                # embed_batch_size=10 # Optional: Adjust batch size if needed
            )
        except Exception as e:
            logger.error(f"Error initializing OpenAIEmbedding: {e}")  # Use logger.error
            raise

        self.semantic_splitter = SemanticSplitterNodeParser(
            buffer_size=splitter_buffer_size,
            breakpoint_percentile_threshold=splitter_breakpoint_percentile,
            embed_model=llama_embed_model,  # Use the OpenAIEmbedding instance
        )
        # Store model name for logging/reference
        self.openai_model_name = openai_model_name
        # --- End Semantic Splitter Init ---

        self._ensure_collection()

    def _ensure_collection(self) -> None:
        """Set up MongoDB collection with appropriate indexes."""
        if not self.mongo.collection_exists(self.collection):
            self.mongo.create_collection(self.collection)
            logger.info(
                f"Created MongoDB collection: {self.collection}"
            )  # Use logger.info

        # Indexes for retrieval and filtering
        self.mongo.create_index(self.collection, [("document_id", 1)], unique=True)
        self.mongo.create_index(self.collection, [("parent_document_id", 1)])
        self.mongo.create_index(self.collection, [("source", 1)])
        self.mongo.create_index(self.collection, [("created_at", -1)])
        self.mongo.create_index(self.collection, [("tags", 1)])
        self.mongo.create_index(self.collection, [("is_chunk", 1)])
        logger.info(
            f"Ensured indexes exist for MongoDB collection: {self.collection}"
        )  # Use logger.info

    async def add_document(
        self,
        text: str,
        metadata: Dict[str, Any],
        document_id: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> str:  # pragma: no cover
        """
        Add a plain text document to the knowledge base. Embeds using OpenAI.

        Args:
            text: Document text content.
            metadata: Document metadata.
            document_id: Optional document ID.
            namespace: Optional Pinecone namespace.

        Returns:
            The document ID.
        """
        doc_id = document_id or str(uuid.uuid4())
        now = dt.now(tz=dt.now().astimezone().tzinfo)

        # Store metadata and content in MongoDB
        mongo_doc = {
            "document_id": doc_id,
            "content": text,
            "is_chunk": False,
            "parent_document_id": None,
            **metadata,
            # Use timezone aware datetime
            "created_at": metadata.get("created_at", now),
            "updated_at": now,
        }
        try:
            self.mongo.insert_one(self.collection, mongo_doc)
        except Exception as e:
            logger.error(
                f"Error inserting document {doc_id} into MongoDB: {e}"
            )  # Use logger.error
            raise

        # Embed text using OpenAIEmbedding
        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
        try:
            embedding = await embed_model.aget_text_embedding(text)
        except Exception as e:
            logger.error(  # Use logger.error
                f"Error embedding document {doc_id} using {self.openai_model_name}: {e}"
            )
            # Decide how to handle - Mongo insert succeeded, embedding failed
            raise  # Re-raise for now

        # Prepare Pinecone metadata
        pinecone_metadata = {
            "document_id": doc_id,
            "is_chunk": False,
            "parent_document_id": False,  # Explicitly set for clarity - Pinecone can't use None
            "source": metadata.get("source", "unknown"),
            "tags": metadata.get("tags", []),
        }
        # Add text itself if Pinecone adapter reranking is used
        if self.pinecone.use_reranking:
            pinecone_metadata[self.pinecone.rerank_text_field] = text

        # Upsert vector to Pinecone using the generic upsert method
        try:
            await self.pinecone.upsert(
                vectors=[
                    {"id": doc_id, "values": embedding, "metadata": pinecone_metadata}
                ],
                namespace=namespace,
            )
        except Exception as e:
            logger.error(
                f"Error upserting vector for {doc_id} to Pinecone: {e}"
            )  # Use logger.error
            # Decide how to handle - Mongo insert succeeded, Pinecone failed
            raise  # Re-raise for now

        return doc_id

    async def add_pdf_document(
        self,
        pdf_data: Union[bytes, str],
        metadata: Dict[str, Any],
        document_id: Optional[str] = None,
        namespace: Optional[str] = None,
        chunk_batch_size: int = 50,
    ) -> str:  # pragma: no cover
        """
        Add a PDF document, performs semantic chunking using OpenAI embeddings,
        stores parent metadata and individual chunks in Mongo, and chunk vectors in Pinecone.
        Full PDF binary is NOT stored.

        Args:
            pdf_data: PDF content as bytes or a path to the PDF file.
            metadata: Document metadata (applies to the parent PDF).
            document_id: Optional parent document ID.
            namespace: Optional Pinecone namespace for chunks.
            chunk_batch_size: Batch size for embedding and upserting chunks.

        Returns:
            The parent document ID.
        """
        parent_doc_id = document_id or str(uuid.uuid4())
        pdf_bytes: bytes
        now = dt.now(tz=dt.now().astimezone().tzinfo)

        # --- 1. Read PDF and Extract Text ---
        try:
            if isinstance(pdf_data, str):
                with open(pdf_data, "rb") as f:
                    pdf_bytes = f.read()
            elif isinstance(pdf_data, bytes):
                pdf_bytes = pdf_data
            else:
                raise ValueError("pdf_data must be bytes or a file path string.")

            reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
            extracted_text = "".join(page.extract_text() or "" for page in reader.pages)
            if not extracted_text.strip():
                logger.warning(
                    f"No text extracted from PDF {parent_doc_id}."
                )  # Use logger.warning
                # Still store parent metadata even if no text
        except Exception as e:
            logger.error(
                f"Error reading or extracting text from PDF {parent_doc_id}: {e}"
            )  # Use logger.error
            raise

        # --- 2. Store Parent PDF Metadata in MongoDB (NO BINARY) ---
        mongo_parent_doc = {
            "document_id": parent_doc_id,
            "content": None,
            "is_chunk": False,
            "parent_document_id": None,
            **metadata,
            "created_at": metadata.get("created_at", now),
            "updated_at": now,
        }
        try:
            self.mongo.insert_one(self.collection, mongo_parent_doc)
            logger.info(
                f"Stored parent metadata for PDF {parent_doc_id} in MongoDB."
            )  # Use logger.info
        except Exception as e:
            logger.error(
                f"Error inserting parent PDF metadata {parent_doc_id} into MongoDB: {e}"
            )
            raise

        # --- 3. Semantic Chunking ---
        if not extracted_text.strip():
            logger.info(  # Use logger.info
                f"Skipping chunking for PDF {parent_doc_id} due to no extracted text."
            )
            return parent_doc_id  # Return parent ID even if no chunks

        try:
            llama_doc = LlamaDocument(text=extracted_text)
            # Run synchronous splitter in thread pool
            nodes = await asyncio.to_thread(
                self.semantic_splitter.get_nodes_from_documents, [llama_doc]
            )
            logger.info(
                f"Generated {len(nodes)} semantic chunks for PDF {parent_doc_id}."
            )  # Use logger.info
        except Exception as e:
            logger.error(
                f"Error during semantic chunking for PDF {parent_doc_id}: {e}"
            )  # Use logger.error
            # Parent metadata is already stored, decide how to proceed. Raising for now.
            raise

        # --- 4. Embed Chunks and Batch Upsert to Pinecone AND Store Chunks in MongoDB ---
        if not nodes:
            return parent_doc_id  # No chunks generated

        logger.info(
            f"Embedding {len(nodes)} chunks using {self.openai_model_name}..."
        )  # Use logger.info
        chunk_texts = [node.get_content() for node in nodes]
        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
        all_chunk_embeddings = []

        # Embed chunks in batches (using embed_model's internal batching)
        try:
            # Use aget_text_embedding_batch for async embedding
            all_chunk_embeddings = await embed_model.aget_text_embedding_batch(
                chunk_texts, show_progress=True
            )
        except Exception as e:
            logger.error(
                f"Error embedding chunks for PDF {parent_doc_id}: {e}"
            )  # Use logger.error
            raise  # Stop if embedding fails

        logger.info(
            "Embedding complete. Preparing vectors for Pinecone and documents for MongoDB."
        )  # Use logger.info
        pinecone_vectors = []
        mongo_chunk_docs = []
        chunk_now = dt.now(
            tz=dt.now().astimezone().tzinfo
        )  # Consistent timestamp for chunks

        for i, node in enumerate(nodes):
            chunk_id = f"{parent_doc_id}_chunk_{i}"
            chunk_text = chunk_texts[i]

            # Prepare Pinecone Vector Metadata
            pinecone_chunk_metadata = {
                "document_id": chunk_id,  # Pinecone ID is the chunk ID
                "parent_document_id": parent_doc_id,
                "chunk_index": i,
                "is_chunk": True,
                "source": metadata.get("source", "unknown"),  # Inherit from parent
                "tags": metadata.get("tags", []),  # Inherit from parent
            }
            # Add chunk text itself if Pinecone adapter reranking is used
            if self.pinecone.use_reranking:
                pinecone_chunk_metadata[self.pinecone.rerank_text_field] = chunk_text

            pinecone_vectors.append(
                {
                    "id": chunk_id,
                    "values": all_chunk_embeddings[i],
                    "metadata": pinecone_chunk_metadata,
                }
            )

            # Prepare MongoDB Chunk Document
            mongo_chunk_doc = {
                "document_id": chunk_id,  # Mongo ID is the chunk ID
                "parent_document_id": parent_doc_id,
                "chunk_index": i,
                "is_chunk": True,
                "content": chunk_text,  # Store chunk text in Mongo
                "source": metadata.get("source", "unknown"),  # Inherit from parent
                "tags": metadata.get("tags", []),  # Inherit from parent
                # Add other relevant parent metadata if needed, avoid duplication if possible
                "created_at": chunk_now,  # Use consistent time for batch
                "updated_at": chunk_now,
            }
            mongo_chunk_docs.append(mongo_chunk_doc)

        # --- 5. Store Chunks in MongoDB ---
        if mongo_chunk_docs:
            try:
                self.mongo.insert_many(self.collection, mongo_chunk_docs)
                logger.info(
                    f"Stored {len(mongo_chunk_docs)} chunks in MongoDB for parent {parent_doc_id}."
                )
            except Exception as e:
                logger.error(
                    f"Error inserting chunks into MongoDB for parent {parent_doc_id}: {e}"
                )
                # Decide how to handle: Pinecone upsert might still proceed or fail.
                # For now, log the error and continue to Pinecone upsert attempt.

        # --- 6. Upsert Chunk Vectors to Pinecone in Batches ---
        if not pinecone_vectors:
            logger.warning(f"No vectors generated to upsert for PDF {parent_doc_id}.")
            return parent_doc_id

        logger.info(  # Use logger.info
            f"Upserting {len(pinecone_vectors)} chunk vectors to Pinecone in batches of {chunk_batch_size}..."
        )
        upsert_tasks = []
        for i in range(0, len(pinecone_vectors), chunk_batch_size):
            batch_vectors = pinecone_vectors[i : i + chunk_batch_size]
            # Create task for each batch upsert
            upsert_tasks.append(
                self.pinecone.upsert(vectors=batch_vectors, namespace=namespace)
            )
            # Optional: Add a small delay between initiating tasks if rate limiting is a concern
            # await asyncio.sleep(0.05)

        # Run upsert tasks concurrently
        results = await asyncio.gather(*upsert_tasks, return_exceptions=True)

        # Check for errors during upsert
        upsert_errors = False
        for idx, result in enumerate(results):
            if isinstance(result, Exception):
                upsert_errors = True
                logger.error(
                    f"Error upserting vector batch {idx + 1} to Pinecone for parent {parent_doc_id}: {result}"
                )  # Use logger.error
                # Decide on error handling: log, raise, etc. Consider cleanup?

        if upsert_errors:
            logger.warning(
                f"Some errors occurred during Pinecone vector upsert for {parent_doc_id}."
            )
            # Consider if partial success requires specific handling or cleanup

        logger.info(f"Finished processing PDF {parent_doc_id}.")  # Use logger.info
        return parent_doc_id

    async def query(
        self,
        query_text: str,
        filter: Optional[Dict[str, Any]] = None,
        top_k: int = 5,
        namespace: Optional[str] = None,
        include_content: bool = True,
        include_metadata: bool = True,
    ) -> List[Dict[str, Any]]:  # pragma: no cover
        """
        Query the knowledge base using semantic search with OpenAI embeddings.
        Retrieves chunk or document content and metadata from MongoDB based on Pinecone results.

        Args:
            query_text: The query text.
            filter: Optional Pinecone metadata filter.
            top_k: Number of results to retrieve initially.
            namespace: Optional Pinecone namespace.
            include_content: Whether to include document/chunk content in results.
            include_metadata: Whether to include document/chunk metadata in results.

        Returns:
            List of result dictionaries, potentially reranked by PineconeAdapter.
        """
        # Determine initial K for Pinecone query, considering reranking multiplier
        initial_k = top_k
        if self.pinecone.use_reranking:
            initial_k = top_k * self.pinecone.initial_query_top_k_multiplier

        # --- Embed Query using OpenAIEmbedding ---
        embed_model: OpenAIEmbedding = self.semantic_splitter.embed_model
        try:
            query_vector = await embed_model.aget_query_embedding(query_text)
        except Exception as e:
            logger.error(
                f"Error embedding query text '{query_text}': {e}"
            )  # Use logger.error
            return []

        # --- Query Pinecone using the vector ---
        try:
            # Use the generic query method with the vector
            # PineconeAdapter handles reranking internally if configured
            pinecone_results = await self.pinecone.query(
                vector=query_vector,
                filter=filter,
                top_k=initial_k,  # Fetch more initially if reranking
                namespace=namespace,
                include_values=False,  # Don't need embeddings back
                include_metadata=True,  # Need metadata for linking
            )
        except Exception as e:
            logger.error(f"Error querying Pinecone: {e}")  # Use logger.error
            return []

        if not pinecone_results:
            return []

        # Extract IDs, scores, and metadata from Pinecone results
        # PineconeAdapter might have already reranked and truncated to final top_k
        result_ids = [
            res["id"] for res in pinecone_results
        ]  # These are chunk IDs or plain doc IDs
        scores = {res["id"]: res["score"] for res in pinecone_results}
        pinecone_metadatas = {
            res["id"]: res.get("metadata", {}) for res in pinecone_results
        }

        # --- Fetch corresponding data from MongoDB ---
        # We need:
        # 1. Chunk documents (using result_ids where is_chunk is True)
        # 2. Parent documents (using parent_document_id from chunk metadata)
        # 3. Plain documents (using result_ids where is_chunk is False)
        chunk_ids_to_fetch = set()
        parent_ids_to_fetch = set()
        plain_doc_ids_to_fetch = set()

        for res_id in result_ids:
            meta = pinecone_metadatas.get(res_id, {})
            if meta.get("is_chunk"):
                chunk_ids_to_fetch.add(res_id)
                parent_id = meta.get("parent_document_id")
                if parent_id:
                    parent_ids_to_fetch.add(parent_id)
            else:
                plain_doc_ids_to_fetch.add(res_id)

        # Fetch all required docs from Mongo in potentially fewer queries
        mongo_docs_map = {}
        ids_to_fetch_mongo = list(
            chunk_ids_to_fetch | parent_ids_to_fetch | plain_doc_ids_to_fetch
        )

        if ids_to_fetch_mongo:
            try:
                mongo_docs = self.mongo.find(
                    self.collection, {"document_id": {"$in": ids_to_fetch_mongo}}
                )
                mongo_docs_map = {doc["document_id"]: doc for doc in mongo_docs}
            except Exception as e:
                logger.error(
                    f"Error fetching documents from MongoDB: {e}"
                )  # Use logger.error
                # Proceed with potentially missing Mongo data

        # --- Combine Results ---
        results = []
        for res_id in result_ids:  # Iterate in Pinecone's score order
            pinecone_meta = pinecone_metadatas.get(res_id, {})
            is_chunk = pinecone_meta.get("is_chunk", False)
            parent_doc_id = pinecone_meta.get("parent_document_id")

            result = {
                "document_id": res_id,  # This is the chunk_id if is_chunk, else the doc_id
                "score": scores.get(res_id, 0.0),
                "is_chunk": is_chunk,
                "parent_document_id": parent_doc_id,  # Null if not a chunk
            }

            mongo_doc = mongo_docs_map.get(
                res_id
            )  # Get the specific chunk or plain doc

            # --- FIX: Skip result if corresponding Mongo doc not found ---
            if not mongo_doc:
                logger.warning(
                    f"Document/chunk {res_id} found in Pinecone but not in MongoDB. Skipping."
                )
                continue
            # --- End FIX ---

            if include_content:
                content = None
                # Priority 1: Reranking field in Pinecone metadata (holds chunk text if reranking)
                # Note: This might be redundant if we fetch from Mongo anyway, but keep for flexibility
                if (
                    self.pinecone.use_reranking
                    and self.pinecone.rerank_text_field in pinecone_meta
                ):
                    content = pinecone_meta[self.pinecone.rerank_text_field]
                # Priority 2: Get content from the fetched Mongo doc (chunk or plain doc)
                elif mongo_doc:
                    content = mongo_doc.get("content")
                result["content"] = content or ""

            if include_metadata:
                combined_meta = {}
                # If it's a chunk, fetch the parent document's metadata
                if is_chunk and parent_doc_id:
                    parent_mongo_doc = mongo_docs_map.get(parent_doc_id)
                    if parent_mongo_doc:
                        # Extract metadata from parent, excluding fields specific to parent/content
                        combined_meta = {
                            k: v
                            for k, v in parent_mongo_doc.items()
                            if k
                            not in [
                                "_id",
                                "document_id",
                                "content",
                                "pdf_data",  # pdf_data removed anyway
                                "is_chunk",
                                "parent_document_id",
                                "created_at",
                                "updated_at",
                                "chunk_index",
                            ]
                        }
                # If it's a plain doc, fetch its own metadata
                elif not is_chunk and mongo_doc:
                    combined_meta = {
                        k: v
                        for k, v in mongo_doc.items()
                        if k
                        not in [
                            "_id",
                            "document_id",
                            "content",
                            "pdf_data",
                            "is_chunk",
                            "parent_document_id",
                            "created_at",
                            "updated_at",
                            "chunk_index",
                        ]
                    }

                # Add/overwrite with chunk-specific info from Pinecone meta (like chunk_index)
                # or specific metadata stored directly on the plain doc in Pinecone
                combined_meta.update(
                    {
                        k: v
                        for k, v in pinecone_meta.items()
                        # Avoid redundancy with already included fields or internal fields
                        if k
                        not in [
                            "document_id",
                            "parent_document_id",
                            "is_chunk",
                            self.pinecone.rerank_text_field,
                        ]
                    }
                )
                result["metadata"] = combined_meta

            results.append(result)

        return results

    async def delete_document(
        self, document_id: str, namespace: Optional[str] = None
    ) -> bool:  # pragma: no cover
        """
        Delete a parent document (plain text or PDF) and all its associated chunks
        from both MongoDB and Pinecone. Cannot delete a chunk directly.

        Args:
            document_id: ID of the parent document to delete.
            namespace: Optional Pinecone namespace.

        Returns:
            True if deletion was successful in both stores (if applicable), False otherwise.
        """
        logger.info(
            f"Attempting to delete document and associated data for ID: {document_id}"
        )
        mongo_delete_error = False
        pinecone_delete_error = False
        document_found = False  # Track if the initial ID exists

        # --- 0. Check if the target ID is a chunk ---
        try:
            target_doc = self.mongo.find_one(
                self.collection,
                {"document_id": document_id},
            )
            if target_doc and target_doc.get("is_chunk"):
                logger.warning(
                    f"Cannot delete chunk {document_id} directly. Delete the parent document."
                )
                return False  # Prevent deleting chunks directly
            if target_doc:
                document_found = True
        except Exception as e:  # pragma: no cover
            logger.error(
                f"Error checking document type for {document_id} in MongoDB: {e}"
            )  # pragma: no cover
            return False  # pragma: no cover # Fail if we can't even check the type

        if not document_found:
            logger.warning(f"Document {document_id} not found for deletion.")
            # Even if not found, attempt cleanup in Pinecone just in case of inconsistency
            # but the overall result should be False as the primary doc wasn't there.
            pass  # Continue to attempt Pinecone cleanup, but final result will be False

        # --- 1. Find all associated document IDs in MongoDB ---
        mongo_ids_to_delete = set()
        pinecone_ids_to_delete = set()
        try:
            # Find parent doc and all chunk docs linked to it
            # Use the ID confirmed not to be a chunk
            docs_to_delete_mongo = list(
                self.mongo.find(
                    self.collection,
                    {
                        "$or": [
                            {"document_id": document_id},
                            {"parent_document_id": document_id},
                        ]
                    },
                )
            )
            if docs_to_delete_mongo:
                document_found = True  # Confirm something was found related to the ID
                for doc in docs_to_delete_mongo:
                    mongo_ids_to_delete.add(doc["document_id"])
                    pinecone_ids_to_delete.add(doc["document_id"])
            elif document_found:  # Parent existed but no chunks found (plain text doc)
                mongo_ids_to_delete.add(document_id)
                pinecone_ids_to_delete.add(document_id)
            # If !document_found initially, sets remain empty unless fallback below happens

        except Exception as e:
            logger.warning(
                f"Error finding associated documents in MongoDB for deletion ({document_id}): {e}. Attempting Pinecone/Mongo deletion with main ID only."
            )
            # Fallback: try deleting the main ID from Pinecone/Mongo
            if document_found:  # Only add if we confirmed the initial doc existed
                pinecone_ids_to_delete.add(document_id)
                mongo_ids_to_delete.add(document_id)

        # Convert sets to lists for deletion methods
        pinecone_ids_list = list(pinecone_ids_to_delete)
        mongo_ids_list = list(mongo_ids_to_delete)

        # If no IDs were found at all, and the initial doc wasn't found, return False
        if not document_found and not mongo_ids_list and not pinecone_ids_list:
            logger.info(f"No trace of document {document_id} found to delete.")
            return False

        # --- 2. Delete from Pinecone ---
        if pinecone_ids_list:
            try:
                await self.pinecone.delete(ids=pinecone_ids_list, namespace=namespace)
                logger.info(
                    f"Attempted deletion of {len(pinecone_ids_list)} vectors from Pinecone for {document_id}."
                )
            except Exception as e:
                logger.error(
                    f"Error deleting vectors from Pinecone for {document_id} (IDs: {pinecone_ids_list}): {e}"
                )
                pinecone_delete_error = True  # Track error

        # --- 3. Delete from MongoDB ---
        mongo_deleted_count = 0
        if mongo_ids_list:
            try:
                delete_result = self.mongo.delete_many(
                    self.collection, {"document_id": {"$in": mongo_ids_list}}
                )
                mongo_deleted_count = delete_result.deleted_count
                if mongo_deleted_count > 0:
                    logger.info(
                        f"Deleted {mongo_deleted_count} documents from MongoDB for {document_id}."
                    )
                # else: # No need to log if count is 0, covered by initial find log
                #     logger.info(f"No documents found to delete in MongoDB for {document_id} with IDs: {mongo_ids_list}")

            except Exception as e:
                logger.error(
                    f"Error deleting documents from MongoDB for {document_id} (IDs: {mongo_ids_list}): {e}"
                )
                mongo_delete_error = True  # Track error

        # Return True only if the document was initially found and no errors occurred during deletion attempts
        # If the document wasn't found initially, return False even if cleanup attempts were made.
        return document_found and not mongo_delete_error and not pinecone_delete_error
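
For orientation, the sketch below shows how the new KnowledgeBaseService added in this release might be wired up. It is a minimal, hypothetical example rather than code from the package: the PineconeAdapter and MongoDBAdapter constructor arguments, API keys, and file path are placeholders, while the KnowledgeBaseService constructor and the add_document, add_pdf_document, query, and delete_document calls follow the signatures visible in the diff above.

# Hypothetical usage sketch -- not part of the packaged code. Adapter
# constructor arguments are elided because their signatures live in the
# adapter modules, not in this file.
import asyncio

from solana_agent.adapters.mongodb_adapter import MongoDBAdapter
from solana_agent.adapters.pinecone_adapter import PineconeAdapter
from solana_agent.services.knowledge_base import KnowledgeBaseService


async def main() -> None:
    mongo = MongoDBAdapter(...)      # placeholder: see mongodb_adapter.py for real arguments
    pinecone = PineconeAdapter(...)  # placeholder: see pinecone_adapter.py for real arguments

    kb = KnowledgeBaseService(
        pinecone_adapter=pinecone,
        mongodb_adapter=mongo,
        openai_api_key="sk-...",  # placeholder key
        openai_model_name="text-embedding-3-large",
    )

    # Plain text: stored in MongoDB, embedded once, vector upserted to Pinecone.
    doc_id = await kb.add_document(
        text="Solana Agent supports knowledge-base retrieval.",
        metadata={"source": "docs", "tags": ["overview"]},
    )

    # PDF: text extracted with pypdf, semantically chunked, chunks stored in
    # MongoDB and chunk vectors upserted to Pinecone in batches.
    pdf_id = await kb.add_pdf_document(
        pdf_data="whitepaper.pdf",  # placeholder path; bytes are also accepted
        metadata={"source": "whitepaper", "tags": ["pdf"]},
    )

    # Query: embeds the query text, searches Pinecone, then hydrates content
    # and metadata for each hit from MongoDB.
    results = await kb.query("How does retrieval work?", top_k=3)
    for r in results:
        print(r["document_id"], r["score"], r.get("content", "")[:80])

    # Deleting a parent document removes it and all of its chunks from both stores.
    await kb.delete_document(pdf_id)
    await kb.delete_document(doc_id)


if __name__ == "__main__":
    asyncio.run(main())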