mdb-engine 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdb_engine/README.md +144 -0
- mdb_engine/__init__.py +37 -0
- mdb_engine/auth/README.md +631 -0
- mdb_engine/auth/__init__.py +128 -0
- mdb_engine/auth/casbin_factory.py +199 -0
- mdb_engine/auth/casbin_models.py +46 -0
- mdb_engine/auth/config_defaults.py +71 -0
- mdb_engine/auth/config_helpers.py +213 -0
- mdb_engine/auth/cookie_utils.py +158 -0
- mdb_engine/auth/decorators.py +350 -0
- mdb_engine/auth/dependencies.py +747 -0
- mdb_engine/auth/helpers.py +64 -0
- mdb_engine/auth/integration.py +578 -0
- mdb_engine/auth/jwt.py +225 -0
- mdb_engine/auth/middleware.py +241 -0
- mdb_engine/auth/oso_factory.py +323 -0
- mdb_engine/auth/provider.py +570 -0
- mdb_engine/auth/restrictions.py +271 -0
- mdb_engine/auth/session_manager.py +477 -0
- mdb_engine/auth/token_lifecycle.py +213 -0
- mdb_engine/auth/token_store.py +289 -0
- mdb_engine/auth/users.py +1516 -0
- mdb_engine/auth/utils.py +614 -0
- mdb_engine/cli/__init__.py +13 -0
- mdb_engine/cli/commands/__init__.py +7 -0
- mdb_engine/cli/commands/generate.py +105 -0
- mdb_engine/cli/commands/migrate.py +83 -0
- mdb_engine/cli/commands/show.py +70 -0
- mdb_engine/cli/commands/validate.py +63 -0
- mdb_engine/cli/main.py +41 -0
- mdb_engine/cli/utils.py +92 -0
- mdb_engine/config.py +217 -0
- mdb_engine/constants.py +160 -0
- mdb_engine/core/README.md +542 -0
- mdb_engine/core/__init__.py +42 -0
- mdb_engine/core/app_registration.py +392 -0
- mdb_engine/core/connection.py +243 -0
- mdb_engine/core/engine.py +749 -0
- mdb_engine/core/index_management.py +162 -0
- mdb_engine/core/manifest.py +2793 -0
- mdb_engine/core/seeding.py +179 -0
- mdb_engine/core/service_initialization.py +355 -0
- mdb_engine/core/types.py +413 -0
- mdb_engine/database/README.md +522 -0
- mdb_engine/database/__init__.py +31 -0
- mdb_engine/database/abstraction.py +635 -0
- mdb_engine/database/connection.py +387 -0
- mdb_engine/database/scoped_wrapper.py +1721 -0
- mdb_engine/embeddings/README.md +184 -0
- mdb_engine/embeddings/__init__.py +62 -0
- mdb_engine/embeddings/dependencies.py +193 -0
- mdb_engine/embeddings/service.py +759 -0
- mdb_engine/exceptions.py +167 -0
- mdb_engine/indexes/README.md +651 -0
- mdb_engine/indexes/__init__.py +21 -0
- mdb_engine/indexes/helpers.py +145 -0
- mdb_engine/indexes/manager.py +895 -0
- mdb_engine/memory/README.md +451 -0
- mdb_engine/memory/__init__.py +30 -0
- mdb_engine/memory/service.py +1285 -0
- mdb_engine/observability/README.md +515 -0
- mdb_engine/observability/__init__.py +42 -0
- mdb_engine/observability/health.py +296 -0
- mdb_engine/observability/logging.py +161 -0
- mdb_engine/observability/metrics.py +297 -0
- mdb_engine/routing/README.md +462 -0
- mdb_engine/routing/__init__.py +73 -0
- mdb_engine/routing/websockets.py +813 -0
- mdb_engine/utils/__init__.py +7 -0
- mdb_engine-0.1.6.dist-info/METADATA +213 -0
- mdb_engine-0.1.6.dist-info/RECORD +75 -0
- mdb_engine-0.1.6.dist-info/WHEEL +5 -0
- mdb_engine-0.1.6.dist-info/entry_points.txt +2 -0
- mdb_engine-0.1.6.dist-info/licenses/LICENSE +661 -0
- mdb_engine-0.1.6.dist-info/top_level.txt +1 -0
mdb_engine/embeddings/service.py

@@ -0,0 +1,759 @@

````python
"""
Semantic Text Splitting and Embedding Service

This module provides intelligent text chunking and embedding capabilities:
1. Semantic text splitting using Rust-based semantic-text-splitter
2. Embedding generation via custom embed functions (users provide their own)
3. MongoDB storage with proper document structure

Key Features:
- Token-aware chunking (never exceeds model limits)
- Semantic boundary preservation (splits on sentences/paragraphs)
- Custom embed functions (users implement their own embedding logic)
- Batch processing for efficiency
- Automatic metadata tracking
- Platform-level defaults (users don't need to configure tokenizer - defaults to "gpt-3.5-turbo")

Dependencies:
    pip install semantic-text-splitter
"""

import logging
import os
import time
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

# Optional OpenAI SDK import
try:
    from openai import AsyncAzureOpenAI, AsyncOpenAI

    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    AsyncOpenAI = None
    AsyncAzureOpenAI = None

# Optional dependencies
try:
    from semantic_text_splitter import TextSplitter

    SEMANTIC_SPLITTER_AVAILABLE = True
except ImportError:
    SEMANTIC_SPLITTER_AVAILABLE = False
    TextSplitter = None

logger = logging.getLogger(__name__)


class EmbeddingServiceError(Exception):
    """Base exception for embedding service failures."""

    pass


class BaseEmbeddingProvider(ABC):
    """
    Abstract base class for embedding providers.
    """

    @abstractmethod
    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generate embeddings for text.

        Args:
            text: A single string or list of strings to embed
            model: Optional model identifier

        Returns:
            List[List[float]]: List of embedding vectors
        """
        pass
````
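`BaseEmbeddingProvider` is the extension point for bringing your own embedding backend. A minimal sketch of a custom provider, assuming a locally installed sentence-transformers model (the `sentence_transformers` dependency, the model name, and the `LocalEmbeddingProvider` class are illustrative assumptions, not part of this package):

```python
from typing import List, Optional, Union

from mdb_engine.embeddings.service import BaseEmbeddingProvider


class LocalEmbeddingProvider(BaseEmbeddingProvider):
    """Hypothetical provider backed by a local sentence-transformers model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # Assumes `pip install sentence-transformers`; not an mdb-engine dependency.
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer(model_name)

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        # Normalize to a list, mirroring the built-in providers below.
        if isinstance(text, str):
            text = [text]
        # encode() is synchronous; acceptable for a sketch, offload in production.
        return self.model.encode(text).tolist()
```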
The two built-in providers and the environment-detection helper:

````python
class OpenAIEmbeddingProvider(BaseEmbeddingProvider):
    """
    OpenAI embedding provider.

    Uses OpenAI's embedding API. Requires OPENAI_API_KEY environment variable.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        default_model: str = "text-embedding-3-small",
    ):
        """
        Initialize OpenAI embedding provider.

        Args:
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
            default_model: Default embedding model (default: "text-embedding-3-small")
        """
        if not OPENAI_AVAILABLE:
            raise EmbeddingServiceError(
                "OpenAI SDK not available. Install with: pip install openai"
            )

        api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise EmbeddingServiceError(
                "OpenAI API key not found. Set OPENAI_API_KEY environment variable."
            )

        self.client = AsyncOpenAI(api_key=api_key)
        self.default_model = default_model

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """Generate embeddings using OpenAI."""
        model = model or self.default_model

        # Normalize to list
        if isinstance(text, str):
            text = [text]

        try:
            response = await self.client.embeddings.create(model=model, input=text)

            # Extract embeddings
            vectors = [item.embedding for item in response.data]
            return vectors

        except (
            ImportError,
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"OpenAI embedding failed: {e}")
            raise EmbeddingServiceError(f"OpenAI embedding failed: {str(e)}") from e


class AzureOpenAIEmbeddingProvider(BaseEmbeddingProvider):
    """
    Azure OpenAI embedding provider.

    Uses Azure OpenAI's embedding API. Requires:
    - AZURE_OPENAI_API_KEY environment variable
    - AZURE_OPENAI_ENDPOINT environment variable
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        api_version: Optional[str] = None,
        default_model: str = "text-embedding-3-small",
    ):
        """
        Initialize Azure OpenAI embedding provider.

        Args:
            api_key: Azure OpenAI API key (defaults to AZURE_OPENAI_API_KEY env var)
            endpoint: Azure OpenAI endpoint (defaults to AZURE_OPENAI_ENDPOINT env var)
            api_version: API version (defaults to AZURE_OPENAI_API_VERSION or
                OPENAI_API_VERSION env var)
            default_model: Default embedding model/deployment name
                (default: "text-embedding-3-small")
        """
        if not OPENAI_AVAILABLE:
            raise EmbeddingServiceError(
                "OpenAI SDK not available. Install with: pip install openai"
            )

        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
        endpoint = endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
        api_version = (
            api_version
            or os.getenv("AZURE_OPENAI_API_VERSION")
            or os.getenv("OPENAI_API_VERSION", "2024-02-15-preview")
        )

        if not api_key or not endpoint:
            raise EmbeddingServiceError(
                "Azure OpenAI credentials not found. Set "
                "AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT environment "
                "variables."
            )

        # Use AsyncAzureOpenAI for Azure (not AsyncOpenAI with Azure params)
        self.client = AsyncAzureOpenAI(
            api_key=api_key, api_version=api_version, azure_endpoint=endpoint
        )
        self.default_model = default_model

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """Generate embeddings using Azure OpenAI."""
        model = model or self.default_model

        # Normalize to list
        if isinstance(text, str):
            text = [text]

        try:
            response = await self.client.embeddings.create(model=model, input=text)

            # Extract embeddings
            vectors = [item.embedding for item in response.data]
            return vectors

        except (
            ImportError,
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"Azure OpenAI embedding failed: {e}")
            raise EmbeddingServiceError(
                f"Azure OpenAI embedding failed: {str(e)}"
            ) from e


def _detect_provider_from_env() -> str:
    """
    Detect provider from environment variables (same logic as mem0).

    Returns:
        "azure" if Azure OpenAI credentials are present, otherwise "openai"
    """
    if os.getenv("AZURE_OPENAI_API_KEY") and os.getenv("AZURE_OPENAI_ENDPOINT"):
        return "azure"
    elif os.getenv("OPENAI_API_KEY"):
        return "openai"
    else:
        # Default to openai if nothing is configured
        return "openai"
````
The standalone wrapper and the main service class (note that `process_and_store` takes `max_tokens`, which the class docstring example reflects):

````python
class EmbeddingProvider:
    """
    Standalone embedding provider wrapper.

    Auto-detects OpenAI or AzureOpenAI from environment variables.
    Supports OpenAI and AzureOpenAI only.

    Example:
        # Auto-detects from environment variables
        provider = EmbeddingProvider()

        # Or explicitly provide a provider
        from mdb_engine.embeddings import OpenAIEmbeddingProvider
        provider = EmbeddingProvider(embedding_provider=OpenAIEmbeddingProvider())
    """

    def __init__(
        self,
        embedding_provider: Optional[BaseEmbeddingProvider] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize Embedding Provider.

        Args:
            embedding_provider: BaseEmbeddingProvider instance (optional, will auto-detect if None)
            config: Optional dict with embedding configuration (from manifest.json embedding_config)
                Supports: default_embedding_model

        Raises:
            EmbeddingServiceError: If provider cannot be auto-detected and none is provided
        """
        if embedding_provider is not None:
            if not isinstance(embedding_provider, BaseEmbeddingProvider):
                raise EmbeddingServiceError(
                    f"embedding_provider must be an instance of BaseEmbeddingProvider, "
                    f"got {type(embedding_provider)}"
                )
            self.embedding_provider = embedding_provider
        else:
            # Auto-detect provider from environment variables
            provider_type = _detect_provider_from_env()
            default_model = (config or {}).get(
                "default_embedding_model", "text-embedding-3-small"
            )

            if provider_type == "azure":
                self.embedding_provider = AzureOpenAIEmbeddingProvider(
                    default_model=default_model
                )
                logger.info(
                    f"Auto-detected Azure OpenAI embedding provider (model: {default_model})"
                )
            else:
                self.embedding_provider = OpenAIEmbeddingProvider(
                    default_model=default_model
                )
                logger.info(
                    f"Auto-detected OpenAI embedding provider (model: {default_model})"
                )

        # Store config for potential future use
        self.config = config or {}

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generates vector embeddings for a string or list of strings.

        Args:
            text: A single string document or a list of documents.
            model: Optional model identifier (overrides default)

        Returns:
            List[List[float]]: A list of vectors.
                If input was a single string, returns a list containing one vector.

        Example:
            ```python
            # Batch embedding (Faster)
            docs = ["Apple", "Banana", "Cherry"]
            vectors = await provider.embed(docs, model="text-embedding-3-small")

            # vectors is [[0.1, ...], [0.2, ...], [0.3, ...]]
            ```
        """
        start_time = time.time()

        try:
            vectors = await self.embedding_provider.embed(text, model)

            duration = time.time() - start_time
            item_count = 1 if isinstance(text, str) else len(text)

            logger.info(
                "EMBED_SUCCESS",
                extra={"count": item_count, "latency_sec": round(duration, 3)},
            )
            return vectors

        except (AttributeError, TypeError, ValueError, RuntimeError, KeyError) as e:
            logger.error(f"EMBED_FAILED: {str(e)}")
            raise EmbeddingServiceError(f"Embedding failed: {str(e)}") from e


class EmbeddingService:
    """
    Service for semantic text splitting and embedding generation.

    This service combines:
    1. Semantic text splitting (Rust-based, fast and accurate)
    2. Embedding generation (via OpenAI or AzureOpenAI, auto-detected from env vars)
    3. MongoDB storage (structured document format)

    Example:
        from mdb_engine.embeddings import EmbeddingService

        # Initialize (auto-detects OpenAI or AzureOpenAI from environment variables)
        embedding_service = EmbeddingService()

        # Process and store
        await embedding_service.process_and_store(
            text_content="Your long document here...",
            source_id="doc_101",
            collection=db.knowledge_base,
            max_tokens=1000
        )
    """

    def __init__(
        self,
        embedding_provider: Optional[EmbeddingProvider] = None,
        default_max_tokens: int = 1000,
        default_tokenizer_model: str = "gpt-3.5-turbo",
        config: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize Embedding Service.

        Args:
            embedding_provider: EmbeddingProvider instance (optional, will create default if None)
            default_max_tokens: Default max tokens per chunk (default: 1000)
            default_tokenizer_model: Tokenizer model name for counting tokens
                (default: "gpt-3.5-turbo").
                This is ONLY for token counting during chunking, NOT for embeddings.
                Must be a valid OpenAI model name (e.g., "gpt-3.5-turbo", "gpt-4").
            config: Optional configuration dict (from manifest.json embedding_config)

        Raises:
            EmbeddingServiceError: If required dependencies are not available
        """
        if not SEMANTIC_SPLITTER_AVAILABLE:
            raise EmbeddingServiceError(
                "semantic-text-splitter not available. Install with: "
                "pip install semantic-text-splitter"
            )

        # Create embedding provider if not provided
        if embedding_provider is None:
            embedding_provider = EmbeddingProvider(config=config)

        self.embedding_provider = embedding_provider
        self.default_max_tokens = default_max_tokens
        self.default_tokenizer_model = default_tokenizer_model

    def _create_splitter(
        self, max_tokens: int, tokenizer_model: Optional[str] = None
    ) -> TextSplitter:
        """
        Create a TextSplitter instance.

        Args:
            max_tokens: Maximum tokens per chunk
            tokenizer_model: Tokenizer encoding for counting
                (default: uses default_tokenizer_model).
                This is ONLY for token counting, NOT for embeddings.

        Returns:
            TextSplitter instance
        """
        # Use provided tokenizer, or fall back to default (gpt-3.5-turbo)
        model = tokenizer_model or self.default_tokenizer_model
        return TextSplitter.from_tiktoken_model(model, max_tokens)

    async def chunk_text(
        self,
        text_content: str,
        max_tokens: Optional[int] = None,
        tokenizer_model: Optional[str] = None,
    ) -> List[str]:
        """
        Split text into semantic chunks.

        Uses Rust-based semantic-text-splitter for fast, accurate chunking
        that respects token limits and semantic boundaries.

        Args:
            text_content: The text to chunk
            max_tokens: Max tokens per chunk (default: uses default_max_tokens)
            tokenizer_model: Tokenizer model name for counting (optional,
                defaults to "gpt-3.5-turbo").
                This is ONLY for token counting, NOT for embeddings.
                Must be a valid OpenAI model name (e.g., "gpt-3.5-turbo", "gpt-4").

        Returns:
            List of text chunks

        Example:
            chunks = await service.chunk_text("Long document...", max_tokens=1000)
            print(f"Generated {len(chunks)} chunks")
        """
        max_tokens = max_tokens or self.default_max_tokens
        splitter = self._create_splitter(max_tokens, tokenizer_model)

        try:
            chunks = splitter.chunks(text_content)
            logger.info(f"Generated {len(chunks)} chunks (max_tokens={max_tokens})")
            return chunks
        except (ImportError, AttributeError, TypeError, ValueError, RuntimeError) as e:
            logger.error(f"Error chunking text: {e}", exc_info=True)
            raise EmbeddingServiceError(f"Chunking failed: {str(e)}") from e

    async def embed_chunks(
        self, chunks: List[str], model: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generate embeddings for text chunks.

        Uses the user-provided embedding provider/function.

        Args:
            chunks: List of text chunks to embed
            model: Optional model identifier (passed to embedding provider)

        Returns:
            List of embedding vectors (each is a list of floats)

        Example:
            chunks = ["chunk 1", "chunk 2"]
            vectors = await service.embed_chunks(chunks, model="text-embedding-3-small")
        """
        if not chunks:
            return []

        try:
            # Use EmbeddingProvider's embed method (handles retries, logging, etc.)
            vectors = await self.embedding_provider.embed(chunks, model=model)
            logger.info(f"Generated {len(vectors)} embeddings")
            return vectors
        except (
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"Error generating embeddings: {e}", exc_info=True)
            raise EmbeddingServiceError(f"Embedding generation failed: {str(e)}") from e

    async def process_and_store(
        self,
        text_content: str,
        source_id: str,
        collection: Any,  # MongoDB collection (AppDB Collection or Motor collection)
        max_tokens: Optional[int] = None,
        tokenizer_model: Optional[str] = None,
        embedding_model: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Process text and store chunks with embeddings in MongoDB.

        This is the main method that:
        1. Chunks the text semantically
        2. Generates embeddings for each chunk
        3. Stores documents in MongoDB with proper structure

        Args:
            text_content: The text to process
            source_id: Unique identifier for the source document
            collection: MongoDB collection (AppDB Collection or Motor collection)
            max_tokens: Max tokens per chunk (default: uses default_max_tokens)
            tokenizer_model: Tokenizer model for counting (default: uses default_tokenizer_model)
            embedding_model: Embedding model (default: uses EmbeddingProvider default)
            metadata: Additional metadata to store with each chunk

        Returns:
            Dict with processing results:
                {
                    "chunks_created": int,
                    "documents_inserted": int,
                    "source_id": str
                }

        Example:
            result = await service.process_and_store(
                text_content="Long document...",
                source_id="doc_101",
                collection=db.knowledge_base,
                max_tokens=1000
            )
            print(f"Created {result['chunks_created']} chunks")
        """
        logger.info(f"Processing source: {source_id}")

        # Step 1: Chunk the text
        chunks = await self.chunk_text(
            text_content, max_tokens=max_tokens, tokenizer_model=tokenizer_model
        )

        if not chunks:
            logger.warning(f"No chunks generated for source: {source_id}")
            return {
                "chunks_created": 0,
                "documents_inserted": 0,
                "source_id": source_id,
            }

        # Step 2: Generate embeddings (batch for efficiency)
        try:
            vectors = await self.embed_chunks(chunks, model=embedding_model)
        except (
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"Failed to generate embeddings for {source_id}: {e}")
            raise EmbeddingServiceError(f"Embedding generation failed: {str(e)}") from e

        if len(vectors) != len(chunks):
            raise EmbeddingServiceError(
                f"Mismatch: {len(chunks)} chunks but {len(vectors)} embeddings"
            )

        # Step 3: Prepare documents for insertion
        documents_to_insert = []
        for i, (chunk_text, vector) in enumerate(zip(chunks, vectors)):
            doc = {
                "source_id": source_id,
                "chunk_index": i,
                "text": chunk_text,
                "embedding": vector,
                "metadata": {
                    "model": embedding_model or "custom",
                    "token_count": len(chunk_text),  # Approximation
                    "created_at": datetime.utcnow(),
                },
            }

            # Add custom metadata if provided
            if metadata:
                doc["metadata"].update(metadata)

            documents_to_insert.append(doc)

        # Step 4: Store in MongoDB
        try:
            # Handle both AppDB Collection and Motor collection
            if hasattr(collection, "insert_many"):
                # AppDB Collection wrapper
                result = await collection.insert_many(documents_to_insert)
                inserted_count = len(result.inserted_ids)
            else:
                # Direct Motor collection
                result = await collection.insert_many(documents_to_insert)
                inserted_count = len(result.inserted_ids)

            logger.info(
                f"Successfully inserted {inserted_count} documents for source: {source_id}"
            )

            return {
                "chunks_created": len(chunks),
                "documents_inserted": inserted_count,
                "source_id": source_id,
            }

        except (
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            KeyError,
            ConnectionError,
        ) as e:
            logger.error(
                f"Failed to store documents for {source_id}: {e}", exc_info=True
            )
            raise EmbeddingServiceError(f"Storage failed: {str(e)}") from e
````
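An end-to-end sketch of the `process_and_store` path, assuming a MongoDB instance reachable via Motor and an `OPENAI_API_KEY` in the environment (the connection string and database/collection names are placeholders):

```python
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

from mdb_engine.embeddings import EmbeddingService


async def main() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # placeholder URI
    collection = client["demo"]["knowledge_base"]

    service = EmbeddingService()  # provider auto-detected from env vars
    result = await service.process_and_store(
        text_content="Your long document here...",
        source_id="doc_101",
        collection=collection,
        max_tokens=500,
    )
    # e.g. {"chunks_created": 3, "documents_inserted": 3, "source_id": "doc_101"}
    print(result)


asyncio.run(main())
```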
The remaining EmbeddingService method and the module-level factory helper:

````python
    async def process_text(
        self,
        text_content: str,
        max_tokens: Optional[int] = None,
        tokenizer_model: Optional[str] = None,
        embedding_model: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Process text and return chunks with embeddings (without storing).

        Useful when you want to process text but handle storage yourself.

        Args:
            text_content: The text to process
            max_tokens: Max tokens per chunk (default: uses default_max_tokens)
            tokenizer_model: Tokenizer model for counting (default: uses default_tokenizer_model)
            embedding_model: Embedding model (default: uses EmbeddingProvider default)

        Returns:
            List of dicts, each containing:
                {
                    "chunk_index": int,
                    "text": str,
                    "embedding": List[float],
                    "metadata": Dict[str, Any]
                }

        Example:
            results = await service.process_text("Long document...")
            for result in results:
                print(f"Chunk {result['chunk_index']}: {result['text'][:50]}...")
        """
        # Chunk the text
        chunks = await self.chunk_text(
            text_content, max_tokens=max_tokens, tokenizer_model=tokenizer_model
        )

        if not chunks:
            return []

        # Generate embeddings
        vectors = await self.embed_chunks(chunks, model=embedding_model)

        if len(vectors) != len(chunks):
            raise EmbeddingServiceError(
                f"Mismatch: {len(chunks)} chunks but {len(vectors)} embeddings"
            )

        # Prepare results
        results = []
        for i, (chunk_text, vector) in enumerate(zip(chunks, vectors)):
            results.append(
                {
                    "chunk_index": i,
                    "text": chunk_text,
                    "embedding": vector,
                    "metadata": {
                        "model": embedding_model or "custom",
                        "token_count": len(chunk_text),
                        "created_at": datetime.utcnow(),
                    },
                }
            )

        return results


# Dependency injection helper
def get_embedding_service(
    embedding_provider: Optional[BaseEmbeddingProvider] = None,
    config: Optional[Dict[str, Any]] = None,
) -> EmbeddingService:
    """
    Create EmbeddingService instance with auto-detected or provided embedding provider.

    Auto-detects OpenAI or AzureOpenAI from environment variables (same logic as mem0).
    Requires either OPENAI_API_KEY or AZURE_OPENAI_API_KEY + AZURE_OPENAI_ENDPOINT.

    Args:
        embedding_provider: Optional BaseEmbeddingProvider instance (will auto-detect if None)
        config: Optional configuration dict (from manifest.json embedding_config)
            Supports: max_tokens_per_chunk, tokenizer_model (optional,
            defaults to "gpt-3.5-turbo"), default_embedding_model

    Returns:
        EmbeddingService instance

    Example:
        from mdb_engine.embeddings import get_embedding_service

        # Auto-detects from environment variables
        embedding_service = get_embedding_service(
            config={
                "max_tokens_per_chunk": 1000,
                "default_embedding_model": "text-embedding-3-small"
            }
        )
    """
    # Platform-level defaults (users don't need to think about these)
    default_max_tokens = 1000
    # Model name for tiktoken (uses cl100k_base encoding internally)
    default_tokenizer_model = "gpt-3.5-turbo"

    # Override from config if provided (but not required)
    if config:
        default_max_tokens = config.get("max_tokens_per_chunk", default_max_tokens)
        # tokenizer_model is optional - only override if explicitly provided
        if "tokenizer_model" in config:
            default_tokenizer_model = config["tokenizer_model"]

    # Create embedding provider (auto-detects if embedding_provider is None)
    provider = EmbeddingProvider(embedding_provider=embedding_provider, config=config)

    return EmbeddingService(
        embedding_provider=provider,
        default_max_tokens=default_max_tokens,
        default_tokenizer_model=default_tokenizer_model,
        config=config,
    )
````
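Retrieval over the stored chunks is left to the caller. A hedged sketch using MongoDB Atlas Vector Search, assuming an Atlas cluster with a vector index named `embedding_index` over the `embedding` field (neither the index nor its name is created by this module, and `search` is an illustrative helper):

```python
from mdb_engine.embeddings import get_embedding_service

service = get_embedding_service()


async def search(collection, query: str, k: int = 5):
    # Embed the query with the same provider that embedded the stored chunks.
    [query_vector] = await service.embedding_provider.embed(query)
    pipeline = [
        {
            "$vectorSearch": {
                "index": "embedding_index",  # assumed index name
                "path": "embedding",
                "queryVector": query_vector,
                "numCandidates": 20 * k,
                "limit": k,
            }
        },
        {
            "$project": {
                "text": 1,
                "source_id": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]
    return await collection.aggregate(pipeline).to_list(length=k)
```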