mdb-engine 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. mdb_engine/README.md +144 -0
  2. mdb_engine/__init__.py +37 -0
  3. mdb_engine/auth/README.md +631 -0
  4. mdb_engine/auth/__init__.py +128 -0
  5. mdb_engine/auth/casbin_factory.py +199 -0
  6. mdb_engine/auth/casbin_models.py +46 -0
  7. mdb_engine/auth/config_defaults.py +71 -0
  8. mdb_engine/auth/config_helpers.py +213 -0
  9. mdb_engine/auth/cookie_utils.py +158 -0
  10. mdb_engine/auth/decorators.py +350 -0
  11. mdb_engine/auth/dependencies.py +747 -0
  12. mdb_engine/auth/helpers.py +64 -0
  13. mdb_engine/auth/integration.py +578 -0
  14. mdb_engine/auth/jwt.py +225 -0
  15. mdb_engine/auth/middleware.py +241 -0
  16. mdb_engine/auth/oso_factory.py +323 -0
  17. mdb_engine/auth/provider.py +570 -0
  18. mdb_engine/auth/restrictions.py +271 -0
  19. mdb_engine/auth/session_manager.py +477 -0
  20. mdb_engine/auth/token_lifecycle.py +213 -0
  21. mdb_engine/auth/token_store.py +289 -0
  22. mdb_engine/auth/users.py +1516 -0
  23. mdb_engine/auth/utils.py +614 -0
  24. mdb_engine/cli/__init__.py +13 -0
  25. mdb_engine/cli/commands/__init__.py +7 -0
  26. mdb_engine/cli/commands/generate.py +105 -0
  27. mdb_engine/cli/commands/migrate.py +83 -0
  28. mdb_engine/cli/commands/show.py +70 -0
  29. mdb_engine/cli/commands/validate.py +63 -0
  30. mdb_engine/cli/main.py +41 -0
  31. mdb_engine/cli/utils.py +92 -0
  32. mdb_engine/config.py +217 -0
  33. mdb_engine/constants.py +160 -0
  34. mdb_engine/core/README.md +542 -0
  35. mdb_engine/core/__init__.py +42 -0
  36. mdb_engine/core/app_registration.py +392 -0
  37. mdb_engine/core/connection.py +243 -0
  38. mdb_engine/core/engine.py +749 -0
  39. mdb_engine/core/index_management.py +162 -0
  40. mdb_engine/core/manifest.py +2793 -0
  41. mdb_engine/core/seeding.py +179 -0
  42. mdb_engine/core/service_initialization.py +355 -0
  43. mdb_engine/core/types.py +413 -0
  44. mdb_engine/database/README.md +522 -0
  45. mdb_engine/database/__init__.py +31 -0
  46. mdb_engine/database/abstraction.py +635 -0
  47. mdb_engine/database/connection.py +387 -0
  48. mdb_engine/database/scoped_wrapper.py +1721 -0
  49. mdb_engine/embeddings/README.md +184 -0
  50. mdb_engine/embeddings/__init__.py +62 -0
  51. mdb_engine/embeddings/dependencies.py +193 -0
  52. mdb_engine/embeddings/service.py +759 -0
  53. mdb_engine/exceptions.py +167 -0
  54. mdb_engine/indexes/README.md +651 -0
  55. mdb_engine/indexes/__init__.py +21 -0
  56. mdb_engine/indexes/helpers.py +145 -0
  57. mdb_engine/indexes/manager.py +895 -0
  58. mdb_engine/memory/README.md +451 -0
  59. mdb_engine/memory/__init__.py +30 -0
  60. mdb_engine/memory/service.py +1285 -0
  61. mdb_engine/observability/README.md +515 -0
  62. mdb_engine/observability/__init__.py +42 -0
  63. mdb_engine/observability/health.py +296 -0
  64. mdb_engine/observability/logging.py +161 -0
  65. mdb_engine/observability/metrics.py +297 -0
  66. mdb_engine/routing/README.md +462 -0
  67. mdb_engine/routing/__init__.py +73 -0
  68. mdb_engine/routing/websockets.py +813 -0
  69. mdb_engine/utils/__init__.py +7 -0
  70. mdb_engine-0.1.6.dist-info/METADATA +213 -0
  71. mdb_engine-0.1.6.dist-info/RECORD +75 -0
  72. mdb_engine-0.1.6.dist-info/WHEEL +5 -0
  73. mdb_engine-0.1.6.dist-info/entry_points.txt +2 -0
  74. mdb_engine-0.1.6.dist-info/licenses/LICENSE +661 -0
  75. mdb_engine-0.1.6.dist-info/top_level.txt +1 -0
@@ -0,0 +1,759 @@ mdb_engine/embeddings/service.py
"""
Semantic Text Splitting and Embedding Service

This module provides intelligent text chunking and embedding capabilities:
1. Semantic text splitting using the Rust-based semantic-text-splitter
2. Embedding generation via the built-in OpenAI/Azure OpenAI providers or a
   custom embed function supplied by the caller
3. MongoDB storage with a consistent document structure

Key Features:
- Token-aware chunking (never exceeds model limits)
- Semantic boundary preservation (splits on sentences/paragraphs)
- Custom embed functions (users can implement their own embedding logic)
- Batch processing for efficiency
- Automatic metadata tracking
- Platform-level defaults (users don't need to configure the tokenizer; it
  defaults to "gpt-3.5-turbo")

Dependencies:
    pip install semantic-text-splitter
"""

import logging
import os
import time
from abc import ABC, abstractmethod
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Union

# Optional OpenAI SDK import
try:
    from openai import AsyncAzureOpenAI, AsyncOpenAI

    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    AsyncOpenAI = None
    AsyncAzureOpenAI = None

# Optional dependencies
try:
    from semantic_text_splitter import TextSplitter

    SEMANTIC_SPLITTER_AVAILABLE = True
except ImportError:
    SEMANTIC_SPLITTER_AVAILABLE = False
    TextSplitter = None

logger = logging.getLogger(__name__)


class EmbeddingServiceError(Exception):
    """Base exception for embedding service failures."""


class BaseEmbeddingProvider(ABC):
    """Abstract base class for embedding providers."""

    @abstractmethod
    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generate embeddings for text.

        Args:
            text: A single string or list of strings to embed
            model: Optional model identifier

        Returns:
            List[List[float]]: List of embedding vectors
        """
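

# Illustrative sketch (not part of the original module): implementing a custom
# provider. BaseEmbeddingProvider is the extension point for user-supplied
# embedding logic; this hypothetical, deterministic hash-based stub is only
# useful for offline tests -- its vectors carry no semantic meaning.
class _HashStubEmbeddingProvider(BaseEmbeddingProvider):
    """Deterministic stand-in provider, e.g. for unit tests."""

    def __init__(self, dimensions: int = 8):
        self.dimensions = dimensions

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        import hashlib

        texts = [text] if isinstance(text, str) else text
        # Map the first `dimensions` digest bytes of each input onto [0, 1)
        return [
            [b / 256 for b in hashlib.sha256(t.encode("utf-8")).digest()[: self.dimensions]]
            for t in texts
        ]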


class OpenAIEmbeddingProvider(BaseEmbeddingProvider):
    """
    OpenAI embedding provider.

    Uses OpenAI's embedding API. Requires OPENAI_API_KEY environment variable.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        default_model: str = "text-embedding-3-small",
    ):
        """
        Initialize OpenAI embedding provider.

        Args:
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
            default_model: Default embedding model (default: "text-embedding-3-small")
        """
        if not OPENAI_AVAILABLE:
            raise EmbeddingServiceError(
                "OpenAI SDK not available. Install with: pip install openai"
            )

        api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise EmbeddingServiceError(
                "OpenAI API key not found. Set OPENAI_API_KEY environment variable."
            )

        self.client = AsyncOpenAI(api_key=api_key)
        self.default_model = default_model

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """Generate embeddings using OpenAI."""
        model = model or self.default_model

        # Normalize to list
        if isinstance(text, str):
            text = [text]

        try:
            response = await self.client.embeddings.create(model=model, input=text)

            # Extract embeddings
            vectors = [item.embedding for item in response.data]
            return vectors

        except (
            ImportError,
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"OpenAI embedding failed: {e}")
            raise EmbeddingServiceError(f"OpenAI embedding failed: {str(e)}") from e


class AzureOpenAIEmbeddingProvider(BaseEmbeddingProvider):
    """
    Azure OpenAI embedding provider.

    Uses Azure OpenAI's embedding API. Requires:
    - AZURE_OPENAI_API_KEY environment variable
    - AZURE_OPENAI_ENDPOINT environment variable
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        api_version: Optional[str] = None,
        default_model: str = "text-embedding-3-small",
    ):
        """
        Initialize Azure OpenAI embedding provider.

        Args:
            api_key: Azure OpenAI API key (defaults to AZURE_OPENAI_API_KEY env var)
            endpoint: Azure OpenAI endpoint (defaults to AZURE_OPENAI_ENDPOINT env var)
            api_version: API version (defaults to AZURE_OPENAI_API_VERSION or
                OPENAI_API_VERSION env var)
            default_model: Default embedding model/deployment name
                (default: "text-embedding-3-small")
        """
        if not OPENAI_AVAILABLE:
            raise EmbeddingServiceError(
                "OpenAI SDK not available. Install with: pip install openai"
            )

        api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
        endpoint = endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
        api_version = (
            api_version
            or os.getenv("AZURE_OPENAI_API_VERSION")
            or os.getenv("OPENAI_API_VERSION", "2024-02-15-preview")
        )

        if not api_key or not endpoint:
            raise EmbeddingServiceError(
                "Azure OpenAI credentials not found. Set "
                "AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT environment "
                "variables."
            )

        # Use AsyncAzureOpenAI for Azure (not AsyncOpenAI with Azure params)
        self.client = AsyncAzureOpenAI(
            api_key=api_key, api_version=api_version, azure_endpoint=endpoint
        )
        self.default_model = default_model

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """Generate embeddings using Azure OpenAI."""
        model = model or self.default_model

        # Normalize to list
        if isinstance(text, str):
            text = [text]

        try:
            response = await self.client.embeddings.create(model=model, input=text)

            # Extract embeddings
            vectors = [item.embedding for item in response.data]
            return vectors

        except (
            ImportError,
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"Azure OpenAI embedding failed: {e}")
            raise EmbeddingServiceError(
                f"Azure OpenAI embedding failed: {str(e)}"
            ) from e
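

# Usage sketch (an assumption, not from the original source): both providers
# read their credentials from environment variables and expose the same async
# embed() call, so they are interchangeable behind BaseEmbeddingProvider.
# Expected environment:
#   OPENAI_API_KEY                                 (OpenAIEmbeddingProvider)
#   AZURE_OPENAI_API_KEY + AZURE_OPENAI_ENDPOINT   (AzureOpenAIEmbeddingProvider)
async def _example_provider_usage() -> None:
    provider = OpenAIEmbeddingProvider()  # or AzureOpenAIEmbeddingProvider()
    vectors = await provider.embed(["hello", "world"])
    assert len(vectors) == 2  # one vector per input string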


def _detect_provider_from_env() -> str:
    """
    Detect provider from environment variables (same logic as mem0).

    Returns:
        "azure" if Azure OpenAI credentials are present, otherwise "openai"
    """
    if os.getenv("AZURE_OPENAI_API_KEY") and os.getenv("AZURE_OPENAI_ENDPOINT"):
        return "azure"
    elif os.getenv("OPENAI_API_KEY"):
        return "openai"
    else:
        # Default to openai if nothing is configured
        return "openai"


class EmbeddingProvider:
    """
    Standalone embedding provider wrapper.

    Auto-detects OpenAI or AzureOpenAI from environment variables.
    Supports OpenAI and AzureOpenAI only.

    Example:
        # Auto-detects from environment variables
        provider = EmbeddingProvider()

        # Or explicitly provide a provider
        from mdb_engine.embeddings import OpenAIEmbeddingProvider
        provider = EmbeddingProvider(embedding_provider=OpenAIEmbeddingProvider())
    """

    def __init__(
        self,
        embedding_provider: Optional[BaseEmbeddingProvider] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize Embedding Provider.

        Args:
            embedding_provider: BaseEmbeddingProvider instance (optional, will
                auto-detect if None)
            config: Optional dict with embedding configuration (from
                manifest.json embedding_config). Supports: default_embedding_model

        Raises:
            EmbeddingServiceError: If provider cannot be auto-detected and none is provided
        """
        if embedding_provider is not None:
            if not isinstance(embedding_provider, BaseEmbeddingProvider):
                raise EmbeddingServiceError(
                    f"embedding_provider must be an instance of BaseEmbeddingProvider, "
                    f"got {type(embedding_provider)}"
                )
            self.embedding_provider = embedding_provider
        else:
            # Auto-detect provider from environment variables
            provider_type = _detect_provider_from_env()
            default_model = (config or {}).get(
                "default_embedding_model", "text-embedding-3-small"
            )

            if provider_type == "azure":
                self.embedding_provider = AzureOpenAIEmbeddingProvider(
                    default_model=default_model
                )
                logger.info(
                    f"Auto-detected Azure OpenAI embedding provider (model: {default_model})"
                )
            else:
                self.embedding_provider = OpenAIEmbeddingProvider(
                    default_model=default_model
                )
                logger.info(
                    f"Auto-detected OpenAI embedding provider (model: {default_model})"
                )

        # Store config for potential future use
        self.config = config or {}

    async def embed(
        self, text: Union[str, List[str]], model: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generates vector embeddings for a string or list of strings.

        Args:
            text: A single string document or a list of documents.
            model: Optional model identifier (overrides default)

        Returns:
            List[List[float]]: A list of vectors.
                If input was a single string, returns a list containing one vector.

        Example:
            ```python
            # Batch embedding (Faster)
            docs = ["Apple", "Banana", "Cherry"]
            vectors = await provider.embed(docs, model="text-embedding-3-small")

            # vectors is [[0.1, ...], [0.2, ...], [0.3, ...]]
            ```
        """
        start_time = time.time()

        try:
            vectors = await self.embedding_provider.embed(text, model)

            duration = time.time() - start_time
            item_count = 1 if isinstance(text, str) else len(text)

            logger.info(
                "EMBED_SUCCESS",
                extra={"count": item_count, "latency_sec": round(duration, 3)},
            )
            return vectors

        except (AttributeError, TypeError, ValueError, RuntimeError, KeyError) as e:
            logger.error(f"EMBED_FAILED: {str(e)}")
            raise EmbeddingServiceError(f"Embedding failed: {str(e)}") from e
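

# Usage sketch (hypothetical helper, not part of the original module):
# EmbeddingProvider.embed() is a coroutine, so the docstring examples above
# must run inside an event loop -- e.g. via asyncio.run() from sync code:
# asyncio.run(_example_wrapper_usage()).
async def _example_wrapper_usage() -> None:
    provider = EmbeddingProvider()  # auto-detects OpenAI vs. Azure from env vars
    [vector] = await provider.embed("a single document")
    print(f"embedding dimensions: {len(vector)}")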


class EmbeddingService:
    """
    Service for semantic text splitting and embedding generation.

    This service combines:
    1. Semantic text splitting (Rust-based, fast and accurate)
    2. Embedding generation (via OpenAI or AzureOpenAI, auto-detected from env vars)
    3. MongoDB storage (structured document format)

    Example:
        from mdb_engine.embeddings import EmbeddingService

        # Initialize (auto-detects OpenAI or AzureOpenAI from environment variables)
        embedding_service = EmbeddingService()

        # Process and store
        await embedding_service.process_and_store(
            text_content="Your long document here...",
            source_id="doc_101",
            collection=db.knowledge_base,
            max_tokens=1000
        )
    """

    def __init__(
        self,
        embedding_provider: Optional[EmbeddingProvider] = None,
        default_max_tokens: int = 1000,
        default_tokenizer_model: str = "gpt-3.5-turbo",
        config: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize Embedding Service.

        Args:
            embedding_provider: EmbeddingProvider instance (optional, will create
                default if None)
            default_max_tokens: Default max tokens per chunk (default: 1000)
            default_tokenizer_model: Tokenizer model name for counting tokens
                (default: "gpt-3.5-turbo"). This is ONLY for token counting
                during chunking, NOT for embeddings. Must be a valid OpenAI
                model name (e.g., "gpt-3.5-turbo", "gpt-4").
            config: Optional configuration dict (from manifest.json embedding_config)

        Raises:
            EmbeddingServiceError: If required dependencies are not available
        """
        if not SEMANTIC_SPLITTER_AVAILABLE:
            raise EmbeddingServiceError(
                "semantic-text-splitter not available. Install with: "
                "pip install semantic-text-splitter"
            )

        # Create embedding provider if not provided
        if embedding_provider is None:
            embedding_provider = EmbeddingProvider(config=config)

        self.embedding_provider = embedding_provider
        self.default_max_tokens = default_max_tokens
        self.default_tokenizer_model = default_tokenizer_model

    def _create_splitter(
        self, max_tokens: int, tokenizer_model: Optional[str] = None
    ) -> TextSplitter:
        """
        Create a TextSplitter instance.

        Args:
            max_tokens: Maximum tokens per chunk
            tokenizer_model: Tokenizer encoding for counting
                (default: uses default_tokenizer_model).
                This is ONLY for token counting, NOT for embeddings.

        Returns:
            TextSplitter instance
        """
        # Use provided tokenizer, or fall back to default (gpt-3.5-turbo)
        model = tokenizer_model or self.default_tokenizer_model
        return TextSplitter.from_tiktoken_model(model, max_tokens)

    async def chunk_text(
        self,
        text_content: str,
        max_tokens: Optional[int] = None,
        tokenizer_model: Optional[str] = None,
    ) -> List[str]:
        """
        Split text into semantic chunks.

        Uses Rust-based semantic-text-splitter for fast, accurate chunking
        that respects token limits and semantic boundaries.

        Args:
            text_content: The text to chunk
            max_tokens: Max tokens per chunk (default: uses default_max_tokens)
            tokenizer_model: Tokenizer model name for counting (optional,
                defaults to "gpt-3.5-turbo").
                This is ONLY for token counting, NOT for embeddings.
                Must be a valid OpenAI model name (e.g., "gpt-3.5-turbo", "gpt-4").

        Returns:
            List of text chunks

        Example:
            chunks = await service.chunk_text("Long document...", max_tokens=1000)
            print(f"Generated {len(chunks)} chunks")
        """
        max_tokens = max_tokens or self.default_max_tokens
        splitter = self._create_splitter(max_tokens, tokenizer_model)

        try:
            chunks = splitter.chunks(text_content)
            logger.info(f"Generated {len(chunks)} chunks (max_tokens={max_tokens})")
            return chunks
        except (ImportError, AttributeError, TypeError, ValueError, RuntimeError) as e:
            logger.error(f"Error chunking text: {e}", exc_info=True)
            raise EmbeddingServiceError(f"Chunking failed: {str(e)}") from e

    async def embed_chunks(
        self, chunks: List[str], model: Optional[str] = None
    ) -> List[List[float]]:
        """
        Generate embeddings for text chunks.

        Uses the configured embedding provider (built-in or user-provided).

        Args:
            chunks: List of text chunks to embed
            model: Optional model identifier (passed to embedding provider)

        Returns:
            List of embedding vectors (each is a list of floats)

        Example:
            chunks = ["chunk 1", "chunk 2"]
            vectors = await service.embed_chunks(chunks, model="text-embedding-3-small")
        """
        if not chunks:
            return []

        try:
            # Use EmbeddingProvider's embed method (handles logging and error wrapping)
            vectors = await self.embedding_provider.embed(chunks, model=model)
            logger.info(f"Generated {len(vectors)} embeddings")
            return vectors
        except (
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"Error generating embeddings: {e}", exc_info=True)
            raise EmbeddingServiceError(f"Embedding generation failed: {str(e)}") from e

    async def process_and_store(
        self,
        text_content: str,
        source_id: str,
        collection: Any,  # MongoDB collection (AppDB Collection or Motor collection)
        max_tokens: Optional[int] = None,
        tokenizer_model: Optional[str] = None,
        embedding_model: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Process text and store chunks with embeddings in MongoDB.

        This is the main method that:
        1. Chunks the text semantically
        2. Generates embeddings for each chunk
        3. Stores documents in MongoDB with proper structure

        Args:
            text_content: The text to process
            source_id: Unique identifier for the source document
            collection: MongoDB collection (AppDB Collection or Motor collection)
            max_tokens: Max tokens per chunk (default: uses default_max_tokens)
            tokenizer_model: Tokenizer model for counting (default: uses
                default_tokenizer_model)
            embedding_model: Embedding model (default: uses EmbeddingProvider default)
            metadata: Additional metadata to store with each chunk

        Returns:
            Dict with processing results:
                {
                    "chunks_created": int,
                    "documents_inserted": int,
                    "source_id": str
                }

        Example:
            result = await service.process_and_store(
                text_content="Long document...",
                source_id="doc_101",
                collection=db.knowledge_base,
                max_tokens=1000
            )
            print(f"Created {result['chunks_created']} chunks")
        """
        logger.info(f"Processing source: {source_id}")

        # Step 1: Chunk the text
        chunks = await self.chunk_text(
            text_content, max_tokens=max_tokens, tokenizer_model=tokenizer_model
        )

        if not chunks:
            logger.warning(f"No chunks generated for source: {source_id}")
            return {
                "chunks_created": 0,
                "documents_inserted": 0,
                "source_id": source_id,
            }

        # Step 2: Generate embeddings (batch for efficiency)
        try:
            vectors = await self.embed_chunks(chunks, model=embedding_model)
        except (
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            ConnectionError,
            OSError,
        ) as e:
            logger.error(f"Failed to generate embeddings for {source_id}: {e}")
            raise EmbeddingServiceError(f"Embedding generation failed: {str(e)}") from e

        if len(vectors) != len(chunks):
            raise EmbeddingServiceError(
                f"Mismatch: {len(chunks)} chunks but {len(vectors)} embeddings"
            )

        # Step 3: Prepare documents for insertion
        documents_to_insert = []
        for i, (chunk, vector) in enumerate(zip(chunks, vectors)):
            doc = {
                "source_id": source_id,
                "chunk_index": i,
                "text": chunk,
                "embedding": vector,
                "metadata": {
                    # "custom" means no explicit model was passed (the
                    # provider's default was used)
                    "model": embedding_model or "custom",
                    "token_count": len(chunk),  # Approximation: character count
                    "created_at": datetime.now(timezone.utc),
                },
            }

            # Add custom metadata if provided
            if metadata:
                doc["metadata"].update(metadata)

            documents_to_insert.append(doc)

        # Step 4: Store in MongoDB. AppDB Collection wrappers and Motor
        # collections expose the same awaitable insert_many API.
        try:
            result = await collection.insert_many(documents_to_insert)
            inserted_count = len(result.inserted_ids)

            logger.info(
                f"Successfully inserted {inserted_count} documents for source: {source_id}"
            )

            return {
                "chunks_created": len(chunks),
                "documents_inserted": inserted_count,
                "source_id": source_id,
            }

        except (
            AttributeError,
            TypeError,
            ValueError,
            RuntimeError,
            KeyError,
            ConnectionError,
        ) as e:
            logger.error(
                f"Failed to store documents for {source_id}: {e}", exc_info=True
            )
            raise EmbeddingServiceError(f"Storage failed: {str(e)}") from e

    async def process_text(
        self,
        text_content: str,
        max_tokens: Optional[int] = None,
        tokenizer_model: Optional[str] = None,
        embedding_model: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Process text and return chunks with embeddings (without storing).

        Useful when you want to process text but handle storage yourself.

        Args:
            text_content: The text to process
            max_tokens: Max tokens per chunk (default: uses default_max_tokens)
            tokenizer_model: Tokenizer model for counting (default: uses
                default_tokenizer_model)
            embedding_model: Embedding model (default: uses EmbeddingProvider default)

        Returns:
            List of dicts, each containing:
                {
                    "chunk_index": int,
                    "text": str,
                    "embedding": List[float],
                    "metadata": Dict[str, Any]
                }

        Example:
            results = await service.process_text("Long document...")
            for result in results:
                print(f"Chunk {result['chunk_index']}: {result['text'][:50]}...")
        """
        # Chunk the text
        chunks = await self.chunk_text(
            text_content, max_tokens=max_tokens, tokenizer_model=tokenizer_model
        )

        if not chunks:
            return []

        # Generate embeddings
        vectors = await self.embed_chunks(chunks, model=embedding_model)

        if len(vectors) != len(chunks):
            raise EmbeddingServiceError(
                f"Mismatch: {len(chunks)} chunks but {len(vectors)} embeddings"
            )

        # Prepare results
        results = []
        for i, (chunk, vector) in enumerate(zip(chunks, vectors)):
            results.append(
                {
                    "chunk_index": i,
                    "text": chunk,
                    "embedding": vector,
                    "metadata": {
                        "model": embedding_model or "custom",
                        "token_count": len(chunk),  # Approximation: character count
                        "created_at": datetime.now(timezone.utc),
                    },
                }
            )

        return results
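

# Sketch (an assumption, not from the original source): documents written by
# process_and_store() can back a MongoDB Atlas Vector Search index on the
# "embedding" field. text-embedding-3-small produces 1536-dimensional vectors;
# adjust numDimensions, and the filter fields, for your deployment.
EXAMPLE_VECTOR_INDEX_DEFINITION = {
    "fields": [
        {
            "type": "vector",
            "path": "embedding",
            "numDimensions": 1536,
            "similarity": "cosine",
        },
        # Optional filter field so searches can be scoped to one source document
        {"type": "filter", "path": "source_id"},
    ]
}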


# Dependency injection helper
def get_embedding_service(
    embedding_provider: Optional[BaseEmbeddingProvider] = None,
    config: Optional[Dict[str, Any]] = None,
) -> EmbeddingService:
    """
    Create EmbeddingService instance with auto-detected or provided embedding provider.

    Auto-detects OpenAI or AzureOpenAI from environment variables (same logic as mem0).
    Requires either OPENAI_API_KEY or AZURE_OPENAI_API_KEY + AZURE_OPENAI_ENDPOINT.

    Args:
        embedding_provider: Optional BaseEmbeddingProvider instance (will
            auto-detect if None)
        config: Optional configuration dict (from manifest.json embedding_config).
            Supports: max_tokens_per_chunk, tokenizer_model (optional, defaults
            to "gpt-3.5-turbo"), default_embedding_model

    Returns:
        EmbeddingService instance

    Example:
        from mdb_engine.embeddings import get_embedding_service

        # Auto-detects from environment variables
        embedding_service = get_embedding_service(
            config={
                "max_tokens_per_chunk": 1000,
                "default_embedding_model": "text-embedding-3-small"
            }
        )
    """
    # Platform-level defaults (users don't need to think about these)
    default_max_tokens = 1000
    # Model name for tiktoken (uses cl100k_base encoding internally)
    default_tokenizer_model = "gpt-3.5-turbo"

    # Override from config if provided (but not required)
    if config:
        default_max_tokens = config.get("max_tokens_per_chunk", default_max_tokens)
        # tokenizer_model is optional - only override if explicitly provided
        if "tokenizer_model" in config:
            default_tokenizer_model = config["tokenizer_model"]

    # Create embedding provider (auto-detects if embedding_provider is None)
    provider = EmbeddingProvider(embedding_provider=embedding_provider, config=config)

    return EmbeddingService(
        embedding_provider=provider,
        default_max_tokens=default_max_tokens,
        default_tokenizer_model=default_tokenizer_model,
        config=config,
    )
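

# End-to-end usage sketch (an assumption, not from the original source): ties
# the factory, a Motor collection, and process_and_store() together. The
# connection string, database, and collection names below are placeholders.
async def _example_end_to_end() -> None:
    from motor.motor_asyncio import AsyncIOMotorClient

    client = AsyncIOMotorClient("mongodb://localhost:27017")
    collection = client["example_db"]["knowledge_base"]

    service = get_embedding_service(
        config={
            "max_tokens_per_chunk": 1000,
            "default_embedding_model": "text-embedding-3-small",
        }
    )
    result = await service.process_and_store(
        text_content="Your long document here...",
        source_id="doc_101",
        collection=collection,
        max_tokens=1000,
    )
    print(f"Created {result['chunks_created']} chunks")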