mdb-engine 0.1.6__py3-none-any.whl → 0.4.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdb_engine/__init__.py +116 -11
- mdb_engine/auth/ARCHITECTURE.md +112 -0
- mdb_engine/auth/README.md +654 -11
- mdb_engine/auth/__init__.py +136 -29
- mdb_engine/auth/audit.py +592 -0
- mdb_engine/auth/base.py +252 -0
- mdb_engine/auth/casbin_factory.py +265 -70
- mdb_engine/auth/config_defaults.py +5 -5
- mdb_engine/auth/config_helpers.py +19 -18
- mdb_engine/auth/cookie_utils.py +12 -16
- mdb_engine/auth/csrf.py +483 -0
- mdb_engine/auth/decorators.py +10 -16
- mdb_engine/auth/dependencies.py +69 -71
- mdb_engine/auth/helpers.py +3 -3
- mdb_engine/auth/integration.py +61 -88
- mdb_engine/auth/jwt.py +11 -15
- mdb_engine/auth/middleware.py +79 -35
- mdb_engine/auth/oso_factory.py +21 -41
- mdb_engine/auth/provider.py +270 -171
- mdb_engine/auth/rate_limiter.py +505 -0
- mdb_engine/auth/restrictions.py +21 -36
- mdb_engine/auth/session_manager.py +24 -41
- mdb_engine/auth/shared_middleware.py +977 -0
- mdb_engine/auth/shared_users.py +775 -0
- mdb_engine/auth/token_lifecycle.py +10 -12
- mdb_engine/auth/token_store.py +17 -32
- mdb_engine/auth/users.py +99 -159
- mdb_engine/auth/utils.py +236 -42
- mdb_engine/cli/commands/generate.py +546 -10
- mdb_engine/cli/commands/validate.py +3 -7
- mdb_engine/cli/utils.py +7 -7
- mdb_engine/config.py +13 -28
- mdb_engine/constants.py +65 -0
- mdb_engine/core/README.md +117 -6
- mdb_engine/core/__init__.py +39 -7
- mdb_engine/core/app_registration.py +31 -50
- mdb_engine/core/app_secrets.py +289 -0
- mdb_engine/core/connection.py +20 -12
- mdb_engine/core/encryption.py +222 -0
- mdb_engine/core/engine.py +2862 -115
- mdb_engine/core/index_management.py +12 -16
- mdb_engine/core/manifest.py +628 -204
- mdb_engine/core/ray_integration.py +436 -0
- mdb_engine/core/seeding.py +13 -21
- mdb_engine/core/service_initialization.py +20 -30
- mdb_engine/core/types.py +40 -43
- mdb_engine/database/README.md +140 -17
- mdb_engine/database/__init__.py +17 -6
- mdb_engine/database/abstraction.py +37 -50
- mdb_engine/database/connection.py +51 -30
- mdb_engine/database/query_validator.py +367 -0
- mdb_engine/database/resource_limiter.py +204 -0
- mdb_engine/database/scoped_wrapper.py +747 -237
- mdb_engine/dependencies.py +427 -0
- mdb_engine/di/__init__.py +34 -0
- mdb_engine/di/container.py +247 -0
- mdb_engine/di/providers.py +206 -0
- mdb_engine/di/scopes.py +139 -0
- mdb_engine/embeddings/README.md +54 -24
- mdb_engine/embeddings/__init__.py +31 -24
- mdb_engine/embeddings/dependencies.py +38 -155
- mdb_engine/embeddings/service.py +78 -75
- mdb_engine/exceptions.py +104 -12
- mdb_engine/indexes/README.md +30 -13
- mdb_engine/indexes/__init__.py +1 -0
- mdb_engine/indexes/helpers.py +11 -11
- mdb_engine/indexes/manager.py +59 -123
- mdb_engine/memory/README.md +95 -4
- mdb_engine/memory/__init__.py +1 -2
- mdb_engine/memory/service.py +363 -1168
- mdb_engine/observability/README.md +4 -2
- mdb_engine/observability/__init__.py +26 -9
- mdb_engine/observability/health.py +17 -17
- mdb_engine/observability/logging.py +10 -10
- mdb_engine/observability/metrics.py +40 -19
- mdb_engine/repositories/__init__.py +34 -0
- mdb_engine/repositories/base.py +325 -0
- mdb_engine/repositories/mongo.py +233 -0
- mdb_engine/repositories/unit_of_work.py +166 -0
- mdb_engine/routing/README.md +1 -1
- mdb_engine/routing/__init__.py +1 -3
- mdb_engine/routing/websockets.py +41 -75
- mdb_engine/utils/__init__.py +3 -1
- mdb_engine/utils/mongo.py +117 -0
- mdb_engine-0.4.12.dist-info/METADATA +492 -0
- mdb_engine-0.4.12.dist-info/RECORD +97 -0
- {mdb_engine-0.1.6.dist-info → mdb_engine-0.4.12.dist-info}/WHEEL +1 -1
- mdb_engine-0.1.6.dist-info/METADATA +0 -213
- mdb_engine-0.1.6.dist-info/RECORD +0 -75
- {mdb_engine-0.1.6.dist-info → mdb_engine-0.4.12.dist-info}/entry_points.txt +0 -0
- {mdb_engine-0.1.6.dist-info → mdb_engine-0.4.12.dist-info}/licenses/LICENSE +0 -0
- {mdb_engine-0.1.6.dist-info → mdb_engine-0.4.12.dist-info}/top_level.txt +0 -0
mdb_engine/embeddings/dependencies.py CHANGED

@@ -1,97 +1,60 @@
 """
-Embedding Service
+Embedding Service Utilities
 
-This module provides
-
-
-"""
+This module provides utility functions for creating embedding services.
+For FastAPI dependency injection, use the request-scoped dependencies
+from `mdb_engine.dependencies` instead.
 
-
+Usage:
+    # For FastAPI routes (RECOMMENDED):
+    from mdb_engine.dependencies import get_embedding_service
 
-
-
-
+    @app.post("/embed")
+    async def embed(embedding_service=Depends(get_embedding_service)):
+        ...
 
-
-
-FASTAPI_AVAILABLE = False
+    # For standalone/utility usage:
+    from mdb_engine.embeddings.dependencies import get_embedding_service_for_app
 
-
-
-        return None
+    service = get_embedding_service_for_app("my_app", engine)
+"""
 
-
-    pass
+from typing import TYPE_CHECKING
 
+if TYPE_CHECKING:
+    from ..core.engine import MongoDBEngine
 
 from .service import EmbeddingService, get_embedding_service
 
-# Global engine registry (for apps that don't pass engine explicitly)
-_global_engine: Optional[Any] = None
-_global_app_slug: Optional[str] = None
-
-
-def set_global_engine(engine: Any, app_slug: Optional[str] = None) -> None:
-    """
-    Set global MongoDBEngine instance for embedding dependency injection.
-
-    This is useful when you have a single engine instance that you want
-    to use across all apps. Call this during application startup.
-
-    Args:
-        engine: MongoDBEngine instance
-        app_slug: Optional app slug
-    """
-    global _global_engine, _global_app_slug
-    _global_engine = engine
-    _global_app_slug = app_slug
-
-
-def get_global_engine() -> Optional[Any]:
-    """
-    Get global MongoDBEngine instance.
-
-    Returns:
-        MongoDBEngine instance if set, None otherwise
-    """
-    return _global_engine
-
 
 def get_embedding_service_for_app(
-    app_slug: str, engine:
-) ->
+    app_slug: str, engine: "MongoDBEngine"
+) -> EmbeddingService | None:
     """
-    Get embedding service for a specific app.
+    Get embedding service for a specific app using the engine instance.
 
-    This is a
-
+    This is a utility function for cases where you need to create an
+    embedding service outside of a FastAPI request context (e.g., in
+    background tasks, CLI tools, or tests).
+
+    For FastAPI routes, use `mdb_engine.dependencies.get_embedding_service` instead.
 
     Args:
-        app_slug: App slug
-        engine: MongoDBEngine instance
+        app_slug: App slug to get embedding config from
+        engine: MongoDBEngine instance
 
     Returns:
-        EmbeddingService instance if embedding is enabled
+        EmbeddingService instance if embedding is enabled, None otherwise
 
     Example:
-
-
-
-
-        @app.post("/embed")
-        async def embed_endpoint(
-            embedding_service = Depends(lambda: get_embedding_service_for_app("my_app"))
-        ):
-            if not embedding_service:
-                raise HTTPException(503, "Embedding service not available")
-            embeddings = await embedding_service.embed_chunks(["Hello world"])
-            return {"embeddings": embeddings}
-        ```
-    """
-    # Try to get engine from context if not provided
-    if engine is None:
-        engine = _global_engine
+        # In a background task or CLI
+        engine = MongoDBEngine(...)
+        await engine.initialize()
 
+        service = get_embedding_service_for_app("my_app", engine)
+        if service:
+            embeddings = await service.embed_chunks(["Hello world"])
+    """
     if engine is None:
         return None
 
@@ -108,86 +71,6 @@ def get_embedding_service_for_app(
     return get_embedding_service(config=embedding_config)
 
 
-
-""
-
-
-    This creates a dependency function that can be used with Depends()
-    to inject the embedding service into route handlers.
-
-    Args:
-        app_slug: App slug
-        engine: MongoDBEngine instance (optional)
-
-    Returns:
-        Dependency function that returns EmbeddingService or raises HTTPException
-
-    Example:
-        ```python
-        from fastapi import Depends
-        from mdb_engine.embeddings.dependencies import create_embedding_dependency
-
-        embedding_dep = create_embedding_dependency("my_app", engine)
-
-        @app.post("/embed")
-        async def embed_endpoint(embedding_service = Depends(embedding_dep)):
-            embeddings = await embedding_service.embed_chunks(["Hello world"])
-            return {"embeddings": embeddings}
-        ```
-    """
-
-    def _get_embedding_service() -> EmbeddingService:
-        embedding_service = get_embedding_service_for_app(app_slug, engine)
-        if embedding_service is None:
-            if FASTAPI_AVAILABLE:
-                raise HTTPException(
-                    status_code=503,
-                    detail=f"Embedding service not available for app '{app_slug}'. "
-                    "Ensure 'embedding_config.enabled' is true in manifest.json and "
-                    "embedding dependencies are installed.",
-                )
-            else:
-                raise RuntimeError(
-                    f"Embedding service not available for app '{app_slug}'"
-                )
-        return embedding_service
-
-    return _get_embedding_service
-
-
-def get_embedding_service_dependency(app_slug: str):
-    """
-    Get embedding service dependency using global engine.
-
-    This is a convenience function that uses the global engine registry.
-    Set the engine with set_global_engine() during app startup.
-
-    Args:
-        app_slug: App slug
-
-    Returns:
-        Dependency function for FastAPI Depends()
-
-    Example:
-        ```python
-        from fastapi import FastAPI, Depends
-        from mdb_engine.embeddings.dependencies import (
-            set_global_engine, get_embedding_service_dependency
-        )
-
-        app = FastAPI()
-
-        # During startup
-        set_global_engine(engine, app_slug="my_app")
-
-        # In routes
-        @app.post("/embed")
-        async def embed(embedding_service = Depends(get_embedding_service_dependency("my_app"))):
-            return await embedding_service.embed_chunks(["Hello world"])
-        ```
-    """
-    return create_embedding_dependency(app_slug, _global_engine)
-
-
-# Alias for backward compatibility
-get_embedding_service_dep = get_embedding_service_dependency
+__all__ = [
+    "get_embedding_service_for_app",
+]
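The net effect of this file's change is that the module-level engine registry (`set_global_engine`, `get_global_engine`, `create_embedding_dependency`, `get_embedding_service_dependency`) is removed in 0.4.12 and the engine is passed explicitly. A minimal migration sketch under that reading, assuming an already-initialized `MongoDBEngine` instance and the placeholder app slug `"my_app"`:

```python
from mdb_engine.embeddings.dependencies import get_embedding_service_for_app


async def embed_in_background(engine) -> list[list[float]]:
    # 0.1.x relied on set_global_engine(); in 0.4.x the engine is an argument.
    service = get_embedding_service_for_app("my_app", engine)
    if service is None:
        # Embedding is disabled for this app (or embedding dependencies are missing).
        return []
    return await service.embed_chunks(["Hello world"])
```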
mdb_engine/embeddings/service.py CHANGED

@@ -23,7 +23,7 @@ import os
 import time
 from abc import ABC, abstractmethod
 from datetime import datetime
-from typing import Any
+from typing import Any
 
 # Optional OpenAI SDK import
 try:
@@ -59,9 +59,7 @@ class BaseEmbeddingProvider(ABC):
     """
 
     @abstractmethod
-    async def embed(
-        self, text: Union[str, List[str]], model: Optional[str] = None
-    ) -> List[List[float]]:
+    async def embed(self, text: str | list[str], model: str | None = None) -> list[list[float]]:
         """
         Generate embeddings for text.
 
@@ -84,7 +82,7 @@ class OpenAIEmbeddingProvider(BaseEmbeddingProvider):
 
     def __init__(
         self,
-        api_key:
+        api_key: str | None = None,
         default_model: str = "text-embedding-3-small",
     ):
         """
@@ -108,9 +106,7 @@ class OpenAIEmbeddingProvider(BaseEmbeddingProvider):
         self.client = AsyncOpenAI(api_key=api_key)
         self.default_model = default_model
 
-    async def embed(
-        self, text: Union[str, List[str]], model: Optional[str] = None
-    ) -> List[List[float]]:
+    async def embed(self, text: str | list[str], model: str | None = None) -> list[list[float]]:
         """Generate embeddings using OpenAI."""
         model = model or self.default_model
 
@@ -134,7 +130,7 @@ class OpenAIEmbeddingProvider(BaseEmbeddingProvider):
             ConnectionError,
             OSError,
         ) as e:
-            logger.
+            logger.exception(f"OpenAI embedding failed: {e}")
             raise EmbeddingServiceError(f"OpenAI embedding failed: {str(e)}") from e
 
 
@@ -149,9 +145,9 @@ class AzureOpenAIEmbeddingProvider(BaseEmbeddingProvider):
 
     def __init__(
         self,
-        api_key:
-        endpoint:
-        api_version:
+        api_key: str | None = None,
+        endpoint: str | None = None,
+        api_version: str | None = None,
         default_model: str = "text-embedding-3-small",
     ):
         """
@@ -191,9 +187,7 @@ class AzureOpenAIEmbeddingProvider(BaseEmbeddingProvider):
         )
         self.default_model = default_model
 
-    async def embed(
-        self, text: Union[str, List[str]], model: Optional[str] = None
-    ) -> List[List[float]]:
+    async def embed(self, text: str | list[str], model: str | None = None) -> list[list[float]]:
         """Generate embeddings using Azure OpenAI."""
         model = model or self.default_model
 
@@ -217,10 +211,8 @@ class AzureOpenAIEmbeddingProvider(BaseEmbeddingProvider):
             ConnectionError,
             OSError,
         ) as e:
-            logger.
-            raise EmbeddingServiceError(
-                f"Azure OpenAI embedding failed: {str(e)}"
-            ) from e
+            logger.exception(f"Azure OpenAI embedding failed: {e}")
+            raise EmbeddingServiceError(f"Azure OpenAI embedding failed: {str(e)}") from e
 
 
 def _detect_provider_from_env() -> str:
@@ -257,8 +249,8 @@ class EmbeddingProvider:
 
     def __init__(
         self,
-        embedding_provider:
-        config:
+        embedding_provider: BaseEmbeddingProvider | None = None,
+        config: dict[str, Any] | None = None,
     ):
         """
         Initialize Embedding Provider.
@@ -281,31 +273,21 @@ class EmbeddingProvider:
         else:
             # Auto-detect provider from environment variables
             provider_type = _detect_provider_from_env()
-            default_model = (config or {}).get(
-                "default_embedding_model", "text-embedding-3-small"
-            )
+            default_model = (config or {}).get("default_embedding_model", "text-embedding-3-small")
 
             if provider_type == "azure":
-                self.embedding_provider = AzureOpenAIEmbeddingProvider(
-                    default_model=default_model
-                )
+                self.embedding_provider = AzureOpenAIEmbeddingProvider(default_model=default_model)
                 logger.info(
                     f"Auto-detected Azure OpenAI embedding provider (model: {default_model})"
                 )
             else:
-                self.embedding_provider = OpenAIEmbeddingProvider(
-
-                )
-                logger.info(
-                    f"Auto-detected OpenAI embedding provider (model: {default_model})"
-                )
+                self.embedding_provider = OpenAIEmbeddingProvider(default_model=default_model)
+                logger.info(f"Auto-detected OpenAI embedding provider (model: {default_model})")
 
         # Store config for potential future use
         self.config = config or {}
 
-    async def embed(
-        self, text: Union[str, List[str]], model: Optional[str] = None
-    ) -> List[List[float]]:
+    async def embed(self, text: str | list[str], model: str | None = None) -> list[list[float]]:
         """
         Generates vector embeddings for a string or list of strings.
 
@@ -341,7 +323,7 @@ class EmbeddingProvider:
             return vectors
 
         except (AttributeError, TypeError, ValueError, RuntimeError, KeyError) as e:
-            logger.
+            logger.exception(f"EMBED_FAILED: {str(e)}")
             raise EmbeddingServiceError(f"Embedding failed: {str(e)}") from e
 
 
@@ -371,10 +353,10 @@ class EmbeddingService:
 
     def __init__(
         self,
-        embedding_provider:
+        embedding_provider: EmbeddingProvider | None = None,
         default_max_tokens: int = 1000,
         default_tokenizer_model: str = "gpt-3.5-turbo",
-        config:
+        config: dict[str, Any] | None = None,
     ):
         """
         Initialize Embedding Service.
@@ -407,9 +389,7 @@ class EmbeddingService:
         self.default_max_tokens = default_max_tokens
         self.default_tokenizer_model = default_tokenizer_model
 
-    def _create_splitter(
-        self, max_tokens: int, tokenizer_model: Optional[str] = None
-    ) -> TextSplitter:
+    def _create_splitter(self, max_tokens: int, tokenizer_model: str | None = None) -> TextSplitter:
         """
         Create a TextSplitter instance.
 
@@ -429,9 +409,9 @@ class EmbeddingService:
     async def chunk_text(
         self,
         text_content: str,
-        max_tokens:
-        tokenizer_model:
-    ) ->
+        max_tokens: int | None = None,
+        tokenizer_model: str | None = None,
+    ) -> list[str]:
         """
         Split text into semantic chunks.
 
@@ -465,32 +445,39 @@ class EmbeddingService:
             logger.error(f"Error chunking text: {e}", exc_info=True)
             raise EmbeddingServiceError(f"Chunking failed: {str(e)}") from e
 
-    async def
-        self, chunks: List[str], model: Optional[str] = None
-    ) -> List[List[float]]:
+    async def embed(self, text: str | list[str], model: str | None = None) -> list[list[float]]:
         """
-        Generate embeddings for text
+        Generate embeddings for text or a list of texts.
 
-
+        Natural API that works with both single strings and lists.
 
         Args:
-
+            text: A single string or list of strings to embed
             model: Optional model identifier (passed to embedding provider)
 
         Returns:
-            List of embedding vectors (each is a list of floats)
+            List of embedding vectors (each is a list of floats).
+            If input was a single string, returns a list containing one vector.
 
         Example:
-
-            vectors = await service.
+            # Single string
+            vectors = await service.embed("Hello world", model="text-embedding-3-small")
+            # vectors is [[0.1, 0.2, ...]]
+
+            # List of strings (batch - more efficient)
+            vectors = await service.embed(["chunk 1", "chunk 2"], model="text-embedding-3-small")
+            # vectors is [[0.1, ...], [0.2, ...]]
         """
+        # Normalize to list
+        chunks = [text] if isinstance(text, str) else text
+
         if not chunks:
             return []
 
         try:
             # Use EmbeddingProvider's embed method (handles retries, logging, etc.)
             vectors = await self.embedding_provider.embed(chunks, model=model)
-            logger.info(f"Generated {len(vectors)}
+            logger.info(f"Generated {len(vectors)} embedding(s)")
             return vectors
         except (
             AttributeError,
@@ -503,16 +490,36 @@ class EmbeddingService:
             logger.error(f"Error generating embeddings: {e}", exc_info=True)
             raise EmbeddingServiceError(f"Embedding generation failed: {str(e)}") from e
 
+    async def embed_chunks(self, chunks: list[str], model: str | None = None) -> list[list[float]]:
+        """
+        Generate embeddings for text chunks (list only).
+
+        DEPRECATED: Use embed() instead, which accepts both strings and lists.
+        This method is kept for backward compatibility.
+
+        Args:
+            chunks: List of text chunks to embed
+            model: Optional model identifier (passed to embedding provider)
+
+        Returns:
+            List of embedding vectors (each is a list of floats)
+
+        Example:
+            chunks = ["chunk 1", "chunk 2"]
+            vectors = await service.embed_chunks(chunks, model="text-embedding-3-small")
+        """
+        return await self.embed(chunks, model=model)
+
     async def process_and_store(
         self,
         text_content: str,
         source_id: str,
         collection: Any,  # MongoDB collection (AppDB Collection or Motor collection)
-        max_tokens:
-        tokenizer_model:
-        embedding_model:
-        metadata:
-    ) ->
+        max_tokens: int | None = None,
+        tokenizer_model: str | None = None,
+        embedding_model: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
         """
         Process text and store chunks with embeddings in MongoDB.
 
@@ -573,7 +580,7 @@ class EmbeddingService:
             ConnectionError,
             OSError,
         ) as e:
-            logger.
+            logger.exception(f"Failed to generate embeddings for {source_id}: {e}")
             raise EmbeddingServiceError(f"Embedding generation failed: {str(e)}") from e
 
         if len(vectors) != len(chunks):
@@ -583,7 +590,7 @@ class EmbeddingService:
 
         # Step 3: Prepare documents for insertion
         documents_to_insert = []
-        for i, (chunk_text, vector) in enumerate(zip(chunks, vectors)):
+        for i, (chunk_text, vector) in enumerate(zip(chunks, vectors, strict=False)):
             doc = {
                 "source_id": source_id,
                 "chunk_index": i,
@@ -614,9 +621,7 @@ class EmbeddingService:
         result = await collection.insert_many(documents_to_insert)
         inserted_count = len(result.inserted_ids)
 
-        logger.info(
-            f"Successfully inserted {inserted_count} documents for source: {source_id}"
-        )
+        logger.info(f"Successfully inserted {inserted_count} documents for source: {source_id}")
 
         return {
             "chunks_created": len(chunks),
@@ -632,18 +637,16 @@ class EmbeddingService:
             KeyError,
             ConnectionError,
         ) as e:
-            logger.error(
-                f"Failed to store documents for {source_id}: {e}", exc_info=True
-            )
+            logger.error(f"Failed to store documents for {source_id}: {e}", exc_info=True)
             raise EmbeddingServiceError(f"Storage failed: {str(e)}") from e
 
     async def process_text(
         self,
        text_content: str,
-        max_tokens:
-        tokenizer_model:
-        embedding_model:
-    ) ->
+        max_tokens: int | None = None,
+        tokenizer_model: str | None = None,
+        embedding_model: str | None = None,
+    ) -> list[dict[str, Any]]:
         """
         Process text and return chunks with embeddings (without storing).
 
@@ -687,7 +690,7 @@ class EmbeddingService:
 
         # Prepare results
         results = []
-        for i, (chunk_text, vector) in enumerate(zip(chunks, vectors)):
+        for i, (chunk_text, vector) in enumerate(zip(chunks, vectors, strict=False)):
             results.append(
                 {
                     "chunk_index": i,
@@ -706,8 +709,8 @@ class EmbeddingService:
 
 # Dependency injection helper
 def get_embedding_service(
-    embedding_provider:
-    config:
+    embedding_provider: BaseEmbeddingProvider | None = None,
+    config: dict[str, Any] | None = None,
 ) -> EmbeddingService:
     """
     Create EmbeddingService instance with auto-detected or provided embedding provider.