nucliadb 6.4.0.post4196__py3-none-any.whl → 6.4.0.post4204__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/back_pressure/__init__.py +20 -0
- nucliadb/common/back_pressure/cache.py +86 -0
- nucliadb/common/back_pressure/materializer.py +315 -0
- nucliadb/common/back_pressure/settings.py +72 -0
- nucliadb/common/back_pressure/utils.py +59 -0
- nucliadb/search/search/chat/ask.py +1 -1
- nucliadb/writer/api/v1/export_import.py +2 -2
- nucliadb/writer/api/v1/field.py +3 -3
- nucliadb/writer/api/v1/resource.py +5 -5
- nucliadb/writer/api/v1/upload.py +3 -3
- nucliadb/writer/lifecycle.py +2 -2
- nucliadb/writer/settings.py +0 -51
- {nucliadb-6.4.0.post4196.dist-info → nucliadb-6.4.0.post4204.dist-info}/METADATA +6 -6
- {nucliadb-6.4.0.post4196.dist-info → nucliadb-6.4.0.post4204.dist-info}/RECORD +17 -13
- nucliadb/writer/back_pressure.py +0 -485
- {nucliadb-6.4.0.post4196.dist-info → nucliadb-6.4.0.post4204.dist-info}/WHEEL +0 -0
- {nucliadb-6.4.0.post4196.dist-info → nucliadb-6.4.0.post4204.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.4.0.post4196.dist-info → nucliadb-6.4.0.post4204.dist-info}/top_level.txt +0 -0
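
The diff below shows the writer-local back-pressure module being removed; judging from the file list above, its functionality appears to move into the new nucliadb/common/back_pressure/ package, with small import-sized updates across the writer API modules. For orientation only, a hypothetical caller of the public helper would look roughly like this. The sketch uses the signature of the deleted module shown below; whether the new package re-exports maybe_back_pressure unchanged is an assumption not verified against this release:

# Hypothetical sketch, not taken from this diff: assumes nucliadb.common.back_pressure
# re-exports maybe_back_pressure with the same signature as the deleted writer module.
from typing import Optional
from fastapi import Request
from nucliadb.common.back_pressure import maybe_back_pressure  # assumed new import path

async def create_resource(request: Request, kbid: str, resource_uuid: Optional[str] = None):
    # Raises HTTPException(429) with an estimated retry time when the ingest,
    # indexing, or processing queues are too far behind.
    await maybe_back_pressure(request, kbid, resource_uuid=resource_uuid)
    ...  # proceed with the write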
nucliadb/writer/back_pressure.py
DELETED
@@ -1,485 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-
-import asyncio
-import contextlib
-import threading
-from dataclasses import dataclass
-from datetime import datetime, timedelta
-from typing import Optional
-
-from cachetools import TTLCache
-from fastapi import HTTPException, Request
-
-from nucliadb.common import datamanagers
-from nucliadb.common.context import ApplicationContext
-from nucliadb.common.context.fastapi import get_app_context
-from nucliadb.common.http_clients.processing import ProcessingHTTPClient
-from nucliadb.writer import logger
-from nucliadb.writer.settings import back_pressure_settings as settings
-from nucliadb_protos.writer_pb2 import ShardObject
-from nucliadb_telemetry import metrics
-from nucliadb_utils import const
-from nucliadb_utils.nats import NatsConnectionManager
-from nucliadb_utils.settings import is_onprem_nucliadb
-
-__all__ = ["maybe_back_pressure"]
-
-
-back_pressure_observer = metrics.Observer("nucliadb_back_pressure", labels={"type": ""})
-
-
-RATE_LIMITED_REQUESTS_COUNTER = metrics.Counter(
-    "nucliadb_rate_limited_requests", labels={"type": "", "cached": ""}
-)
-
-
-@dataclass
-class BackPressureData:
-    type: str
-    try_after: datetime
-
-
-class BackPressureException(Exception):
-    def __init__(self, data: BackPressureData):
-        self.data = data
-
-
-def is_back_pressure_enabled() -> bool:
-    return settings.enabled
-
-
-class BackPressureCache:
-    """
-    Global cache for storing already computed try again in times.
-
-    It allows us to avoid making the same calculations multiple
-    times if back pressure has been applied.
-    """
-
-    def __init__(self):
-        self._cache = TTLCache(maxsize=1024, ttl=5 * 60)
-        self._lock = threading.Lock()
-
-    def get(self, key: str) -> Optional[BackPressureData]:
-        with self._lock:
-            data = self._cache.get(key, None)
-            if data is None:
-                return None
-            if datetime.utcnow() >= data.try_after:
-                # The key has expired, so remove it from the cache
-                self._cache.pop(key, None)
-                return None
-            return data
-
-    def set(self, key: str, data: BackPressureData):
-        with self._lock:
-            self._cache[key] = data
-
-
-_cache = BackPressureCache()
-
-
-@contextlib.contextmanager
-def cached_back_pressure(kbid: str, resource_uuid: Optional[str] = None):
-    """
-    Context manager that handles the caching of the try again in time so that
-    we don't recompute try again times if we have already applied back pressure.
-    """
-
-    cache_key = "-".join([kbid, resource_uuid or ""])
-
-    data: Optional[BackPressureData] = _cache.get(cache_key)
-    if data is not None:
-        try_after = data.try_after
-        back_pressure_type = data.type
-        RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})
-        logger.info(
-            "Back pressure applied from cache",
-            extra={
-                "type": back_pressure_type,
-                "try_after": try_after,
-                "kbid": kbid,
-                "resource_uuid": resource_uuid,
-            },
-        )
-        raise HTTPException(
-            status_code=429,
-            detail={
-                "message": f"Too many messages pending to ingest. Retry after {try_after}",
-                "try_after": try_after.timestamp(),
-                "back_pressure_type": back_pressure_type,
-            },
-        )
-    try:
-        yield
-    except BackPressureException as exc:
-        try_after = exc.data.try_after
-        back_pressure_type = exc.data.type
-        RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "false"})
-        _cache.set(cache_key, exc.data)
-        raise HTTPException(
-            status_code=429,
-            detail={
-                "message": f"Too many messages pending to ingest. Retry after {try_after}",
-                "try_after": try_after.timestamp(),
-                "back_pressure_type": back_pressure_type,
-            },
-        )
-
-
-class Materializer:
-    """
-    Singleton class that will run in the background gathering the different
-    stats to apply back pressure and materializing it in memory. This allows us
-    to do stale-reads when checking if back pressure is needed for a particular
-    request - thus not slowing it down.
-    """
-
-    def __init__(
-        self,
-        nats_manager: NatsConnectionManager,
-        indexing_check_interval: int = 30,
-        ingest_check_interval: int = 30,
-    ):
-        self.nats_manager = nats_manager
-        self.processing_http_client = ProcessingHTTPClient()
-
-        self.indexing_check_interval = indexing_check_interval
-        self.ingest_check_interval = ingest_check_interval
-
-        self.ingest_pending: int = 0
-        self.indexing_pending: int = 0
-
-        self._tasks: list[asyncio.Task] = []
-        self._running = False
-
-        self.processing_pending_cache = TTLCache(maxsize=1024, ttl=60)  # type: ignore
-        self.processing_pending_locks: dict[str, asyncio.Lock] = {}
-
-    async def start(self):
-        self._tasks.append(asyncio.create_task(self._get_indexing_pending_task()))
-        self._tasks.append(asyncio.create_task(self._get_ingest_pending_task()))
-        self._running = True
-
-    async def stop(self):
-        for task in self._tasks:
-            task.cancel()
-        self._tasks.clear()
-        await self.processing_http_client.close()
-        self._running = False
-
-    @property
-    def running(self) -> bool:
-        return self._running
-
-    async def get_processing_pending(self, kbid: str) -> int:
-        """
-        We don't materialize the pending messages for every kbid, but values are cached for some time.
-        """
-        cached = self.processing_pending_cache.get(kbid)
-        if cached is not None:
-            return cached
-
-        lock = self.processing_pending_locks.setdefault(kbid, asyncio.Lock())
-        async with lock:
-            # Check again if the value has been cached while we were waiting for the lock
-            cached = self.processing_pending_cache.get(kbid)
-            if cached is not None:
-                return cached
-
-            # Get the pending messages and cache the result
-            try:
-                with back_pressure_observer({"type": "get_processing_pending"}):
-                    pending = await self._get_processing_pending(kbid)
-            except Exception:
-                # Do not cache if there was an error
-                logger.exception(
-                    "Error getting pending messages to process. Back pressure on proccessing for KB can't be applied.",
-                    exc_info=True,
-                    extra={"kbid": kbid},
-                )
-                return 0
-
-            if pending > 0:
-                logger.info(
-                    f"Processing returned {pending} pending messages for KB",
-                    extra={"kbid": kbid},
-                )
-            self.processing_pending_cache[kbid] = pending
-            return pending
-
-    async def _get_processing_pending(self, kbid: str) -> int:
-        response = await self.processing_http_client.stats(kbid=kbid, timeout=0.5)
-        return response.incomplete
-
-    def get_indexing_pending(self) -> int:
-        return self.indexing_pending
-
-    def get_ingest_pending(self) -> int:
-        return self.ingest_pending
-
-    async def _get_indexing_pending_task(self):
-        try:
-            while True:
-                try:
-                    with back_pressure_observer({"type": "get_indexing_pending"}):
-                        self.indexing_pending = await get_nats_consumer_pending_messages(
-                            self.nats_manager,
-                            stream="nidx",
-                            consumer="nidx",
-                        )
-                except Exception:
-                    logger.exception(
-                        "Error getting pending messages to index",
-                        exc_info=True,
-                    )
-                await asyncio.sleep(self.indexing_check_interval)
-        except asyncio.CancelledError:
-            pass
-
-    async def _get_ingest_pending_task(self):
-        try:
-            while True:
-                try:
-                    with back_pressure_observer({"type": "get_ingest_pending"}):
-                        self.ingest_pending = await get_nats_consumer_pending_messages(
-                            self.nats_manager,
-                            stream=const.Streams.INGEST_PROCESSED.name,
-                            consumer=const.Streams.INGEST_PROCESSED.group,
-                        )
-                except Exception:
-                    logger.exception(
-                        "Error getting pending messages to ingest",
-                        exc_info=True,
-                    )
-                await asyncio.sleep(self.ingest_check_interval)
-        except asyncio.CancelledError:
-            pass
-
-
-MATERIALIZER: Optional[Materializer] = None
-materializer_lock = threading.Lock()
-
-
-async def start_materializer(context: ApplicationContext):
-    global MATERIALIZER
-    if MATERIALIZER is not None:
-        logger.info("Materializer already started")
-        return
-    with materializer_lock:
-        if MATERIALIZER is not None:
-            return
-        logger.info("Initializing materializer")
-        try:
-            nats_manager = context.nats_manager
-        except AttributeError:
-            logger.warning(
-                "Could not initialize materializer. Nats manager not found or not initialized yet"
-            )
-            return
-        materializer = Materializer(
-            nats_manager,
-            indexing_check_interval=settings.indexing_check_interval,
-            ingest_check_interval=settings.ingest_check_interval,
-        )
-        await materializer.start()
-        MATERIALIZER = materializer
-
-
-async def stop_materializer():
-    global MATERIALIZER
-    if MATERIALIZER is None or not MATERIALIZER.running:
-        logger.info("Materializer already stopped")
-        return
-    with materializer_lock:
-        if MATERIALIZER is None:
-            return
-        logger.info("Stopping materializer")
-        await MATERIALIZER.stop()
-        MATERIALIZER = None
-
-
-def get_materializer() -> Materializer:
-    global MATERIALIZER
-    if MATERIALIZER is None:
-        raise RuntimeError("Materializer not initialized")
-    return MATERIALIZER
-
-
-async def maybe_back_pressure(request: Request, kbid: str, resource_uuid: Optional[str] = None) -> None:
-    """
-    This function does system checks to see if we need to put back pressure on writes.
-    In that case, a HTTP 429 will be raised with the estimated time to try again.
-    """
-    if not is_back_pressure_enabled() or is_onprem_nucliadb():
-        return
-    await back_pressure_checks(request, kbid, resource_uuid)
-
-
-async def back_pressure_checks(request: Request, kbid: str, resource_uuid: Optional[str] = None):
-    """
-    Will raise a 429 if back pressure is needed:
-    - If the processing engine is behind.
-    - If ingest processed consumer is behind.
-    - If the indexing on nodes affected by the request (kbid, and resource_uuid) is behind.
-    """
-    context = get_app_context(request.app)
-    materializer = get_materializer()
-    with cached_back_pressure(kbid, resource_uuid):
-        check_ingest_behind(materializer.get_ingest_pending())
-        await check_indexing_behind(context, kbid, resource_uuid, materializer.get_indexing_pending())
-        await check_processing_behind(materializer, kbid)
-
-
-async def check_processing_behind(materializer: Materializer, kbid: str):
-    """
-    This function checks if the processing engine is behind and may raise a 429
-    if it is further behind than the configured threshold.
-    """
-    max_pending = settings.max_processing_pending
-    if max_pending <= 0:
-        # Processing back pressure is disabled
-        return
-
-    kb_pending = await materializer.get_processing_pending(kbid)
-    if kb_pending > max_pending:
-        try_after = estimate_try_after(
-            rate=settings.processing_rate,
-            pending=kb_pending,
-            max_wait=settings.max_wait_time,
-        )
-        data = BackPressureData(type="processing", try_after=try_after)
-        logger.info(
-            "Processing back pressure applied",
-            extra={
-                "kbid": kbid,
-                "try_after": try_after,
-                "pending": kb_pending,
-            },
-        )
-        raise BackPressureException(data)
-
-
-async def check_indexing_behind(
-    context: ApplicationContext,
-    kbid: str,
-    resource_uuid: Optional[str],
-    pending: int,
-):
-    """
-    If a resource uuid is provided, it will check the nodes that have the replicas
-    of the resource's shard, otherwise it will check the nodes of all active shards
-    for the KnowledgeBox.
-    """
-    max_pending = settings.max_indexing_pending
-    if max_pending <= 0:
-        # Indexing back pressure is disabled
-        return
-
-    if pending > max_pending:
-        try_after = estimate_try_after(
-            rate=settings.indexing_rate,
-            pending=pending,
-            max_wait=settings.max_wait_time,
-        )
-        data = BackPressureData(type="indexing", try_after=try_after)
-        logger.info(
-            "Indexing back pressure applied",
-            extra={
-                "kbid": kbid,
-                "resource_uuid": resource_uuid,
-                "try_after": try_after,
-                "pending": pending,
-            },
-        )
-        raise BackPressureException(data)
-
-
-def check_ingest_behind(ingest_pending: int):
-    max_pending = settings.max_ingest_pending
-    if max_pending <= 0:
-        # Ingest back pressure is disabled
-        return
-
-    if ingest_pending > max_pending:
-        try_after = estimate_try_after(
-            rate=settings.ingest_rate,
-            pending=ingest_pending,
-            max_wait=settings.max_wait_time,
-        )
-        data = BackPressureData(type="ingest", try_after=try_after)
-        logger.info(
-            "Ingest back pressure applied",
-            extra={"try_after": try_after, "pending": ingest_pending},
-        )
-        raise BackPressureException(data)
-
-
-def estimate_try_after(rate: float, pending: int, max_wait: int) -> datetime:
-    """
-    This function estimates the time to try again based on the rate and the number of pending messages.
-    """
-    delta_seconds = min(pending / rate, max_wait)
-    return datetime.utcnow() + timedelta(seconds=delta_seconds)
-
-
-async def get_nats_consumer_pending_messages(
-    nats_manager: NatsConnectionManager, *, stream: str, consumer: str
-) -> int:
-    # get raw js client
-    js = nats_manager.js
-    consumer_info = await js.consumer_info(stream, consumer)
-    return consumer_info.num_pending
-
-
-async def get_kb_active_shard(context: ApplicationContext, kbid: str) -> Optional[ShardObject]:
-    async with context.kv_driver.transaction(read_only=True) as txn:
-        return await context.shard_manager.get_current_active_shard(txn, kbid)
-
-
-async def get_resource_shard(
-    context: ApplicationContext, kbid: str, resource_uuid: str
-) -> Optional[ShardObject]:
-    async with datamanagers.with_ro_transaction() as txn:
-        shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=resource_uuid)
-        if shard_id is None:
-            # Resource does not exist
-            logger.debug(
-                "Resource shard not found",
-                extra={"kbid": kbid, "resource_uuid": resource_uuid},
-            )
-            return None
-
-        all_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
-        if all_shards is None:
-            # KB doesn't exist or has been deleted
-            logger.debug("No shards found for KB", extra={"kbid": kbid})
-            return None
-
-        for shard in all_shards.shards:
-            if shard.shard == shard_id:
-                return shard
-        else:
-            logger.error(
-                "Resource shard not found",
-                extra={"kbid": kbid, "resource_uuid": resource_uuid, "shard_id": shard_id},
-            )
-            return None
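
When any of the checks in the module above trips, the server responds with HTTP 429 and a detail payload carrying message, try_after (a timestamp), and back_pressure_type. A minimal client-side sketch of honoring that payload follows; the field names mirror the HTTPException detail above, while the URL handling and the use of httpx are illustrative assumptions rather than part of this release:

# Illustrative only: retries a write when nucliadb answers 429 with the
# back-pressure payload raised by the deleted module shown above.
import time
import httpx  # assumed HTTP client; any client with status codes and JSON works

def post_with_back_pressure_retry(url: str, payload: dict, max_attempts: int = 5) -> httpx.Response:
    resp = httpx.post(url, json=payload)
    for _ in range(max_attempts - 1):
        if resp.status_code != 429:
            break
        detail = resp.json().get("detail", {})
        try_after = detail.get("try_after")  # value produced by try_after.timestamp()
        wait_s = max(0.0, try_after - time.time()) if try_after else 1.0
        time.sleep(wait_s)
        resp = httpx.post(url, json=payload)
    return resp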