nucliadb 6.4.0.post4196__py3-none-any.whl → 6.4.0.post4204__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,485 +0,0 @@
- # Copyright (C) 2021 Bosutech XXI S.L.
- #
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
- # For commercial licensing, contact us at info@nuclia.com.
- #
- # AGPL:
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Affero General Public License as
- # published by the Free Software Foundation, either version 3 of the
- # License, or (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Affero General Public License for more details.
- #
- # You should have received a copy of the GNU Affero General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- #
-
- import asyncio
- import contextlib
- import threading
- from dataclasses import dataclass
- from datetime import datetime, timedelta
- from typing import Optional
-
- from cachetools import TTLCache
- from fastapi import HTTPException, Request
-
- from nucliadb.common import datamanagers
- from nucliadb.common.context import ApplicationContext
- from nucliadb.common.context.fastapi import get_app_context
- from nucliadb.common.http_clients.processing import ProcessingHTTPClient
- from nucliadb.writer import logger
- from nucliadb.writer.settings import back_pressure_settings as settings
- from nucliadb_protos.writer_pb2 import ShardObject
- from nucliadb_telemetry import metrics
- from nucliadb_utils import const
- from nucliadb_utils.nats import NatsConnectionManager
- from nucliadb_utils.settings import is_onprem_nucliadb
-
- __all__ = ["maybe_back_pressure"]
-
-
- back_pressure_observer = metrics.Observer("nucliadb_back_pressure", labels={"type": ""})
-
-
- RATE_LIMITED_REQUESTS_COUNTER = metrics.Counter(
-     "nucliadb_rate_limited_requests", labels={"type": "", "cached": ""}
- )
-
-
- @dataclass
- class BackPressureData:
-     type: str
-     try_after: datetime
-
-
- class BackPressureException(Exception):
-     def __init__(self, data: BackPressureData):
-         self.data = data
-
-
- def is_back_pressure_enabled() -> bool:
-     return settings.enabled
-
-
- class BackPressureCache:
-     """
-     Global cache for storing already computed try again in times.
-
-     It allows us to avoid making the same calculations multiple
-     times if back pressure has been applied.
-     """
-
-     def __init__(self):
-         self._cache = TTLCache(maxsize=1024, ttl=5 * 60)
-         self._lock = threading.Lock()
-
-     def get(self, key: str) -> Optional[BackPressureData]:
-         with self._lock:
-             data = self._cache.get(key, None)
-             if data is None:
-                 return None
-             if datetime.utcnow() >= data.try_after:
-                 # The key has expired, so remove it from the cache
-                 self._cache.pop(key, None)
-                 return None
-             return data
-
-     def set(self, key: str, data: BackPressureData):
-         with self._lock:
-             self._cache[key] = data
-
-
- _cache = BackPressureCache()
-
-
- @contextlib.contextmanager
- def cached_back_pressure(kbid: str, resource_uuid: Optional[str] = None):
-     """
-     Context manager that handles the caching of the try again in time so that
-     we don't recompute try again times if we have already applied back pressure.
-     """
-
-     cache_key = "-".join([kbid, resource_uuid or ""])
-
-     data: Optional[BackPressureData] = _cache.get(cache_key)
-     if data is not None:
-         try_after = data.try_after
-         back_pressure_type = data.type
-         RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})
-         logger.info(
-             "Back pressure applied from cache",
-             extra={
-                 "type": back_pressure_type,
-                 "try_after": try_after,
-                 "kbid": kbid,
-                 "resource_uuid": resource_uuid,
-             },
-         )
-         raise HTTPException(
-             status_code=429,
-             detail={
-                 "message": f"Too many messages pending to ingest. Retry after {try_after}",
-                 "try_after": try_after.timestamp(),
-                 "back_pressure_type": back_pressure_type,
-             },
-         )
-     try:
-         yield
-     except BackPressureException as exc:
-         try_after = exc.data.try_after
-         back_pressure_type = exc.data.type
-         RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "false"})
-         _cache.set(cache_key, exc.data)
-         raise HTTPException(
-             status_code=429,
-             detail={
-                 "message": f"Too many messages pending to ingest. Retry after {try_after}",
-                 "try_after": try_after.timestamp(),
-                 "back_pressure_type": back_pressure_type,
-             },
-         )
-
-
- class Materializer:
-     """
-     Singleton class that will run in the background gathering the different
-     stats to apply back pressure and materializing it in memory. This allows us
-     to do stale-reads when checking if back pressure is needed for a particular
-     request - thus not slowing it down.
-     """
-
-     def __init__(
-         self,
-         nats_manager: NatsConnectionManager,
-         indexing_check_interval: int = 30,
-         ingest_check_interval: int = 30,
-     ):
-         self.nats_manager = nats_manager
-         self.processing_http_client = ProcessingHTTPClient()
-
-         self.indexing_check_interval = indexing_check_interval
-         self.ingest_check_interval = ingest_check_interval
-
-         self.ingest_pending: int = 0
-         self.indexing_pending: int = 0
-
-         self._tasks: list[asyncio.Task] = []
-         self._running = False
-
-         self.processing_pending_cache = TTLCache(maxsize=1024, ttl=60)  # type: ignore
-         self.processing_pending_locks: dict[str, asyncio.Lock] = {}
-
-     async def start(self):
-         self._tasks.append(asyncio.create_task(self._get_indexing_pending_task()))
-         self._tasks.append(asyncio.create_task(self._get_ingest_pending_task()))
-         self._running = True
-
-     async def stop(self):
-         for task in self._tasks:
-             task.cancel()
-         self._tasks.clear()
-         await self.processing_http_client.close()
-         self._running = False
-
-     @property
-     def running(self) -> bool:
-         return self._running
-
-     async def get_processing_pending(self, kbid: str) -> int:
-         """
-         We don't materialize the pending messages for every kbid, but values are cached for some time.
-         """
-         cached = self.processing_pending_cache.get(kbid)
-         if cached is not None:
-             return cached
-
-         lock = self.processing_pending_locks.setdefault(kbid, asyncio.Lock())
-         async with lock:
-             # Check again if the value has been cached while we were waiting for the lock
-             cached = self.processing_pending_cache.get(kbid)
-             if cached is not None:
-                 return cached
-
-             # Get the pending messages and cache the result
-             try:
-                 with back_pressure_observer({"type": "get_processing_pending"}):
-                     pending = await self._get_processing_pending(kbid)
-             except Exception:
-                 # Do not cache if there was an error
-                 logger.exception(
-                     "Error getting pending messages to process. Back pressure on processing for KB can't be applied.",
-                     exc_info=True,
-                     extra={"kbid": kbid},
-                 )
-                 return 0
-
-             if pending > 0:
-                 logger.info(
-                     f"Processing returned {pending} pending messages for KB",
-                     extra={"kbid": kbid},
-                 )
-             self.processing_pending_cache[kbid] = pending
-             return pending
-
-     async def _get_processing_pending(self, kbid: str) -> int:
-         response = await self.processing_http_client.stats(kbid=kbid, timeout=0.5)
-         return response.incomplete
-
-     def get_indexing_pending(self) -> int:
-         return self.indexing_pending
-
-     def get_ingest_pending(self) -> int:
-         return self.ingest_pending
-
-     async def _get_indexing_pending_task(self):
-         try:
-             while True:
-                 try:
-                     with back_pressure_observer({"type": "get_indexing_pending"}):
-                         self.indexing_pending = await get_nats_consumer_pending_messages(
-                             self.nats_manager,
-                             stream="nidx",
-                             consumer="nidx",
-                         )
-                 except Exception:
-                     logger.exception(
-                         "Error getting pending messages to index",
-                         exc_info=True,
-                     )
-                 await asyncio.sleep(self.indexing_check_interval)
-         except asyncio.CancelledError:
-             pass
-
-     async def _get_ingest_pending_task(self):
-         try:
-             while True:
-                 try:
-                     with back_pressure_observer({"type": "get_ingest_pending"}):
-                         self.ingest_pending = await get_nats_consumer_pending_messages(
-                             self.nats_manager,
-                             stream=const.Streams.INGEST_PROCESSED.name,
-                             consumer=const.Streams.INGEST_PROCESSED.group,
-                         )
-                 except Exception:
-                     logger.exception(
-                         "Error getting pending messages to ingest",
-                         exc_info=True,
-                     )
-                 await asyncio.sleep(self.ingest_check_interval)
-         except asyncio.CancelledError:
-             pass
-
-
- MATERIALIZER: Optional[Materializer] = None
- materializer_lock = threading.Lock()
-
-
- async def start_materializer(context: ApplicationContext):
-     global MATERIALIZER
-     if MATERIALIZER is not None:
-         logger.info("Materializer already started")
-         return
-     with materializer_lock:
-         if MATERIALIZER is not None:
-             return
-         logger.info("Initializing materializer")
-         try:
-             nats_manager = context.nats_manager
-         except AttributeError:
-             logger.warning(
-                 "Could not initialize materializer. Nats manager not found or not initialized yet"
-             )
-             return
-         materializer = Materializer(
-             nats_manager,
-             indexing_check_interval=settings.indexing_check_interval,
-             ingest_check_interval=settings.ingest_check_interval,
-         )
-         await materializer.start()
-         MATERIALIZER = materializer
-
-
- async def stop_materializer():
-     global MATERIALIZER
-     if MATERIALIZER is None or not MATERIALIZER.running:
-         logger.info("Materializer already stopped")
-         return
-     with materializer_lock:
-         if MATERIALIZER is None:
-             return
-         logger.info("Stopping materializer")
-         await MATERIALIZER.stop()
-         MATERIALIZER = None
-
-
- def get_materializer() -> Materializer:
-     global MATERIALIZER
-     if MATERIALIZER is None:
-         raise RuntimeError("Materializer not initialized")
-     return MATERIALIZER
-
-
- async def maybe_back_pressure(request: Request, kbid: str, resource_uuid: Optional[str] = None) -> None:
-     """
-     This function does system checks to see if we need to put back pressure on writes.
-     In that case, a HTTP 429 will be raised with the estimated time to try again.
-     """
-     if not is_back_pressure_enabled() or is_onprem_nucliadb():
-         return
-     await back_pressure_checks(request, kbid, resource_uuid)
-
-
- async def back_pressure_checks(request: Request, kbid: str, resource_uuid: Optional[str] = None):
-     """
-     Will raise a 429 if back pressure is needed:
-     - If the processing engine is behind.
-     - If ingest processed consumer is behind.
-     - If the indexing on nodes affected by the request (kbid, and resource_uuid) is behind.
-     """
-     context = get_app_context(request.app)
-     materializer = get_materializer()
-     with cached_back_pressure(kbid, resource_uuid):
-         check_ingest_behind(materializer.get_ingest_pending())
-         await check_indexing_behind(context, kbid, resource_uuid, materializer.get_indexing_pending())
-         await check_processing_behind(materializer, kbid)
-
-
- async def check_processing_behind(materializer: Materializer, kbid: str):
-     """
-     This function checks if the processing engine is behind and may raise a 429
-     if it is further behind than the configured threshold.
-     """
-     max_pending = settings.max_processing_pending
-     if max_pending <= 0:
-         # Processing back pressure is disabled
-         return
-
-     kb_pending = await materializer.get_processing_pending(kbid)
-     if kb_pending > max_pending:
-         try_after = estimate_try_after(
-             rate=settings.processing_rate,
-             pending=kb_pending,
-             max_wait=settings.max_wait_time,
-         )
-         data = BackPressureData(type="processing", try_after=try_after)
-         logger.info(
-             "Processing back pressure applied",
-             extra={
-                 "kbid": kbid,
-                 "try_after": try_after,
-                 "pending": kb_pending,
-             },
-         )
-         raise BackPressureException(data)
-
-
- async def check_indexing_behind(
-     context: ApplicationContext,
-     kbid: str,
-     resource_uuid: Optional[str],
-     pending: int,
- ):
-     """
-     If a resource uuid is provided, it will check the nodes that have the replicas
-     of the resource's shard, otherwise it will check the nodes of all active shards
-     for the KnowledgeBox.
-     """
-     max_pending = settings.max_indexing_pending
-     if max_pending <= 0:
-         # Indexing back pressure is disabled
-         return
-
-     if pending > max_pending:
-         try_after = estimate_try_after(
-             rate=settings.indexing_rate,
-             pending=pending,
-             max_wait=settings.max_wait_time,
-         )
-         data = BackPressureData(type="indexing", try_after=try_after)
-         logger.info(
-             "Indexing back pressure applied",
-             extra={
-                 "kbid": kbid,
-                 "resource_uuid": resource_uuid,
-                 "try_after": try_after,
-                 "pending": pending,
-             },
-         )
-         raise BackPressureException(data)
-
-
- def check_ingest_behind(ingest_pending: int):
-     max_pending = settings.max_ingest_pending
-     if max_pending <= 0:
-         # Ingest back pressure is disabled
-         return
-
-     if ingest_pending > max_pending:
-         try_after = estimate_try_after(
-             rate=settings.ingest_rate,
-             pending=ingest_pending,
-             max_wait=settings.max_wait_time,
-         )
-         data = BackPressureData(type="ingest", try_after=try_after)
-         logger.info(
-             "Ingest back pressure applied",
-             extra={"try_after": try_after, "pending": ingest_pending},
-         )
-         raise BackPressureException(data)
-
-
- def estimate_try_after(rate: float, pending: int, max_wait: int) -> datetime:
-     """
-     This function estimates the time to try again based on the rate and the number of pending messages.
-     """
-     delta_seconds = min(pending / rate, max_wait)
-     return datetime.utcnow() + timedelta(seconds=delta_seconds)
-
-
- async def get_nats_consumer_pending_messages(
-     nats_manager: NatsConnectionManager, *, stream: str, consumer: str
- ) -> int:
-     # get raw js client
-     js = nats_manager.js
-     consumer_info = await js.consumer_info(stream, consumer)
-     return consumer_info.num_pending
-
-
- async def get_kb_active_shard(context: ApplicationContext, kbid: str) -> Optional[ShardObject]:
-     async with context.kv_driver.transaction(read_only=True) as txn:
-         return await context.shard_manager.get_current_active_shard(txn, kbid)
-
-
- async def get_resource_shard(
-     context: ApplicationContext, kbid: str, resource_uuid: str
- ) -> Optional[ShardObject]:
-     async with datamanagers.with_ro_transaction() as txn:
-         shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=resource_uuid)
-         if shard_id is None:
-             # Resource does not exist
-             logger.debug(
-                 "Resource shard not found",
-                 extra={"kbid": kbid, "resource_uuid": resource_uuid},
-             )
-             return None
-
-         all_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
-         if all_shards is None:
-             # KB doesn't exist or has been deleted
-             logger.debug("No shards found for KB", extra={"kbid": kbid})
-             return None
-
-         for shard in all_shards.shards:
-             if shard.shard == shard_id:
-                 return shard
-         else:
-             logger.error(
-                 "Resource shard not found",
-                 extra={"kbid": kbid, "resource_uuid": resource_uuid, "shard_id": shard_id},
-             )
-             return None
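
For orientation, the removed module's public entry point is maybe_back_pressure: writer endpoints await it before accepting work, and it raises HTTP 429 with a try_after hint (computed as min(pending / rate, max_wait) seconds in the future) and a back_pressure_type in the response detail. Below is a minimal sketch of that call pattern, assuming a FastAPI writer endpoint with the materializer and application context already started; the module path nucliadb.writer.back_pressure and the route are illustrative and not shown in this diff.

from fastapi import FastAPI, Request

# Assumed import path for the removed module; the diff does not show its file name.
from nucliadb.writer.back_pressure import maybe_back_pressure

app = FastAPI()


@app.post("/v1/kb/{kbid}/resources")  # illustrative route
async def create_resource(request: Request, kbid: str) -> dict:
    # Raises HTTPException(status_code=429) with "try_after" and
    # "back_pressure_type" in the detail when processing, ingest or
    # indexing lag exceeds the configured thresholds; otherwise it is a no-op.
    await maybe_back_pressure(request, kbid, resource_uuid=None)
    # ... proceed with the actual write ...
    return {"status": "accepted"}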