nucliadb 6.4.0.post4192__py3-none-any.whl → 6.4.0.post4196__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nucliadb/common/cache.py CHANGED
@@ -18,19 +18,28 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- import asyncio
22
21
  import contextlib
22
+ import logging
23
23
  from abc import ABC, abstractmethod
24
24
  from contextvars import ContextVar
25
25
  from dataclasses import dataclass
26
26
  from functools import cached_property
27
27
  from typing import Generic, Optional, TypeVar
28
28
 
29
- from lru import LRU
29
+ import backoff
30
+ from async_lru import _LRUCacheWrapper, alru_cache
31
+ from typing_extensions import ParamSpec
30
32
 
33
+ from nucliadb.common.ids import FieldId
34
+ from nucliadb.common.maindb.utils import get_driver
35
+ from nucliadb.ingest.fields.base import FieldTypes
36
+ from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
31
37
  from nucliadb.ingest.orm.resource import Resource as ResourceORM
32
38
  from nucliadb_protos.utils_pb2 import ExtractedText
33
39
  from nucliadb_telemetry.metrics import Counter, Gauge
40
+ from nucliadb_utils.utilities import get_storage
41
+
42
+ logger = logging.getLogger(__name__)
34
43
 
35
44
  # specific metrics per cache type
36
45
  cached_resources = Gauge("nucliadb_cached_resources")
@@ -39,6 +48,7 @@ resource_cache_ops = Counter("nucliadb_resource_cache_ops", labels={"type": ""})
39
48
  extracted_text_cache_ops = Counter("nucliadb_extracted_text_cache_ops", labels={"type": ""})
40
49
 
41
50
 
51
+ K = ParamSpec("K")
42
52
  T = TypeVar("T")
43
53
 
44
54
 
@@ -48,7 +58,7 @@ class CacheMetrics:
48
58
  ops: Counter
49
59
 
50
60
 
51
- class Cache(Generic[T], ABC):
61
+ class Cache(Generic[K, T], ABC):
52
62
  """Low-level bounded cache implementation with access to per-key async locks
53
63
  in case cache users want to lock concurrent access.
54
64
 
@@ -56,52 +66,43 @@ class Cache(Generic[T], ABC):
56
66
 
57
67
  """
58
68
 
59
- def __init__(self, cache_size: int) -> None:
60
- self.cache: LRU[str, T] = LRU(cache_size, callback=self._evicted_callback)
61
- self.locks: dict[str, asyncio.Lock] = {}
62
-
63
- def _evicted_callback(self, key: str, value: T):
64
- self.locks.pop(key, None)
65
- self.metrics.ops.inc({"type": "evict"})
66
-
67
- def get(self, key: str) -> Optional[T]:
68
- return self.cache.get(key)
69
-
70
- # Get a lock for a specific key. Locks will be evicted at the same time as
71
- # key-value pairs
72
- def get_lock(self, key: str) -> asyncio.Lock:
73
- return self.locks.setdefault(key, asyncio.Lock())
74
-
75
- def set(self, key: str, value: T):
76
- len_before = len(self.cache)
77
-
78
- self.cache[key] = value
69
+ cache: _LRUCacheWrapper[Optional[T]]
79
70
 
80
- len_after = len(self.cache)
81
- if len_after - len_before > 0:
82
- self.metrics._cache_size.inc(len_after - len_before)
71
+ async def get(self, *args: K.args, **kwargs: K.kwargs) -> Optional[T]:
72
+ result = await self.cache(*args)
73
+ # Do not cache None
74
+ if result is None:
75
+ self.cache.cache_invalidate(*args)
76
+ return result
83
77
 
84
- def __contains__(self, key: str) -> bool:
85
- return self.cache.__contains__(key)
86
-
87
- def clear(self):
88
- self.metrics._cache_size.dec(len(self.cache))
89
- self.cache.clear()
90
- self.locks.clear()
78
+ def finalize(self):
79
+ info = self.cache.cache_info()
80
+ self.metrics.ops.inc({"type": "miss"}, value=info.misses)
81
+ self.metrics.ops.inc({"type": "hit"}, value=info.hits)
91
82
 
92
83
  @abstractmethod
93
84
  @cached_property
94
85
  def metrics(self) -> CacheMetrics: ...
95
86
 
96
87
 
97
- class ResourceCache(Cache[ResourceORM]):
88
+ class ResourceCache(Cache[[str, str], ResourceORM]):
89
+ def __init__(self, cache_size: int) -> None:
90
+ @alru_cache(maxsize=cache_size)
91
+ async def _get_resource(kbid: str, rid: str) -> Optional[ResourceORM]:
92
+ storage = await get_storage()
93
+ async with get_driver().transaction(read_only=True) as txn:
94
+ kb = KnowledgeBoxORM(txn, storage, kbid)
95
+ return await kb.get(rid)
96
+
97
+ self.cache = _get_resource
98
+
98
99
  metrics = CacheMetrics(
99
100
  _cache_size=cached_resources,
100
101
  ops=resource_cache_ops,
101
102
  )
102
103
 
103
104
 
104
- class ExtractedTextCache(Cache[ExtractedText]):
105
+ class ExtractedTextCache(Cache[[str, FieldId], ExtractedText]):
105
106
  """
106
107
  Used to cache extracted text from a resource in memory during the process
107
108
  of search results hydration.
@@ -111,6 +112,30 @@ class ExtractedTextCache(Cache[ExtractedText]):
111
112
  fetched for each field where the text block is found.
112
113
  """
113
114
 
115
+ def __init__(self, cache_size: int) -> None:
116
+ @alru_cache(maxsize=cache_size)
117
+ @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
118
+ async def _get_extracted_text(kbid: str, field_id: FieldId) -> Optional[ExtractedText]:
119
+ storage = await get_storage()
120
+ try:
121
+ sf = storage.file_extracted(
122
+ kbid, field_id.rid, field_id.type, field_id.key, FieldTypes.FIELD_TEXT.value
123
+ )
124
+ return await storage.download_pb(sf, ExtractedText)
125
+ except Exception:
126
+ logger.warning(
127
+ "Error getting extracted text for field. Retrying",
128
+ exc_info=True,
129
+ extra={
130
+ "kbid": kbid,
131
+ "resource_id": field_id.rid,
132
+ "field": f"{field_id.type}/{field_id.key}",
133
+ },
134
+ )
135
+ raise
136
+
137
+ self.cache = _get_extracted_text
138
+
114
139
  metrics = CacheMetrics(
115
140
  _cache_size=cached_extracted_texts,
116
141
  ops=extracted_text_cache_ops,
@@ -154,7 +179,7 @@ def _use_cache(klass: type[Cache], context_var: ContextVar, /, **kwargs):
154
179
  yield cache
155
180
  finally:
156
181
  context_var.reset(token)
157
- cache.clear()
182
+ cache.finalize()
158
183
 
159
184
 
160
185
  @contextlib.contextmanager
@@ -30,7 +30,6 @@ import backoff
30
30
  import httpx
31
31
  from fastapi import Request, Response
32
32
  from fastapi.responses import StreamingResponse
33
- from lru import LRU
34
33
  from pydantic import BaseModel, Field, model_validator
35
34
  from typing_extensions import Self
36
35
 
@@ -480,8 +479,7 @@ class ProxiedLearningConfig(LearningConfigService):
480
479
  yield client
481
480
 
482
481
 
483
- _IN_MEMORY_CONFIGS: dict[str, LearningConfiguration]
484
- _IN_MEMORY_CONFIGS = LRU(50) # type: ignore
482
+ _IN_MEMORY_CONFIGS: dict[str, LearningConfiguration] = {}
485
483
 
486
484
 
487
485
  class InMemoryLearningConfig(LearningConfigService):
@@ -45,26 +45,12 @@ async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
45
45
  """
46
46
  Will try to get the resource from the cache, if it's not there it will fetch it from the ORM and cache it.
47
47
  """
48
- orm_resource: Optional[ResourceORM] = None
49
-
50
48
  resource_cache = get_resource_cache()
51
49
  if resource_cache is None:
52
50
  logger.warning("Resource cache not set")
53
51
  return await _orm_get_resource(kbid, uuid)
54
52
 
55
- async with resource_cache.get_lock(uuid):
56
- if uuid not in resource_cache:
57
- resource_cache.metrics.ops.inc({"type": "miss"})
58
- orm_resource = await _orm_get_resource(kbid, uuid)
59
- else:
60
- resource_cache.metrics.ops.inc({"type": "hit"})
61
-
62
- if orm_resource is not None:
63
- resource_cache.set(uuid, orm_resource)
64
- else:
65
- orm_resource = resource_cache.get(uuid)
66
-
67
- return orm_resource
53
+ return await resource_cache.get(kbid, uuid)
68
54
 
69
55
 
70
56
  async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
@@ -75,30 +61,17 @@ async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
75
61
 
76
62
 
77
63
  async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
64
+ if field.extracted_text is not None:
65
+ return field.extracted_text
66
+
78
67
  cache = get_extracted_text_cache()
79
68
  if cache is None:
80
69
  logger.warning("Extracted text cache not set")
81
70
  return await field.get_extracted_text()
82
71
 
83
- key = f"{field.kbid}/{field.uuid}/{field.id}"
84
- extracted_text = cache.get(key)
85
- if extracted_text is not None:
86
- cache.metrics.ops.inc({"type": "hit"})
87
- return extracted_text
88
-
89
- async with cache.get_lock(key):
90
- # Check again in case another task already fetched it
91
- extracted_text = cache.get(key)
92
- if extracted_text is not None:
93
- cache.metrics.ops.inc({"type": "hit"})
94
- return extracted_text
95
-
96
- cache.metrics.ops.inc({"type": "miss"})
97
- extracted_text = await field_get_extracted_text(field)
98
- if extracted_text is not None:
99
- # Only cache if we actually have extracted text
100
- cache.set(key, extracted_text)
101
- return extracted_text
72
+ extracted_text = await cache.get(field.kbid, FieldId(field.uuid, field.type, field.id))
73
+ field.extracted_text = extracted_text
74
+ return extracted_text
102
75
 
103
76
 
104
77
  @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
@@ -32,20 +32,11 @@ from nucliadb_utils.utilities import get_storage
32
32
 
33
33
  async def get_resource_from_cache_or_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
34
34
  resource_cache = get_resource_cache()
35
- orm_resource: Optional[ResourceORM] = None
36
35
  if resource_cache is None:
37
36
  return await _get_resource_from_db(kbid, uuid)
38
37
  logger.warning("Resource cache is not set")
39
38
 
40
- if uuid not in resource_cache:
41
- resource_cache.metrics.ops.inc({"type": "miss"})
42
- orm_resource = await _get_resource_from_db(kbid, uuid)
43
- if orm_resource is not None:
44
- resource_cache.set(uuid, orm_resource)
45
- else:
46
- resource_cache.metrics.ops.inc({"type": "hit"})
47
- orm_resource = resource_cache.get(uuid)
48
- return orm_resource
39
+ return await resource_cache.get(kbid, uuid)
49
40
 
50
41
 
51
42
  async def _get_resource_from_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.4.0.post4192
3
+ Version: 6.4.0.post4196
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,17 +20,17 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4192
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4192
25
- Requires-Dist: nucliadb-protos>=6.4.0.post4192
26
- Requires-Dist: nucliadb-models>=6.4.0.post4192
27
- Requires-Dist: nidx-protos>=6.4.0.post4192
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4196
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4196
25
+ Requires-Dist: nucliadb-protos>=6.4.0.post4196
26
+ Requires-Dist: nucliadb-models>=6.4.0.post4196
27
+ Requires-Dist: nidx-protos>=6.4.0.post4196
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn[standard]
31
31
  Requires-Dist: argdantic
32
32
  Requires-Dist: aiohttp>=3.11.11
33
- Requires-Dist: lru-dict>=1.1.7
33
+ Requires-Dist: async-lru>=2
34
34
  Requires-Dist: backoff
35
35
  Requires-Dist: aiofiles>=0.8.0
36
36
  Requires-Dist: psutil>=5.9.7
@@ -39,7 +39,7 @@ migrations/pg/0004_catalog_facets.py,sha256=FJFASHjfEHG3sNve9BP2HnnLO4xr7dnR6Qpc
39
39
  migrations/pg/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
40
40
  nucliadb/__init__.py,sha256=_abCmDJ_0ku483Os4UAjPX7Nywm39cQgAV_DiyjsKeQ,891
41
41
  nucliadb/health.py,sha256=UIxxA4oms4HIsCRZM_SZsdkIZIlgzmOxw-qSHLlWuak,3465
42
- nucliadb/learning_proxy.py,sha256=Gf76qXxjl1lrHEFaCpOUfjjf0ab6eGLNxLMJz3-M_mo,19354
42
+ nucliadb/learning_proxy.py,sha256=ZFCb4sTWGmNE1SNNAF2eJEcFksn1VMW7AD8-vu7o_U4,19294
43
43
  nucliadb/metrics_exporter.py,sha256=N9ncgPdmkP5_hqlxcBrH7ZFrgM7BkpaZrndmUpdfDKc,4981
44
44
  nucliadb/openapi.py,sha256=wDiw0dVEvTpJvbatkJ0JZLkKm9RItZT5PWRHjqRfqTA,2272
45
45
  nucliadb/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -53,7 +53,7 @@ nucliadb/backups/settings.py,sha256=SyzsInj1BRbBI0atg5IXWbMbOZ_eVg4eSQ3IcnUhCxQ,
53
53
  nucliadb/backups/tasks.py,sha256=WkL1LgdYBHbV_A5ilyYv5p3zmXwxH68TDudytN5f7zk,4225
54
54
  nucliadb/backups/utils.py,sha256=_Vogjqcru5oqNZM-bZ0q7Ju79Bv1PD-LVFEa7Z-Q13I,1261
55
55
  nucliadb/common/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
56
- nucliadb/common/cache.py,sha256=r3zFCivPq9BWY1S4UCraY_DnSiXh7Bd0C7wmwGDX0D8,5255
56
+ nucliadb/common/cache.py,sha256=NM69CVvNjlh58jiVUF1JeYPmBO7_L4rB3tffxK0k_vI,6549
57
57
  nucliadb/common/constants.py,sha256=QpigxJh_CtD85Evy0PtV5cVq6x0U_f9xfIcXz1ymkUg,869
58
58
  nucliadb/common/counters.py,sha256=8lOi3A2HeLDDlcNaS2QT1SfD3350VPBjiY3FkmHH1V8,977
59
59
  nucliadb/common/ids.py,sha256=4QjoIofes_vtKj2HsFWZf8VVIVWXxdkYtLpx1n618Us,8239
@@ -228,7 +228,7 @@ nucliadb/search/api/v1/resource/utils.py,sha256=-NjZqAQtFEXKpIh8ui5S26ItnJ5rzmmG
228
228
  nucliadb/search/requesters/__init__.py,sha256=itSI7dtTwFP55YMX4iK7JzdMHS5CQVUiB1XzQu4UBh8,833
229
229
  nucliadb/search/requesters/utils.py,sha256=o5JtXX5KrqMtUJo3u6rw9EOOKXPiw-GaF0oGuZu7PPc,6225
230
230
  nucliadb/search/search/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
231
- nucliadb/search/search/cache.py,sha256=WUQe5tbAbJSjbcqqMWve9w_rziZBOCi_HCqaw31ghJU,5428
231
+ nucliadb/search/search/cache.py,sha256=-6l3i2Qi8ig2SM_FCgOLIaQ48XVj7L5ctd5PdQRY5mY,4458
232
232
  nucliadb/search/search/cut.py,sha256=ytY0_GY7ocNjfxTb4aosxEp4ZfhQNDP--JkhEMGD298,1153
233
233
  nucliadb/search/search/exceptions.py,sha256=klGLgAGGrXcSGix_W6418ZBMqDchAIGjN77ofkOScEI,1039
234
234
  nucliadb/search/search/fetch.py,sha256=eiljOKim-4OOEZn-3fyVZSYxztCH156BXYdqlIwVdN4,6181
@@ -325,7 +325,7 @@ nucliadb/train/generators/paragraph_streaming.py,sha256=1xsc_IqP-1M0TzYTqu5qCvWB
325
325
  nucliadb/train/generators/question_answer_streaming.py,sha256=yZZD0GpuHdV-BT4O8CV1sYVDk8ri8yGPVhfjSp3FTBQ,5626
326
326
  nucliadb/train/generators/sentence_classifier.py,sha256=bp-UeIbZ0vm4ujbTbZnqdI0JWijuMi53cmeLbP7dofI,5063
327
327
  nucliadb/train/generators/token_classifier.py,sha256=DdyMbrpxIVGWdTcz3SEN_3HwxKffUV3JGyTZzlCET8c,9503
328
- nucliadb/train/generators/utils.py,sha256=0jbCfD50RORC2T-pmvRHX4iXbioZQ7ZoRKwImpd7U_4,3981
328
+ nucliadb/train/generators/utils.py,sha256=ZNwvEVPZr-eP0MW3ABN7a11hPQKaa0NdVaRcgBcTp5w,3601
329
329
  nucliadb/writer/__init__.py,sha256=S298mrZL3vr62OrBqi97mdLxgR5cReMlRJgnaQHZV7s,1304
330
330
  nucliadb/writer/app.py,sha256=ABBO8-u4pDAa61b3mCdD0TFhuHAYcxMkgpZSGgWARuE,2736
331
331
  nucliadb/writer/back_pressure.py,sha256=4OwFGq9pvAbChB3WBZAY36lclfD-gD2ouC6YsKA4bIo,16892
@@ -365,8 +365,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
365
365
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
366
366
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
367
367
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
368
- nucliadb-6.4.0.post4192.dist-info/METADATA,sha256=U5VsHHLt_nINiBGZmirG1AEaniLWqSXM9mIM2I_Bz44,4226
369
- nucliadb-6.4.0.post4192.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
370
- nucliadb-6.4.0.post4192.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
371
- nucliadb-6.4.0.post4192.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
372
- nucliadb-6.4.0.post4192.dist-info/RECORD,,
368
+ nucliadb-6.4.0.post4196.dist-info/METADATA,sha256=jtJRm9idqV9MmQAmslJDXhOsEzTsiLBHKsdTvCJ6pqM,4223
369
+ nucliadb-6.4.0.post4196.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
370
+ nucliadb-6.4.0.post4196.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
371
+ nucliadb-6.4.0.post4196.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
372
+ nucliadb-6.4.0.post4196.dist-info/RECORD,,