nucliadb 6.3.5.post4032__py3-none-any.whl → 6.3.5.post4033__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nucliadb/common/cache.py CHANGED
@@ -19,6 +19,7 @@
19
19
  #
20
20
 
21
21
  import asyncio
22
+ import contextlib
22
23
  from abc import ABC, abstractmethod
23
24
  from contextvars import ContextVar
24
25
  from dataclasses import dataclass
@@ -80,20 +81,14 @@ class Cache(Generic[T], ABC):
80
81
  if len_after - len_before > 0:
81
82
  self.metrics._cache_size.inc(len_after - len_before)
82
83
 
83
- def contains(self, key: str) -> bool:
84
- return key in self.cache
84
+ def __contains__(self, key: str) -> bool:
85
+ return self.cache.__contains__(key)
85
86
 
86
87
  def clear(self):
87
88
  self.metrics._cache_size.dec(len(self.cache))
88
89
  self.cache.clear()
89
90
  self.locks.clear()
90
91
 
91
- def __del__(self):
92
- # we want to clear the cache before deleting the object and set the
93
- # metric appropriately
94
- # XXX: apparently, this doesn't work properly. Don't rely on it
95
- self.clear()
96
-
97
92
  @abstractmethod
98
93
  @cached_property
99
94
  def metrics(self) -> CacheMetrics: ...
@@ -105,11 +100,6 @@ class ResourceCache(Cache[ResourceORM]):
105
100
  ops=resource_cache_ops,
106
101
  )
107
102
 
108
- # This cache size is an arbitrary number, once we have a metric in place and
109
- # we analyze memory consumption, we can adjust it with more knowledge
110
- def __init__(self, cache_size: int = 128) -> None:
111
- super().__init__(cache_size)
112
-
113
103
 
114
104
  class ExtractedTextCache(Cache[ExtractedText]):
115
105
  """
@@ -126,9 +116,6 @@ class ExtractedTextCache(Cache[ExtractedText]):
126
116
  ops=extracted_text_cache_ops,
127
117
  )
128
118
 
129
- def __init__(self, cache_size: int = 128):
130
- super().__init__(cache_size)
131
-
132
119
 
133
120
  # Global caches (per asyncio task)
134
121
 
@@ -139,46 +126,44 @@ etcache: ContextVar[Optional[ExtractedTextCache]] = ContextVar("etcache", defaul
139
126
  # Cache management
140
127
 
141
128
 
142
- # Get or create a resource cache specific to the current asyncio task (and all
143
- # its subtasks). If you spawn subtasks that use this cache, make sure to create
144
- # it in the parent task, otherwise each subtask will have its own independent
145
- # cache instance
146
- def get_or_create_resource_cache(clear: bool = False) -> ResourceCache:
147
- cache: Optional[ResourceCache] = rcache.get()
148
- if cache is None or clear:
149
- cache = ResourceCache()
150
- rcache.set(cache)
151
- return cache
152
-
153
-
154
129
  def get_resource_cache() -> Optional[ResourceCache]:
155
130
  return rcache.get()
156
131
 
157
132
 
158
- def set_resource_cache() -> None:
159
- cache = ResourceCache()
160
- rcache.set(cache)
133
+ def get_extracted_text_cache() -> Optional[ExtractedTextCache]:
134
+ return etcache.get()
135
+
161
136
 
137
+ @contextlib.contextmanager
138
+ def _use_cache(klass: type[Cache], context_var: ContextVar, /, **kwargs):
139
+ """Context manager that manages a context var cache. It's responsible for
140
+ cache creation and cleanup.
162
141
 
163
- # Delete resource cache and all its content
164
- def delete_resource_cache() -> None:
165
- cache = rcache.get()
166
- if cache is not None:
167
- rcache.set(None)
168
- cache.clear()
142
+ Note the configured cache is specific to the current asyncio task (and all
143
+ its subtasks). If you spawn subtasks that should share a cache, make sure
144
+ the parent task is the one using this context manager, otherwise, each subtask
145
+ will use its own independent cache instance
169
146
 
147
+ Do not use the cache object outside the scope of this context manager!
148
+ Otherwise, metrics and cleanup could go wrong.
170
149
 
171
- def get_extracted_text_cache() -> Optional[ExtractedTextCache]:
172
- return etcache.get()
150
+ """
151
+ cache = klass(**kwargs)
152
+ token = context_var.set(cache)
153
+ try:
154
+ yield cache
155
+ finally:
156
+ context_var.reset(token)
157
+ cache.clear()
173
158
 
174
159
 
175
- def set_extracted_text_cache() -> None:
176
- value = ExtractedTextCache()
177
- etcache.set(value)
160
+ @contextlib.contextmanager
161
+ def resource_cache(size: int):
162
+ with _use_cache(ResourceCache, rcache, cache_size=size) as cache:
163
+ yield cache
178
164
 
179
165
 
180
- def delete_extracted_text_cache() -> None:
181
- cache = etcache.get()
182
- if cache is not None:
183
- etcache.set(None)
184
- cache.clear()
166
+ @contextlib.contextmanager
167
+ def extracted_text_cache(size: int):
168
+ with _use_cache(ExtractedTextCache, etcache, cache_size=size) as cache:
169
+ yield cache
@@ -24,12 +24,10 @@ from typing import Optional
24
24
  import backoff
25
25
 
26
26
  from nucliadb.common.cache import (
27
- delete_extracted_text_cache,
28
- delete_resource_cache,
27
+ extracted_text_cache,
29
28
  get_extracted_text_cache,
30
29
  get_resource_cache,
31
- set_extracted_text_cache,
32
- set_resource_cache,
30
+ resource_cache,
33
31
  )
34
32
  from nucliadb.common.ids import FieldId
35
33
  from nucliadb.common.maindb.utils import get_driver
@@ -55,7 +53,7 @@ async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
55
53
  return await _orm_get_resource(kbid, uuid)
56
54
 
57
55
  async with resource_cache.get_lock(uuid):
58
- if not resource_cache.contains(uuid):
56
+ if uuid not in resource_cache:
59
57
  resource_cache.metrics.ops.inc({"type": "miss"})
60
58
  orm_resource = await _orm_get_resource(kbid, uuid)
61
59
  else:
@@ -146,10 +144,9 @@ def request_caches():
146
144
  ... resource = await get_resource(kbid, uuid)
147
145
  ... extracted_text = await get_extracted_text_from_field_id(kbid, rid, field_id)
148
146
  """
149
- set_resource_cache()
150
- set_extracted_text_cache()
151
- try:
147
+
148
+ # This cache size is an arbitrary number, once we have a metric in place and
149
+ # we analyze memory consumption, we can adjust it with more knowledge
150
+ cache_size = 50
151
+ with resource_cache(cache_size), extracted_text_cache(cache_size):
152
152
  yield
153
- finally:
154
- delete_resource_cache()
155
- delete_extracted_text_cache()
@@ -22,6 +22,7 @@ from typing import AsyncIterator, Optional
22
22
 
23
23
  from fastapi import HTTPException
24
24
 
25
+ from nucliadb.common.cache import resource_cache
25
26
  from nucliadb.train.generators.field_classifier import (
26
27
  field_classification_batch_generator,
27
28
  )
@@ -85,7 +86,10 @@ async def generate_train_data(kbid: str, shard: str, trainset: TrainSet):
85
86
  detail=f"Invalid train type '{TaskType.Name(trainset.type)}'",
86
87
  )
87
88
 
88
- async for item in batch_generator:
89
- payload = item.SerializeToString()
90
- yield len(payload).to_bytes(4, byteorder="big", signed=False)
91
- yield payload
89
+ # This cache size is an arbitrary number, once we have a metric in place and
90
+ # we analyze memory consumption, we can adjust it with more knowledge
91
+ with resource_cache(size=20):
92
+ async for item in batch_generator:
93
+ payload = item.SerializeToString()
94
+ yield len(payload).to_bytes(4, byteorder="big", signed=False)
95
+ yield payload
@@ -20,7 +20,7 @@
20
20
 
21
21
  from typing import Any, AsyncGenerator, AsyncIterator, Optional, Type
22
22
 
23
- from nucliadb.common.cache import get_or_create_resource_cache
23
+ from nucliadb.common.cache import get_resource_cache
24
24
  from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB
25
25
  from nucliadb.common.maindb.utils import get_driver
26
26
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
@@ -31,22 +31,30 @@ from nucliadb_utils.utilities import get_storage
31
31
 
32
32
 
33
33
  async def get_resource_from_cache_or_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
34
- resource_cache = get_or_create_resource_cache()
34
+ resource_cache = get_resource_cache()
35
35
  orm_resource: Optional[ResourceORM] = None
36
- if not resource_cache.contains(uuid):
36
+ if resource_cache is None:
37
+ return await _get_resource_from_db(kbid, uuid)
38
+ logger.warning("Resource cache is not set")
39
+
40
+ if uuid not in resource_cache:
37
41
  resource_cache.metrics.ops.inc({"type": "miss"})
38
- storage = await get_storage(service_name=SERVICE_NAME)
39
- async with get_driver().transaction(read_only=True) as transaction:
40
- kb = KnowledgeBoxORM(transaction, storage, kbid)
41
- orm_resource = await kb.get(uuid)
42
- if orm_resource is not None:
43
- resource_cache.set(uuid, orm_resource)
42
+ orm_resource = await _get_resource_from_db(kbid, uuid)
43
+ if orm_resource is not None:
44
+ resource_cache.set(uuid, orm_resource)
44
45
  else:
45
46
  resource_cache.metrics.ops.inc({"type": "hit"})
46
47
  orm_resource = resource_cache.get(uuid)
47
48
  return orm_resource
48
49
 
49
50
 
51
+ async def _get_resource_from_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
52
+ storage = await get_storage(service_name=SERVICE_NAME)
53
+ async with get_driver().transaction(read_only=True) as transaction:
54
+ kb = KnowledgeBoxORM(transaction, storage, kbid)
55
+ return await kb.get(uuid)
56
+
57
+
50
58
  async def get_paragraph(kbid: str, paragraph_id: str) -> str:
51
59
  if paragraph_id.count("/") == 5:
52
60
  rid, field_type, field, split_str, start_end = paragraph_id.split("/")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.3.5.post4032
3
+ Version: 6.3.5.post4033
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.3.5.post4032
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.post4032
25
- Requires-Dist: nucliadb-protos>=6.3.5.post4032
26
- Requires-Dist: nucliadb-models>=6.3.5.post4032
27
- Requires-Dist: nidx-protos>=6.3.5.post4032
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.3.5.post4033
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.post4033
25
+ Requires-Dist: nucliadb-protos>=6.3.5.post4033
26
+ Requires-Dist: nucliadb-models>=6.3.5.post4033
27
+ Requires-Dist: nidx-protos>=6.3.5.post4033
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn[standard]
@@ -53,7 +53,7 @@ nucliadb/backups/settings.py,sha256=SyzsInj1BRbBI0atg5IXWbMbOZ_eVg4eSQ3IcnUhCxQ,
53
53
  nucliadb/backups/tasks.py,sha256=WkL1LgdYBHbV_A5ilyYv5p3zmXwxH68TDudytN5f7zk,4225
54
54
  nucliadb/backups/utils.py,sha256=_Vogjqcru5oqNZM-bZ0q7Ju79Bv1PD-LVFEa7Z-Q13I,1261
55
55
  nucliadb/common/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
56
- nucliadb/common/cache.py,sha256=bwkKJ3tgsDE794SIklkOrAwY8vPSvyambC0y8ziMthg,5669
56
+ nucliadb/common/cache.py,sha256=r3zFCivPq9BWY1S4UCraY_DnSiXh7Bd0C7wmwGDX0D8,5255
57
57
  nucliadb/common/constants.py,sha256=QpigxJh_CtD85Evy0PtV5cVq6x0U_f9xfIcXz1ymkUg,869
58
58
  nucliadb/common/counters.py,sha256=8lOi3A2HeLDDlcNaS2QT1SfD3350VPBjiY3FkmHH1V8,977
59
59
  nucliadb/common/ids.py,sha256=4QjoIofes_vtKj2HsFWZf8VVIVWXxdkYtLpx1n618Us,8239
@@ -228,7 +228,7 @@ nucliadb/search/api/v1/resource/utils.py,sha256=-NjZqAQtFEXKpIh8ui5S26ItnJ5rzmmG
228
228
  nucliadb/search/requesters/__init__.py,sha256=itSI7dtTwFP55YMX4iK7JzdMHS5CQVUiB1XzQu4UBh8,833
229
229
  nucliadb/search/requesters/utils.py,sha256=cQZ4-NftiMljoWQ7-Zl7nWfr6u_FY8u_wc9kTvKQcAg,6999
230
230
  nucliadb/search/search/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
231
- nucliadb/search/search/cache.py,sha256=F7rT0X-bp02Qh-heNo1tRhXG3iqTkL41F1wA_HIZdoM,5404
231
+ nucliadb/search/search/cache.py,sha256=WUQe5tbAbJSjbcqqMWve9w_rziZBOCi_HCqaw31ghJU,5428
232
232
  nucliadb/search/search/cut.py,sha256=ytY0_GY7ocNjfxTb4aosxEp4ZfhQNDP--JkhEMGD298,1153
233
233
  nucliadb/search/search/exceptions.py,sha256=klGLgAGGrXcSGix_W6418ZBMqDchAIGjN77ofkOScEI,1039
234
234
  nucliadb/search/search/fetch.py,sha256=XJHIFnZmXM_8Kb37lb4lg1GYG7cZ1plT-qAIb_QziX4,6184
@@ -293,7 +293,7 @@ nucliadb/tests/config.py,sha256=JN_Jhgj-fwM9_8IeO9pwxr6C1PiwRDrXxm67Y38rU30,2080
293
293
  nucliadb/tests/vectors.py,sha256=CcNKx-E8LPpyvRyljbmb-Tn_wST9Juw2CBoogWrKiTk,62843
294
294
  nucliadb/train/__init__.py,sha256=NVwe5yULoHXb80itIJT8YJYEz2xbiOPQ7_OMys6XJw8,1301
295
295
  nucliadb/train/app.py,sha256=TiRttTvekLuZdIvi46E4HyuumDTkR4G4Luqq3fEdjes,2824
296
- nucliadb/train/generator.py,sha256=0_zqWsLUHmJZl0lXhGorO5CWSkl42-k78dqb1slZ5h0,3904
296
+ nucliadb/train/generator.py,sha256=w2Jv605Zv4o1F9SQc1NlpuVHiJ3vyaCufv2DKTDlXb4,4158
297
297
  nucliadb/train/lifecycle.py,sha256=a96KuAVZ0sf9TVVW6v6szVXn2eGBaboszwnv_HdWiZk,1820
298
298
  nucliadb/train/models.py,sha256=BmgmMjDsu_1Ih5JDAqo6whhume90q0ASJcDP9dkMQm8,1198
299
299
  nucliadb/train/nodes.py,sha256=HROQMRw2g5sJTnuBagh3B0id3iWonRJ68tg3skOme9k,5748
@@ -321,7 +321,7 @@ nucliadb/train/generators/paragraph_streaming.py,sha256=dsM7a5hBd2iokvFuxnZhQeko
321
321
  nucliadb/train/generators/question_answer_streaming.py,sha256=P7-de4W4yW2mgEQ82fF2OZVyx6QJHXezY52qDciDcmw,5680
322
322
  nucliadb/train/generators/sentence_classifier.py,sha256=DuvXfnWvLhklYR_qFGk2LqUyl2JE7CMVFwuHaPyC9Ys,5121
323
323
  nucliadb/train/generators/token_classifier.py,sha256=0848GqoXh8ywU82cPUrkzOM53-lZ1MVCw--8yDABigY,9557
324
- nucliadb/train/generators/utils.py,sha256=bArGm3MyFvHAIImhnQ3GZ6u2mZAhmRd3t3AbfDK-Aeg,3756
324
+ nucliadb/train/generators/utils.py,sha256=0jbCfD50RORC2T-pmvRHX4iXbioZQ7ZoRKwImpd7U_4,3981
325
325
  nucliadb/writer/__init__.py,sha256=S298mrZL3vr62OrBqi97mdLxgR5cReMlRJgnaQHZV7s,1304
326
326
  nucliadb/writer/app.py,sha256=ABBO8-u4pDAa61b3mCdD0TFhuHAYcxMkgpZSGgWARuE,2736
327
327
  nucliadb/writer/back_pressure.py,sha256=JaiC2JAugVA92gDHzABZFiuQexiOKZC9C-3Jn9VF-M0,17898
@@ -361,8 +361,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
361
361
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
362
362
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
363
363
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
364
- nucliadb-6.3.5.post4032.dist-info/METADATA,sha256=hbwiwOO_3khU7Rm8IyIFwpmCR3KwiILVV-agkTKmNgA,4301
365
- nucliadb-6.3.5.post4032.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
366
- nucliadb-6.3.5.post4032.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
367
- nucliadb-6.3.5.post4032.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
368
- nucliadb-6.3.5.post4032.dist-info/RECORD,,
364
+ nucliadb-6.3.5.post4033.dist-info/METADATA,sha256=3k6HCLT1QjmqSlXDmUdknc4RqNGQQQ6Up3klneIqzwU,4301
365
+ nucliadb-6.3.5.post4033.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
366
+ nucliadb-6.3.5.post4033.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
367
+ nucliadb-6.3.5.post4033.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
368
+ nucliadb-6.3.5.post4033.dist-info/RECORD,,