nucliadb 6.3.5.post4028__py3-none-any.whl → 6.3.5.post4033__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/cache.py +32 -47
- nucliadb/search/search/cache.py +28 -12
- nucliadb/train/generator.py +8 -4
- nucliadb/train/generators/utils.py +17 -9
- {nucliadb-6.3.5.post4028.dist-info → nucliadb-6.3.5.post4033.dist-info}/METADATA +6 -6
- {nucliadb-6.3.5.post4028.dist-info → nucliadb-6.3.5.post4033.dist-info}/RECORD +9 -9
- {nucliadb-6.3.5.post4028.dist-info → nucliadb-6.3.5.post4033.dist-info}/WHEEL +0 -0
- {nucliadb-6.3.5.post4028.dist-info → nucliadb-6.3.5.post4033.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.5.post4028.dist-info → nucliadb-6.3.5.post4033.dist-info}/top_level.txt +0 -0
nucliadb/common/cache.py
CHANGED
@@ -19,6 +19,7 @@
|
|
19
19
|
#
|
20
20
|
|
21
21
|
import asyncio
|
22
|
+
import contextlib
|
22
23
|
from abc import ABC, abstractmethod
|
23
24
|
from contextvars import ContextVar
|
24
25
|
from dataclasses import dataclass
|
@@ -80,20 +81,14 @@ class Cache(Generic[T], ABC):
|
|
80
81
|
if len_after - len_before > 0:
|
81
82
|
self.metrics._cache_size.inc(len_after - len_before)
|
82
83
|
|
83
|
-
def
|
84
|
-
return
|
84
|
+
def __contains__(self, key: str) -> bool:
|
85
|
+
return self.cache.__contains__(key)
|
85
86
|
|
86
87
|
def clear(self):
|
87
88
|
self.metrics._cache_size.dec(len(self.cache))
|
88
89
|
self.cache.clear()
|
89
90
|
self.locks.clear()
|
90
91
|
|
91
|
-
def __del__(self):
|
92
|
-
# we want to clear the cache before deleting the object and set the
|
93
|
-
# metric appropriately
|
94
|
-
# XXX: apparently, this doesn't work properly. Don't rely on it
|
95
|
-
self.clear()
|
96
|
-
|
97
92
|
@abstractmethod
|
98
93
|
@cached_property
|
99
94
|
def metrics(self) -> CacheMetrics: ...
|
@@ -105,11 +100,6 @@ class ResourceCache(Cache[ResourceORM]):
|
|
105
100
|
ops=resource_cache_ops,
|
106
101
|
)
|
107
102
|
|
108
|
-
# This cache size is an arbitrary number, once we have a metric in place and
|
109
|
-
# we analyze memory consumption, we can adjust it with more knowledge
|
110
|
-
def __init__(self, cache_size: int = 128) -> None:
|
111
|
-
super().__init__(cache_size)
|
112
|
-
|
113
103
|
|
114
104
|
class ExtractedTextCache(Cache[ExtractedText]):
|
115
105
|
"""
|
@@ -126,9 +116,6 @@ class ExtractedTextCache(Cache[ExtractedText]):
|
|
126
116
|
ops=extracted_text_cache_ops,
|
127
117
|
)
|
128
118
|
|
129
|
-
def __init__(self, cache_size: int = 128):
|
130
|
-
super().__init__(cache_size)
|
131
|
-
|
132
119
|
|
133
120
|
# Global caches (per asyncio task)
|
134
121
|
|
@@ -139,46 +126,44 @@ etcache: ContextVar[Optional[ExtractedTextCache]] = ContextVar("etcache", defaul
|
|
139
126
|
# Cache management
|
140
127
|
|
141
128
|
|
142
|
-
# Get or create a resource cache specific to the current asyncio task (and all
|
143
|
-
# its subtasks). If you spawn subtasks that use this cache, make sure to create
|
144
|
-
# it in the parent task, otherwise each subtask will have its own independent
|
145
|
-
# cache instance
|
146
|
-
def get_or_create_resource_cache(clear: bool = False) -> ResourceCache:
|
147
|
-
cache: Optional[ResourceCache] = rcache.get()
|
148
|
-
if cache is None or clear:
|
149
|
-
cache = ResourceCache()
|
150
|
-
rcache.set(cache)
|
151
|
-
return cache
|
152
|
-
|
153
|
-
|
154
129
|
def get_resource_cache() -> Optional[ResourceCache]:
|
155
130
|
return rcache.get()
|
156
131
|
|
157
132
|
|
158
|
-
def
|
159
|
-
|
160
|
-
|
133
|
+
def get_extracted_text_cache() -> Optional[ExtractedTextCache]:
|
134
|
+
return etcache.get()
|
135
|
+
|
161
136
|
|
137
|
+
@contextlib.contextmanager
|
138
|
+
def _use_cache(klass: type[Cache], context_var: ContextVar, /, **kwargs):
|
139
|
+
"""Context manager that manages a context var cache. It's responsible of
|
140
|
+
cache creation and cleanup.
|
162
141
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
rcache.set(None)
|
168
|
-
cache.clear()
|
142
|
+
Note the configured cache is specific to the current asyncio task (and all
|
143
|
+
its subtasks). If you spawn subtasks that should share a cache, make sure
|
144
|
+
the parent task is the one using this context manager; otherwise, each subtask
|
145
|
+
will use its own independent cache instance
|
169
146
|
|
147
|
+
Do not use the cache object outside the scope of this context manager!
|
148
|
+
Otherwise, metrics and cleanup could go wrong.
|
170
149
|
|
171
|
-
|
172
|
-
|
150
|
+
"""
|
151
|
+
cache = klass(**kwargs)
|
152
|
+
token = context_var.set(cache)
|
153
|
+
try:
|
154
|
+
yield cache
|
155
|
+
finally:
|
156
|
+
context_var.reset(token)
|
157
|
+
cache.clear()
|
173
158
|
|
174
159
|
|
175
|
-
|
176
|
-
|
177
|
-
|
160
|
+
@contextlib.contextmanager
|
161
|
+
def resource_cache(size: int):
|
162
|
+
with _use_cache(ResourceCache, rcache, cache_size=size) as cache:
|
163
|
+
yield cache
|
178
164
|
|
179
165
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
cache.clear()
|
166
|
+
@contextlib.contextmanager
|
167
|
+
def extracted_text_cache(size: int):
|
168
|
+
with _use_cache(ExtractedTextCache, etcache, cache_size=size) as cache:
|
169
|
+
yield cache
|
nucliadb/search/search/cache.py
CHANGED
@@ -21,13 +21,13 @@ import contextlib
|
|
21
21
|
import logging
|
22
22
|
from typing import Optional
|
23
23
|
|
24
|
+
import backoff
|
25
|
+
|
24
26
|
from nucliadb.common.cache import (
|
25
|
-
|
26
|
-
delete_resource_cache,
|
27
|
+
extracted_text_cache,
|
27
28
|
get_extracted_text_cache,
|
28
29
|
get_resource_cache,
|
29
|
-
|
30
|
-
set_resource_cache,
|
30
|
+
resource_cache,
|
31
31
|
)
|
32
32
|
from nucliadb.common.ids import FieldId
|
33
33
|
from nucliadb.common.maindb.utils import get_driver
|
@@ -53,7 +53,7 @@ async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
|
|
53
53
|
return await _orm_get_resource(kbid, uuid)
|
54
54
|
|
55
55
|
async with resource_cache.get_lock(uuid):
|
56
|
-
if not resource_cache
|
56
|
+
if uuid not in resource_cache:
|
57
57
|
resource_cache.metrics.ops.inc({"type": "miss"})
|
58
58
|
orm_resource = await _orm_get_resource(kbid, uuid)
|
59
59
|
else:
|
@@ -94,13 +94,30 @@ async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
|
|
94
94
|
return extracted_text
|
95
95
|
|
96
96
|
cache.metrics.ops.inc({"type": "miss"})
|
97
|
-
extracted_text = await field
|
97
|
+
extracted_text = await field_get_extracted_text(field)
|
98
98
|
if extracted_text is not None:
|
99
99
|
# Only cache if we actually have extracted text
|
100
100
|
cache.set(key, extracted_text)
|
101
101
|
return extracted_text
|
102
102
|
|
103
103
|
|
104
|
+
@backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
|
105
|
+
async def field_get_extracted_text(field: Field) -> Optional[ExtractedText]:
|
106
|
+
try:
|
107
|
+
return await field.get_extracted_text()
|
108
|
+
except Exception:
|
109
|
+
logger.warning(
|
110
|
+
"Error getting extracted text for field. Retrying",
|
111
|
+
exc_info=True,
|
112
|
+
extra={
|
113
|
+
"kbid": field.kbid,
|
114
|
+
"resource_id": field.resource.uuid,
|
115
|
+
"field": f"{field.type}/{field.id}",
|
116
|
+
},
|
117
|
+
)
|
118
|
+
raise
|
119
|
+
|
120
|
+
|
104
121
|
async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
|
105
122
|
rid = field.rid
|
106
123
|
orm_resource = await get_resource(kbid, rid)
|
@@ -127,10 +144,9 @@ def request_caches():
|
|
127
144
|
... resource = await get_resource(kbid, uuid)
|
128
145
|
... extracted_text = await get_extracted_text_from_field_id(kbid, rid, field_id)
|
129
146
|
"""
|
130
|
-
|
131
|
-
|
132
|
-
|
147
|
+
|
148
|
+
# This cache size is an arbitrary number, once we have a metric in place and
|
149
|
+
# we analyze memory consumption, we can adjust it with more knowledge
|
150
|
+
cache_size = 50
|
151
|
+
with resource_cache(cache_size), extracted_text_cache(cache_size):
|
133
152
|
yield
|
134
|
-
finally:
|
135
|
-
delete_resource_cache()
|
136
|
-
delete_extracted_text_cache()
|
nucliadb/train/generator.py
CHANGED
@@ -22,6 +22,7 @@ from typing import AsyncIterator, Optional
|
|
22
22
|
|
23
23
|
from fastapi import HTTPException
|
24
24
|
|
25
|
+
from nucliadb.common.cache import resource_cache
|
25
26
|
from nucliadb.train.generators.field_classifier import (
|
26
27
|
field_classification_batch_generator,
|
27
28
|
)
|
@@ -85,7 +86,10 @@ async def generate_train_data(kbid: str, shard: str, trainset: TrainSet):
|
|
85
86
|
detail=f"Invalid train type '{TaskType.Name(trainset.type)}'",
|
86
87
|
)
|
87
88
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
89
|
+
# This cache size is an arbitrary number, once we have a metric in place and
|
90
|
+
# we analyze memory consumption, we can adjust it with more knowledge
|
91
|
+
with resource_cache(size=20):
|
92
|
+
async for item in batch_generator:
|
93
|
+
payload = item.SerializeToString()
|
94
|
+
yield len(payload).to_bytes(4, byteorder="big", signed=False)
|
95
|
+
yield payload
|
@@ -20,7 +20,7 @@
|
|
20
20
|
|
21
21
|
from typing import Any, AsyncGenerator, AsyncIterator, Optional, Type
|
22
22
|
|
23
|
-
from nucliadb.common.cache import
|
23
|
+
from nucliadb.common.cache import get_resource_cache
|
24
24
|
from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB
|
25
25
|
from nucliadb.common.maindb.utils import get_driver
|
26
26
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
|
@@ -31,22 +31,30 @@ from nucliadb_utils.utilities import get_storage
|
|
31
31
|
|
32
32
|
|
33
33
|
async def get_resource_from_cache_or_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
|
34
|
-
resource_cache =
|
34
|
+
resource_cache = get_resource_cache()
|
35
35
|
orm_resource: Optional[ResourceORM] = None
|
36
|
-
if
|
36
|
+
if resource_cache is None:
|
37
|
+
return await _get_resource_from_db(kbid, uuid)
|
38
|
+
logger.warning("Resource cache is not set")
|
39
|
+
|
40
|
+
if uuid not in resource_cache:
|
37
41
|
resource_cache.metrics.ops.inc({"type": "miss"})
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
orm_resource = await kb.get(uuid)
|
42
|
-
if orm_resource is not None:
|
43
|
-
resource_cache.set(uuid, orm_resource)
|
42
|
+
orm_resource = await _get_resource_from_db(kbid, uuid)
|
43
|
+
if orm_resource is not None:
|
44
|
+
resource_cache.set(uuid, orm_resource)
|
44
45
|
else:
|
45
46
|
resource_cache.metrics.ops.inc({"type": "hit"})
|
46
47
|
orm_resource = resource_cache.get(uuid)
|
47
48
|
return orm_resource
|
48
49
|
|
49
50
|
|
51
|
+
async def _get_resource_from_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
|
52
|
+
storage = await get_storage(service_name=SERVICE_NAME)
|
53
|
+
async with get_driver().transaction(read_only=True) as transaction:
|
54
|
+
kb = KnowledgeBoxORM(transaction, storage, kbid)
|
55
|
+
return await kb.get(uuid)
|
56
|
+
|
57
|
+
|
50
58
|
async def get_paragraph(kbid: str, paragraph_id: str) -> str:
|
51
59
|
if paragraph_id.count("/") == 5:
|
52
60
|
rid, field_type, field, split_str, start_end = paragraph_id.split("/")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.3.5.
|
3
|
+
Version: 6.3.5.post4033
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License: AGPL
|
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
21
21
|
Requires-Python: <4,>=3.9
|
22
22
|
Description-Content-Type: text/markdown
|
23
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.3.5.
|
24
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.
|
25
|
-
Requires-Dist: nucliadb-protos>=6.3.5.
|
26
|
-
Requires-Dist: nucliadb-models>=6.3.5.
|
27
|
-
Requires-Dist: nidx-protos>=6.3.5.
|
23
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.3.5.post4033
|
24
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.post4033
|
25
|
+
Requires-Dist: nucliadb-protos>=6.3.5.post4033
|
26
|
+
Requires-Dist: nucliadb-models>=6.3.5.post4033
|
27
|
+
Requires-Dist: nidx-protos>=6.3.5.post4033
|
28
28
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
29
29
|
Requires-Dist: nuclia-models>=0.24.2
|
30
30
|
Requires-Dist: uvicorn[standard]
|
@@ -53,7 +53,7 @@ nucliadb/backups/settings.py,sha256=SyzsInj1BRbBI0atg5IXWbMbOZ_eVg4eSQ3IcnUhCxQ,
|
|
53
53
|
nucliadb/backups/tasks.py,sha256=WkL1LgdYBHbV_A5ilyYv5p3zmXwxH68TDudytN5f7zk,4225
|
54
54
|
nucliadb/backups/utils.py,sha256=_Vogjqcru5oqNZM-bZ0q7Ju79Bv1PD-LVFEa7Z-Q13I,1261
|
55
55
|
nucliadb/common/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
56
|
-
nucliadb/common/cache.py,sha256=
|
56
|
+
nucliadb/common/cache.py,sha256=r3zFCivPq9BWY1S4UCraY_DnSiXh7Bd0C7wmwGDX0D8,5255
|
57
57
|
nucliadb/common/constants.py,sha256=QpigxJh_CtD85Evy0PtV5cVq6x0U_f9xfIcXz1ymkUg,869
|
58
58
|
nucliadb/common/counters.py,sha256=8lOi3A2HeLDDlcNaS2QT1SfD3350VPBjiY3FkmHH1V8,977
|
59
59
|
nucliadb/common/ids.py,sha256=4QjoIofes_vtKj2HsFWZf8VVIVWXxdkYtLpx1n618Us,8239
|
@@ -228,7 +228,7 @@ nucliadb/search/api/v1/resource/utils.py,sha256=-NjZqAQtFEXKpIh8ui5S26ItnJ5rzmmG
|
|
228
228
|
nucliadb/search/requesters/__init__.py,sha256=itSI7dtTwFP55YMX4iK7JzdMHS5CQVUiB1XzQu4UBh8,833
|
229
229
|
nucliadb/search/requesters/utils.py,sha256=cQZ4-NftiMljoWQ7-Zl7nWfr6u_FY8u_wc9kTvKQcAg,6999
|
230
230
|
nucliadb/search/search/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
231
|
-
nucliadb/search/search/cache.py,sha256=
|
231
|
+
nucliadb/search/search/cache.py,sha256=WUQe5tbAbJSjbcqqMWve9w_rziZBOCi_HCqaw31ghJU,5428
|
232
232
|
nucliadb/search/search/cut.py,sha256=ytY0_GY7ocNjfxTb4aosxEp4ZfhQNDP--JkhEMGD298,1153
|
233
233
|
nucliadb/search/search/exceptions.py,sha256=klGLgAGGrXcSGix_W6418ZBMqDchAIGjN77ofkOScEI,1039
|
234
234
|
nucliadb/search/search/fetch.py,sha256=XJHIFnZmXM_8Kb37lb4lg1GYG7cZ1plT-qAIb_QziX4,6184
|
@@ -293,7 +293,7 @@ nucliadb/tests/config.py,sha256=JN_Jhgj-fwM9_8IeO9pwxr6C1PiwRDrXxm67Y38rU30,2080
|
|
293
293
|
nucliadb/tests/vectors.py,sha256=CcNKx-E8LPpyvRyljbmb-Tn_wST9Juw2CBoogWrKiTk,62843
|
294
294
|
nucliadb/train/__init__.py,sha256=NVwe5yULoHXb80itIJT8YJYEz2xbiOPQ7_OMys6XJw8,1301
|
295
295
|
nucliadb/train/app.py,sha256=TiRttTvekLuZdIvi46E4HyuumDTkR4G4Luqq3fEdjes,2824
|
296
|
-
nucliadb/train/generator.py,sha256=
|
296
|
+
nucliadb/train/generator.py,sha256=w2Jv605Zv4o1F9SQc1NlpuVHiJ3vyaCufv2DKTDlXb4,4158
|
297
297
|
nucliadb/train/lifecycle.py,sha256=a96KuAVZ0sf9TVVW6v6szVXn2eGBaboszwnv_HdWiZk,1820
|
298
298
|
nucliadb/train/models.py,sha256=BmgmMjDsu_1Ih5JDAqo6whhume90q0ASJcDP9dkMQm8,1198
|
299
299
|
nucliadb/train/nodes.py,sha256=HROQMRw2g5sJTnuBagh3B0id3iWonRJ68tg3skOme9k,5748
|
@@ -321,7 +321,7 @@ nucliadb/train/generators/paragraph_streaming.py,sha256=dsM7a5hBd2iokvFuxnZhQeko
|
|
321
321
|
nucliadb/train/generators/question_answer_streaming.py,sha256=P7-de4W4yW2mgEQ82fF2OZVyx6QJHXezY52qDciDcmw,5680
|
322
322
|
nucliadb/train/generators/sentence_classifier.py,sha256=DuvXfnWvLhklYR_qFGk2LqUyl2JE7CMVFwuHaPyC9Ys,5121
|
323
323
|
nucliadb/train/generators/token_classifier.py,sha256=0848GqoXh8ywU82cPUrkzOM53-lZ1MVCw--8yDABigY,9557
|
324
|
-
nucliadb/train/generators/utils.py,sha256=
|
324
|
+
nucliadb/train/generators/utils.py,sha256=0jbCfD50RORC2T-pmvRHX4iXbioZQ7ZoRKwImpd7U_4,3981
|
325
325
|
nucliadb/writer/__init__.py,sha256=S298mrZL3vr62OrBqi97mdLxgR5cReMlRJgnaQHZV7s,1304
|
326
326
|
nucliadb/writer/app.py,sha256=ABBO8-u4pDAa61b3mCdD0TFhuHAYcxMkgpZSGgWARuE,2736
|
327
327
|
nucliadb/writer/back_pressure.py,sha256=JaiC2JAugVA92gDHzABZFiuQexiOKZC9C-3Jn9VF-M0,17898
|
@@ -361,8 +361,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
361
361
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
362
362
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
363
363
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
364
|
-
nucliadb-6.3.5.
|
365
|
-
nucliadb-6.3.5.
|
366
|
-
nucliadb-6.3.5.
|
367
|
-
nucliadb-6.3.5.
|
368
|
-
nucliadb-6.3.5.
|
364
|
+
nucliadb-6.3.5.post4033.dist-info/METADATA,sha256=3k6HCLT1QjmqSlXDmUdknc4RqNGQQQ6Up3klneIqzwU,4301
|
365
|
+
nucliadb-6.3.5.post4033.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
366
|
+
nucliadb-6.3.5.post4033.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
367
|
+
nucliadb-6.3.5.post4033.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
368
|
+
nucliadb-6.3.5.post4033.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|