nucliadb 6.8.1.post4981__py3-none-any.whl → 6.8.1.post4988__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic. Click here for more details.

nucliadb/common/ids.py CHANGED
@@ -47,6 +47,8 @@ FIELD_TYPE_NAME_TO_STR = {
47
47
  FieldTypeName.CONVERSATION: "c",
48
48
  }
49
49
 
50
+ FIELD_TYPE_STR_TO_NAME = {v: k for k, v in FIELD_TYPE_NAME_TO_STR.items()}
51
+
50
52
 
51
53
  @dataclass
52
54
  class FieldId:
@@ -65,7 +67,7 @@ class FieldId:
65
67
 
66
68
  Examples:
67
69
 
68
- >>> FieldId(rid="rid", type="u", key="/my-link")
70
+ >>> FieldId(rid="rid", type="u", key="my-link")
69
71
  FieldId(rid/u/my-link)
70
72
  >>> FieldId.from_string("rid/u/my-link")
71
73
  FieldId(rid/u/my-link)
@@ -77,31 +79,6 @@ class FieldId:
77
79
  # also known as `split`, this indicates a part of a field in, for example, conversations
78
80
  subfield_id: Optional[str] = None
79
81
 
80
- def __repr__(self) -> str:
81
- return f"FieldId({self.full()})"
82
-
83
- def short_without_subfield(self) -> str:
84
- return f"/{self.type}/{self.key}"
85
-
86
- def full(self) -> str:
87
- if self.subfield_id is None:
88
- return f"{self.rid}/{self.type}/{self.key}"
89
- else:
90
- return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
91
-
92
- def __hash__(self) -> int:
93
- return hash(self.full())
94
-
95
- @property
96
- def pb_type(self) -> FieldType.ValueType:
97
- return FIELD_TYPE_STR_TO_PB[self.type]
98
-
99
- @classmethod
100
- def from_pb(
101
- cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
102
- ) -> "FieldId":
103
- return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
104
-
105
82
  @classmethod
106
83
  def from_string(cls, value: str) -> "FieldId":
107
84
  """
@@ -120,11 +97,11 @@ class FieldId:
120
97
  parts = value.split("/")
121
98
  if len(parts) == 3:
122
99
  rid, _type, key = parts
123
- _type = cls.parse_field_type(_type)
100
+ _type = cls._parse_field_type(_type)
124
101
  return cls(rid=rid, type=_type, key=key)
125
102
  elif len(parts) == 4:
126
103
  rid, _type, key, subfield_id = parts
127
- _type = cls.parse_field_type(_type)
104
+ _type = cls._parse_field_type(_type)
128
105
  return cls(
129
106
  rid=rid,
130
107
  type=_type,
@@ -135,7 +112,46 @@ class FieldId:
135
112
  raise ValueError(f"Invalid FieldId: {value}")
136
113
 
137
114
  @classmethod
138
- def parse_field_type(cls, _type: str) -> str:
115
+ def from_pb(
116
+ cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
117
+ ) -> "FieldId":
118
+ return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
119
+
120
+ @property
121
+ def pb_type(self) -> FieldType.ValueType:
122
+ return FIELD_TYPE_STR_TO_PB[self.type]
123
+
124
+ def full(self) -> str:
125
+ if self.subfield_id is None:
126
+ return f"{self.rid}/{self.type}/{self.key}"
127
+ else:
128
+ return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
129
+
130
+ def short_without_subfield(self) -> str:
131
+ return f"/{self.type}/{self.key}"
132
+
133
+ def paragraph_id(self, paragraph_start: int, paragraph_end: int) -> "ParagraphId":
134
+ """Generate a ParagraphId from the current field given its start and
135
+ end.
136
+
137
+ """
138
+ return ParagraphId(
139
+ field_id=self,
140
+ paragraph_start=paragraph_start,
141
+ paragraph_end=paragraph_end,
142
+ )
143
+
144
+ def __str__(self) -> str:
145
+ return self.full()
146
+
147
+ def __repr__(self) -> str:
148
+ return f"FieldId({self.full()})"
149
+
150
+ def __hash__(self) -> int:
151
+ return hash(self.full())
152
+
153
+ @staticmethod
154
+ def _parse_field_type(_type: str) -> str:
139
155
  if _type not in FIELD_TYPE_STR_TO_PB:
140
156
  # Try to parse the enum value
141
157
  # XXX: This is to support field types that are integer values of FieldType
@@ -157,19 +173,6 @@ class ParagraphId:
157
173
  paragraph_start: int
158
174
  paragraph_end: int
159
175
 
160
- def __repr__(self) -> str:
161
- return f"ParagraphId({self.full()})"
162
-
163
- def full(self) -> str:
164
- return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
165
-
166
- def __hash__(self) -> int:
167
- return hash(self.full())
168
-
169
- @property
170
- def rid(self) -> str:
171
- return self.field_id.rid
172
-
173
176
  @classmethod
174
177
  def from_string(cls, value: str) -> "ParagraphId":
175
178
  parts = value.split("/")
@@ -192,6 +195,22 @@ class ParagraphId:
192
195
  paragraph_end=vid.vector_end,
193
196
  )
194
197
 
198
+ @property
199
+ def rid(self) -> str:
200
+ return self.field_id.rid
201
+
202
+ def full(self) -> str:
203
+ return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
204
+
205
+ def __str__(self) -> str:
206
+ return self.full()
207
+
208
+ def __repr__(self) -> str:
209
+ return f"ParagraphId({self.full()})"
210
+
211
+ def __hash__(self) -> int:
212
+ return hash(self.full())
213
+
195
214
 
196
215
  @dataclass
197
216
  class VectorId:
@@ -217,19 +236,6 @@ class VectorId:
217
236
  vector_start: int
218
237
  vector_end: int
219
238
 
220
- def __repr__(self) -> str:
221
- return f"VectorId({self.full()})"
222
-
223
- def full(self) -> str:
224
- return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
225
-
226
- def __hash__(self) -> int:
227
- return hash(self.full())
228
-
229
- @property
230
- def rid(self) -> str:
231
- return self.field_id.rid
232
-
233
239
  @classmethod
234
240
  def from_string(cls, value: str) -> "VectorId":
235
241
  parts = value.split("/")
@@ -239,6 +245,22 @@ class VectorId:
239
245
  field_id = FieldId.from_string("/".join(parts[:-2]))
240
246
  return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)
241
247
 
248
+ @property
249
+ def rid(self) -> str:
250
+ return self.field_id.rid
251
+
252
+ def full(self) -> str:
253
+ return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
254
+
255
+ def __str__(self) -> str:
256
+ return self.full()
257
+
258
+ def __repr__(self) -> str:
259
+ return f"VectorId({self.full()})"
260
+
261
+ def __hash__(self) -> int:
262
+ return hash(self.full())
263
+
242
264
 
243
265
  def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
244
266
  """Data augmentation generated fields have a strict id with the following
@@ -29,6 +29,7 @@ from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
29
29
  from google.protobuf.message import DecodeError, Message
30
30
 
31
31
  from nucliadb.common import datamanagers
32
+ from nucliadb.common.ids import FieldId
32
33
  from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
33
34
  from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
34
35
  from nucliadb_protos.resources_pb2 import (
@@ -125,6 +126,14 @@ class Field(Generic[PbType]):
125
126
  def uuid(self) -> str:
126
127
  return self.resource.uuid
127
128
 
129
+ @property
130
+ def field_id(self) -> FieldId:
131
+ return FieldId(
132
+ rid=self.resource.uuid,
133
+ type=self.type,
134
+ key=self.id,
135
+ )
136
+
128
137
  @property
129
138
  def storage(self) -> Storage:
130
139
  return self.resource.storage
@@ -135,7 +135,7 @@ class Resource:
135
135
  await self.txn.set(new_key, self.uuid.encode())
136
136
 
137
137
  # Basic
138
- async def get_basic(self) -> Optional[PBBasic]:
138
+ async def get_basic(self) -> PBBasic:
139
139
  if self.basic is None:
140
140
  basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
141
141
  self.basic = basic if basic is not None else PBBasic()
@@ -354,7 +354,16 @@ class Resource:
354
354
 
355
355
  await field_obj.delete()
356
356
 
357
+ async def field_exists(self, type: FieldType.ValueType, field: str) -> bool:
358
+ """Return whether this resource has this field or not."""
359
+ all_fields_ids = await self.get_fields_ids()
360
+ for field_type, field_id in all_fields_ids:
361
+ if field_type == type and field_id == field:
362
+ return True
363
+ return False
364
+
357
365
  def has_field(self, type: FieldType.ValueType, field: str) -> bool:
366
+ # REVIEW: are we sure we don't want to actually check this?
358
367
  return (type, field) in self.fields
359
368
 
360
369
  async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
@@ -19,7 +19,7 @@
19
19
  #
20
20
  from typing import Dict
21
21
 
22
- from fastapi import Request
22
+ from fastapi import Header, Request
23
23
  from fastapi_versioning import version
24
24
  from nuclia_models.config.proto import ExtractConfig, SplitConfiguration
25
25
 
@@ -60,15 +60,11 @@ async def download_model(
60
60
  )
61
61
  @requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
62
62
  @version(1)
63
- async def get_configuration(
64
- request: Request,
65
- kbid: str,
66
- ):
63
+ async def get_configuration(request: Request, kbid: str):
67
64
  return await learning_config_proxy(
68
65
  request,
69
66
  "GET",
70
67
  f"/config/{kbid}",
71
- headers={"account-id": request.headers.get("x-nucliadb-account", "")},
72
68
  )
73
69
 
74
70
 
@@ -122,14 +118,13 @@ async def get_model(
122
118
  @requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
123
119
  @version(1)
124
120
  async def get_schema_for_configuration_updates(
125
- request: Request,
126
- kbid: str,
121
+ request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
127
122
  ):
128
123
  return await learning_config_proxy(
129
124
  request,
130
125
  "GET",
131
126
  f"/schema/{kbid}",
132
- headers={"account-id": request.headers.get("x-nucliadb-account", "")},
127
+ headers={"account-id": x_nucliadb_account},
133
128
  )
134
129
 
135
130
 
@@ -23,6 +23,7 @@ from . import ( # noqa: F401
23
23
  feedback,
24
24
  find,
25
25
  graph,
26
+ hydrate,
26
27
  knowledgebox,
27
28
  predict_proxy,
28
29
  search,
@@ -0,0 +1,328 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ from typing import Awaitable, Optional, Union
22
+
23
+ from async_lru import alru_cache
24
+ from fastapi import Request, Response
25
+ from fastapi_versioning import version
26
+
27
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
28
+ from nucliadb.ingest.fields.base import Field
29
+ from nucliadb.search.api.v1.router import KB_PREFIX, api
30
+ from nucliadb.search.search import cache
31
+ from nucliadb.search.search.cache import request_caches
32
+ from nucliadb.search.search.hydrator.fields import hydrate_field, page_preview_id
33
+ from nucliadb.search.search.hydrator.images import (
34
+ download_page_preview,
35
+ )
36
+ from nucliadb.search.search.hydrator.paragraphs import ParagraphIndex, hydrate_paragraph
37
+ from nucliadb.search.search.hydrator.resources import hydrate_resource
38
+ from nucliadb_models.hydration import (
39
+ Hydrated,
40
+ HydratedConversationField,
41
+ HydratedFileField,
42
+ HydratedGenericField,
43
+ HydratedLinkField,
44
+ HydratedParagraph,
45
+ HydratedResource,
46
+ HydratedTextField,
47
+ HydrateRequest,
48
+ Hydration,
49
+ ParagraphHydration,
50
+ )
51
+ from nucliadb_models.resource import NucliaDBRoles
52
+ from nucliadb_models.search import Image
53
+ from nucliadb_utils.authentication import requires
54
+
55
+
56
+ @api.post(
57
+ f"/{KB_PREFIX}/{{kbid}}/hydrate",
58
+ status_code=200,
59
+ summary="Hydrate a set of paragraphs",
60
+ description="Internal API endpoint to hydrate a set of paragraphs",
61
+ include_in_schema=False,
62
+ response_model_exclude_unset=True,
63
+ tags=["Hydration"],
64
+ )
65
+ @requires(NucliaDBRoles.READER)
66
+ @version(1)
67
+ async def hydrate_endpoint(
68
+ request: Request,
69
+ response: Response,
70
+ kbid: str,
71
+ item: HydrateRequest,
72
+ ) -> Hydrated:
73
+ with request_caches():
74
+ return await Hydrator(kbid, item.hydration).hydrate(item.data)
75
+
76
+
77
+ class HydratedBuilder:
78
+ """Builder class to construct an Hydrated payload."""
79
+
80
+ def __init__(self) -> None:
81
+ self._resources: dict[str, HydratedResource] = {}
82
+ self._fields: dict[
83
+ str,
84
+ Union[
85
+ HydratedTextField,
86
+ HydratedFileField,
87
+ HydratedLinkField,
88
+ HydratedConversationField,
89
+ HydratedGenericField,
90
+ ],
91
+ ] = {}
92
+ self._paragraphs: dict[str, HydratedParagraph] = {}
93
+
94
+ @property
95
+ def resources(self) -> dict[str, HydratedResource]:
96
+ return self._resources
97
+
98
+ @property
99
+ def fields(
100
+ self,
101
+ ) -> dict[
102
+ str,
103
+ Union[
104
+ HydratedTextField,
105
+ HydratedFileField,
106
+ HydratedLinkField,
107
+ HydratedConversationField,
108
+ HydratedGenericField,
109
+ ],
110
+ ]:
111
+ return self._fields
112
+
113
+ @property
114
+ def paragraphs(self) -> dict[str, HydratedParagraph]:
115
+ return self._paragraphs
116
+
117
+ def build(self) -> Hydrated:
118
+ return Hydrated(
119
+ resources=self._resources,
120
+ fields=self._fields,
121
+ paragraphs=self._paragraphs,
122
+ )
123
+
124
+ def add_resource(self, rid: str, resource: HydratedResource):
125
+ self._resources[rid] = resource
126
+
127
+ def add_field(
128
+ self,
129
+ field_id: FieldId,
130
+ field: Union[
131
+ HydratedTextField,
132
+ HydratedFileField,
133
+ HydratedLinkField,
134
+ HydratedConversationField,
135
+ HydratedGenericField,
136
+ ],
137
+ ):
138
+ self._fields[field_id.full()] = field
139
+
140
+ def has_field(self, field_id: FieldId) -> bool:
141
+ return field_id.full() in self._fields
142
+
143
+ def add_paragraph(self, paragraph_id: ParagraphId, paragraph: HydratedParagraph):
144
+ self._paragraphs[paragraph_id.full()] = paragraph
145
+
146
+ def add_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
147
+ field_id = paragraph_id.field_id
148
+ field = self._fields[field_id.full()]
149
+
150
+ if not isinstance(field, HydratedFileField):
151
+ # Other field types have no page preview concept
152
+ return
153
+
154
+ if field.previews is None:
155
+ field.previews = {}
156
+
157
+ preview_id = page_preview_id(page)
158
+ field.previews[preview_id] = image
159
+
160
+ paragraph = self._paragraphs[paragraph_id.full()]
161
+ assert paragraph.page is not None, "should already be set"
162
+ paragraph.page.page_preview_ref = preview_id
163
+
164
+ def add_table_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
165
+ field_id = paragraph_id.field_id
166
+ field = self._fields[field_id.full()]
167
+
168
+ if not isinstance(field, HydratedFileField):
169
+ # Other field types have no page preview concept
170
+ return
171
+
172
+ if field.previews is None:
173
+ field.previews = {}
174
+
175
+ preview_id = page_preview_id(page)
176
+ field.previews[preview_id] = image
177
+
178
+ paragraph = self._paragraphs[paragraph_id.full()]
179
+ assert paragraph.table is not None, "should already be set"
180
+ paragraph.table.page_preview_ref = preview_id
181
+
182
+
183
+ class Hydrator:
184
+ def __init__(self, kbid: str, config: Hydration):
185
+ self.kbid = kbid
186
+ self.config = config
187
+ self.hydrated = HydratedBuilder()
188
+
189
+ # cached paragraphs per field
190
+ self.field_paragraphs: dict[FieldId, ParagraphIndex] = {}
191
+
192
+ self.max_ops = asyncio.Semaphore(50)
193
+
194
+ async def hydrate(self, paragraph_ids: list[str]) -> Hydrated:
195
+ paragraph_tasks = {}
196
+ field_tasks = {}
197
+ resource_tasks = {}
198
+
199
+ unique_paragraph_ids = set(paragraph_ids)
200
+ for user_paragraph_id in unique_paragraph_ids:
201
+ try:
202
+ paragraph_id = ParagraphId.from_string(user_paragraph_id)
203
+ except ValueError:
204
+ # skip paragraphs with invalid format
205
+ continue
206
+
207
+ field_id = paragraph_id.field_id
208
+ rid = paragraph_id.rid
209
+
210
+ resource = await cache.get_resource(self.kbid, rid)
211
+ if resource is None:
212
+ # skip resources that aren't in the DB
213
+ continue
214
+
215
+ field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
216
+ if not (await resource.field_exists(field_type_pb, field_id.key)):
217
+ # skip fields that aren't in the DB
218
+ continue
219
+ field = await resource.get_field(field_id.key, field_id.pb_type)
220
+
221
+ if field_id not in self.field_paragraphs:
222
+ field_paragraphs_index = ParagraphIndex(field_id)
223
+ self.field_paragraphs[field_id] = field_paragraphs_index
224
+ field_paragraphs_index = self.field_paragraphs[field_id]
225
+
226
+ paragraph_tasks[paragraph_id] = asyncio.create_task(
227
+ self._limited_concurrency(
228
+ hydrate_paragraph(
229
+ resource, field, paragraph_id, self.config.paragraph, field_paragraphs_index
230
+ ),
231
+ )
232
+ )
233
+
234
+ if field_id not in field_tasks:
235
+ field_tasks[field_id] = asyncio.create_task(
236
+ self._limited_concurrency(hydrate_field(resource, field_id, self.config.field))
237
+ )
238
+
239
+ if rid not in resource_tasks:
240
+ if self.config.resource is not None:
241
+ resource_tasks[rid] = asyncio.create_task(
242
+ self._limited_concurrency(hydrate_resource(resource, rid, self.config.resource))
243
+ )
244
+
245
+ ops = [
246
+ *paragraph_tasks.values(),
247
+ *field_tasks.values(),
248
+ *resource_tasks.values(),
249
+ ]
250
+ results = await asyncio.gather(*ops)
251
+ hydrated_paragraphs = results[: len(paragraph_tasks)]
252
+ hydrated_fields = results[len(paragraph_tasks) : len(paragraph_tasks) + len(field_tasks)]
253
+ hydrated_resources = results[
254
+ len(paragraph_tasks) + len(field_tasks) : len(paragraph_tasks)
255
+ + len(field_tasks)
256
+ + len(resource_tasks)
257
+ ]
258
+
259
+ for rid, hydrated_resource in zip(resource_tasks.keys(), hydrated_resources):
260
+ self.hydrated.add_resource(rid, hydrated_resource)
261
+
262
+ for field_id, hydrated_field in zip(field_tasks.keys(), hydrated_fields):
263
+ if hydrated_field is not None:
264
+ self.hydrated.add_field(field_id, hydrated_field)
265
+
266
+ for paragraph_id, (hydrated_paragraph, extra) in zip(
267
+ paragraph_tasks.keys(), hydrated_paragraphs
268
+ ):
269
+ self.hydrated.add_paragraph(paragraph_id, hydrated_paragraph)
270
+
271
+ for related_paragraph_id in extra.related_paragraph_ids:
272
+ field_id = related_paragraph_id.field_id
273
+ rid = related_paragraph_id.rid
274
+
275
+ resource = await cache.get_resource(self.kbid, rid)
276
+ if resource is None:
277
+ # skip resources that aren't in the DB
278
+ continue
279
+
280
+ field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
281
+ if not (await resource.field_exists(field_type_pb, field_id.key)):
282
+ # skip fields that aren't in the DB
283
+ continue
284
+ field = await resource.get_field(field_id.key, field_id.pb_type)
285
+
286
+ if field_id not in self.field_paragraphs:
287
+ field_paragraphs_index = ParagraphIndex(field_id)
288
+ self.field_paragraphs[field_id] = field_paragraphs_index
289
+ field_paragraphs_index = self.field_paragraphs[field_id]
290
+
291
+ (hydrated_paragraph, _) = await hydrate_paragraph(
292
+ resource,
293
+ field,
294
+ related_paragraph_id,
295
+ ParagraphHydration(
296
+ text=self.config.paragraph.text, image=None, table=None, page=None, related=None
297
+ ),
298
+ field_paragraphs_index,
299
+ )
300
+ self.hydrated.add_paragraph(related_paragraph_id, hydrated_paragraph)
301
+
302
+ if self.hydrated.has_field(field_id):
303
+ # we only hydrate page and table previews for fields whose
304
+ # hydration the user allowed, skipping fields with explicitly
305
+ # disabled hydration
306
+
307
+ if extra.field_page is not None:
308
+ page_number = extra.field_page
309
+ preview = await self.cached_download_page_preview(field, page_number)
310
+ if preview is not None:
311
+ self.hydrated.add_page_preview(paragraph_id, page_number, preview)
312
+
313
+ if extra.field_table_page is not None:
314
+ page_number = extra.field_table_page
315
+ preview = await self.cached_download_page_preview(field, page_number)
316
+ if preview is not None:
317
+ self.hydrated.add_table_page_preview(paragraph_id, page_number, preview)
318
+
319
+ return self.hydrated.build()
320
+
321
+ # TODO: proper typing
322
+ async def _limited_concurrency(self, aw: Awaitable):
323
+ async with self.max_ops:
324
+ return await aw
325
+
326
+ @alru_cache(maxsize=None)
327
+ async def cached_download_page_preview(self, field: Field, page: int) -> Optional[Image]:
328
+ return await download_page_preview(field, page)
@@ -21,8 +21,6 @@ import contextlib
21
21
  import logging
22
22
  from typing import Optional
23
23
 
24
- import backoff
25
-
26
24
  from nucliadb.common.cache import (
27
25
  extracted_text_cache,
28
26
  get_extracted_text_cache,
@@ -74,23 +72,6 @@ async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
74
72
  return extracted_text
75
73
 
76
74
 
77
- @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
78
- async def field_get_extracted_text(field: Field) -> Optional[ExtractedText]:
79
- try:
80
- return await field.get_extracted_text()
81
- except Exception:
82
- logger.warning(
83
- "Error getting extracted text for field. Retrying",
84
- exc_info=True,
85
- extra={
86
- "kbid": field.kbid,
87
- "resource_id": field.resource.uuid,
88
- "field": f"{field.type}/{field.id}",
89
- },
90
- )
91
- raise
92
-
93
-
94
75
  async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
95
76
  rid = field.rid
96
77
  orm_resource = await get_resource(kbid, rid)
@@ -28,7 +28,8 @@ from nucliadb.common.external_index_providers.base import TextBlockMatch
28
28
  from nucliadb.common.ids import FieldId
29
29
  from nucliadb.common.maindb.utils import get_driver
30
30
  from nucliadb.ingest.serialize import managed_serialize
31
- from nucliadb.search.search import cache, paragraphs
31
+ from nucliadb.search.search import cache
32
+ from nucliadb.search.search.paragraphs import get_paragraph_text
32
33
  from nucliadb_models.common import FieldTypeName
33
34
  from nucliadb_models.resource import ExtractedDataTypeName, Resource
34
35
  from nucliadb_models.search import (
@@ -170,7 +171,7 @@ async def hydrate_text_block(
170
171
  if concurrency_control is not None:
171
172
  await stack.enter_async_context(concurrency_control)
172
173
 
173
- text_block.text = await paragraphs.get_paragraph_text(
174
+ text_block.text = await get_paragraph_text(
174
175
  kbid=kbid,
175
176
  paragraph_id=text_block.paragraph_id,
176
177
  highlight=options.highlight,