nucliadb 6.8.1.post4983__py3-none-any.whl → 6.8.1.post5003__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic.
- nucliadb/common/ids.py +77 -55
- nucliadb/ingest/fields/base.py +9 -0
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/search/cache.py +0 -19
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +3 -2
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +126 -0
- nucliadb/search/search/hydrator/paragraphs.py +305 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- {nucliadb-6.8.1.post4983.dist-info → nucliadb-6.8.1.post5003.dist-info}/METADATA +6 -6
- {nucliadb-6.8.1.post4983.dist-info → nucliadb-6.8.1.post5003.dist-info}/RECORD +16 -11
- {nucliadb-6.8.1.post4983.dist-info → nucliadb-6.8.1.post5003.dist-info}/WHEEL +0 -0
- {nucliadb-6.8.1.post4983.dist-info → nucliadb-6.8.1.post5003.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.8.1.post4983.dist-info → nucliadb-6.8.1.post5003.dist-info}/top_level.txt +0 -0
nucliadb/common/ids.py
CHANGED
@@ -47,6 +47,8 @@ FIELD_TYPE_NAME_TO_STR = {
     FieldTypeName.CONVERSATION: "c",
 }

+FIELD_TYPE_STR_TO_NAME = {v: k for k, v in FIELD_TYPE_NAME_TO_STR.items()}
+

 @dataclass
 class FieldId:
@@ -65,7 +67,7 @@ class FieldId:

     Examples:

-    >>> FieldId(rid="rid", type="u", key="
+    >>> FieldId(rid="rid", type="u", key="my-link")
     FieldID("rid/u/my-link")
     >>> FieldId.from_string("rid/u/my-link")
     FieldID("rid/u/my-link")
@@ -77,31 +79,6 @@ class FieldId:
     # also knwon as `split`, this indicates a part of a field in, for example, conversations
     subfield_id: Optional[str] = None

-    def __repr__(self) -> str:
-        return f"FieldId({self.full()})"
-
-    def short_without_subfield(self) -> str:
-        return f"/{self.type}/{self.key}"
-
-    def full(self) -> str:
-        if self.subfield_id is None:
-            return f"{self.rid}/{self.type}/{self.key}"
-        else:
-            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def pb_type(self) -> FieldType.ValueType:
-        return FIELD_TYPE_STR_TO_PB[self.type]
-
-    @classmethod
-    def from_pb(
-        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
-    ) -> "FieldId":
-        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
-
     @classmethod
     def from_string(cls, value: str) -> "FieldId":
         """
@@ -120,11 +97,11 @@ class FieldId:
         parts = value.split("/")
         if len(parts) == 3:
             rid, _type, key = parts
-            _type = cls.
+            _type = cls._parse_field_type(_type)
             return cls(rid=rid, type=_type, key=key)
         elif len(parts) == 4:
             rid, _type, key, subfield_id = parts
-            _type = cls.
+            _type = cls._parse_field_type(_type)
             return cls(
                 rid=rid,
                 type=_type,
@@ -135,7 +112,46 @@ class FieldId:
         raise ValueError(f"Invalid FieldId: {value}")

     @classmethod
-    def
+    def from_pb(
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
+    ) -> "FieldId":
+        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+
+    @property
+    def pb_type(self) -> FieldType.ValueType:
+        return FIELD_TYPE_STR_TO_PB[self.type]
+
+    def full(self) -> str:
+        if self.subfield_id is None:
+            return f"{self.rid}/{self.type}/{self.key}"
+        else:
+            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+
+    def short_without_subfield(self) -> str:
+        return f"/{self.type}/{self.key}"
+
+    def paragraph_id(self, paragraph_start: int, paragraph_end: int) -> "ParagraphId":
+        """Generate a ParagraphId from the current field given its start and
+        end.
+
+        """
+        return ParagraphId(
+            field_id=self,
+            paragraph_start=paragraph_start,
+            paragraph_end=paragraph_end,
+        )
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"FieldId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
+    @staticmethod
+    def _parse_field_type(_type: str) -> str:
         if _type not in FIELD_TYPE_STR_TO_PB:
             # Try to parse the enum value
             # XXX: This is to support field types that are integer values of FieldType
@@ -157,19 +173,6 @@ class ParagraphId:
     paragraph_start: int
     paragraph_end: int

-    def __repr__(self) -> str:
-        return f"ParagraphId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "ParagraphId":
         parts = value.split("/")
@@ -192,6 +195,22 @@ class ParagraphId:
             paragraph_end=vid.vector_end,
         )

+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"ParagraphId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+

 @dataclass
 class VectorId:
@@ -217,19 +236,6 @@ class VectorId:
     vector_start: int
     vector_end: int

-    def __repr__(self) -> str:
-        return f"VectorId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "VectorId":
         parts = value.split("/")
@@ -239,6 +245,22 @@ class VectorId:
         field_id = FieldId.from_string("/".join(parts[:-2]))
         return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)

+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"VectorId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+

 def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
     """Data augmentation generated fields have a strict id with the following
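Note: the ids.py change above mostly reorders the FieldId / ParagraphId / VectorId helpers and adds __str__ plus a FieldId.paragraph_id() constructor. A small runnable sketch of how the parsers and the new helper compose (the identifiers are illustrative):

from nucliadb.common.ids import FieldId, ParagraphId

field = FieldId.from_string("rid/u/my-link")
# __str__ now delegates to full(), so printing a FieldId yields the canonical id
assert str(field) == field.full() == "rid/u/my-link"

# new helper: build a ParagraphId directly from a field plus a character range
paragraph = field.paragraph_id(paragraph_start=0, paragraph_end=10)
assert paragraph.full() == "rid/u/my-link/0-10"
assert ParagraphId.from_string(paragraph.full()).full() == paragraph.full()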
nucliadb/ingest/fields/base.py
CHANGED
@@ -29,6 +29,7 @@ from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
 from google.protobuf.message import DecodeError, Message

 from nucliadb.common import datamanagers
+from nucliadb.common.ids import FieldId
 from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
@@ -125,6 +126,14 @@ class Field(Generic[PbType]):
     def uuid(self) -> str:
         return self.resource.uuid

+    @property
+    def field_id(self) -> FieldId:
+        return FieldId(
+            rid=self.resource.uuid,
+            type=self.type,
+            key=self.id,
+        )
+
     @property
     def storage(self) -> Storage:
         return self.resource.storage
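Note: base.py adds a field_id property so an ORM Field can hand back its own FieldId. A hedged sketch, assuming `field` is any Field instance already loaded from a resource (e.g. via get_field()):

from nucliadb.ingest.fields.base import Field

def canonical_field_id(field: Field) -> str:
    # hypothetical helper: field.field_id packs the resource uuid, the field
    # type letter and the field key into a FieldId
    return field.field_id.full()  # "<rid>/<type>/<key>", the format FieldId.from_string() accepts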
nucliadb/ingest/orm/resource.py
CHANGED
@@ -135,7 +135,7 @@ class Resource:
         await self.txn.set(new_key, self.uuid.encode())

     # Basic
-    async def get_basic(self) ->
+    async def get_basic(self) -> PBBasic:
         if self.basic is None:
             basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
             self.basic = basic if basic is not None else PBBasic()
@@ -354,7 +354,16 @@ class Resource:

         await field_obj.delete()

+    async def field_exists(self, type: FieldType.ValueType, field: str) -> bool:
+        """Return whether this resource has this field or not."""
+        all_fields_ids = await self.get_fields_ids()
+        for field_type, field_id in all_fields_ids:
+            if field_type == type and field_id == field:
+                return True
+        return False
+
     def has_field(self, type: FieldType.ValueType, field: str) -> bool:
+        # REVIEW: are we sure we don't want to actually check this?
         return (type, field) in self.fields

     async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
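Note: resource.py gains an async field_exists() check next to the existing in-memory has_field(). A hedged sketch of the difference, assuming `resource` is a loaded ORM Resource inside a transaction:

from nucliadb.ingest.orm.resource import Resource
from nucliadb_protos.resources_pb2 import FieldType

async def check_text_field(resource: Resource) -> None:
    # has_field() only consults fields already materialized in memory on this object
    in_memory = resource.has_field(FieldType.TEXT, "my-text")
    # field_exists() looks up the resource's stored field ids, so it reflects what is persisted
    persisted = await resource.field_exists(FieldType.TEXT, "my-text")
    print(in_memory, persisted)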
nucliadb/search/api/v1/hydrate.py
ADDED
@@ -0,0 +1,328 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import asyncio
+from typing import Awaitable, Optional, Union
+
+from async_lru import alru_cache
+from fastapi import Request, Response
+from fastapi_versioning import version
+
+from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
+from nucliadb.ingest.fields.base import Field
+from nucliadb.search.api.v1.router import KB_PREFIX, api
+from nucliadb.search.search import cache
+from nucliadb.search.search.cache import request_caches
+from nucliadb.search.search.hydrator.fields import hydrate_field, page_preview_id
+from nucliadb.search.search.hydrator.images import (
+    download_page_preview,
+)
+from nucliadb.search.search.hydrator.paragraphs import ParagraphIndex, hydrate_paragraph
+from nucliadb.search.search.hydrator.resources import hydrate_resource
+from nucliadb_models.hydration import (
+    Hydrated,
+    HydratedConversationField,
+    HydratedFileField,
+    HydratedGenericField,
+    HydratedLinkField,
+    HydratedParagraph,
+    HydratedResource,
+    HydratedTextField,
+    HydrateRequest,
+    Hydration,
+    ParagraphHydration,
+)
+from nucliadb_models.resource import NucliaDBRoles
+from nucliadb_models.search import Image
+from nucliadb_utils.authentication import requires
+
+
+@api.post(
+    f"/{KB_PREFIX}/{{kbid}}/hydrate",
+    status_code=200,
+    summary="Hydrate a set of paragraphs",
+    description="Internal API endpoint to hydrate a set of paragraphs",
+    include_in_schema=False,
+    response_model_exclude_unset=True,
+    tags=["Hydration"],
+)
+@requires(NucliaDBRoles.READER)
+@version(1)
+async def hydrate_endpoint(
+    request: Request,
+    response: Response,
+    kbid: str,
+    item: HydrateRequest,
+) -> Hydrated:
+    with request_caches():
+        return await Hydrator(kbid, item.hydration).hydrate(item.data)
+
+
+class HydratedBuilder:
+    """Builder class to construct an Hydrated payload."""
+
+    def __init__(self) -> None:
+        self._resources: dict[str, HydratedResource] = {}
+        self._fields: dict[
+            str,
+            Union[
+                HydratedTextField,
+                HydratedFileField,
+                HydratedLinkField,
+                HydratedConversationField,
+                HydratedGenericField,
+            ],
+        ] = {}
+        self._paragraphs: dict[str, HydratedParagraph] = {}
+
+    @property
+    def resources(self) -> dict[str, HydratedResource]:
+        return self._resources
+
+    @property
+    def fields(
+        self,
+    ) -> dict[
+        str,
+        Union[
+            HydratedTextField,
+            HydratedFileField,
+            HydratedLinkField,
+            HydratedConversationField,
+            HydratedGenericField,
+        ],
+    ]:
+        return self._fields
+
+    @property
+    def paragraphs(self) -> dict[str, HydratedParagraph]:
+        return self._paragraphs
+
+    def build(self) -> Hydrated:
+        return Hydrated(
+            resources=self._resources,
+            fields=self._fields,
+            paragraphs=self._paragraphs,
+        )
+
+    def add_resource(self, rid: str, resource: HydratedResource):
+        self._resources[rid] = resource
+
+    def add_field(
+        self,
+        field_id: FieldId,
+        field: Union[
+            HydratedTextField,
+            HydratedFileField,
+            HydratedLinkField,
+            HydratedConversationField,
+            HydratedGenericField,
+        ],
+    ):
+        self._fields[field_id.full()] = field
+
+    def has_field(self, field_id: FieldId) -> bool:
+        return field_id.full() in self._fields
+
+    def add_paragraph(self, paragraph_id: ParagraphId, paragraph: HydratedParagraph):
+        self._paragraphs[paragraph_id.full()] = paragraph
+
+    def add_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
+        field_id = paragraph_id.field_id
+        field = self._fields[field_id.full()]
+
+        if not isinstance(field, HydratedFileField):
+            # Other field types have no page preview concept
+            return
+
+        if field.previews is None:
+            field.previews = {}
+
+        preview_id = page_preview_id(page)
+        field.previews[preview_id] = image
+
+        paragraph = self._paragraphs[paragraph_id.full()]
+        assert paragraph.page is not None, "should already be set"
+        paragraph.page.page_preview_ref = preview_id
+
+    def add_table_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
+        field_id = paragraph_id.field_id
+        field = self._fields[field_id.full()]
+
+        if not isinstance(field, HydratedFileField):
+            # Other field types have no page preview concept
+            return
+
+        if field.previews is None:
+            field.previews = {}
+
+        preview_id = page_preview_id(page)
+        field.previews[preview_id] = image
+
+        paragraph = self._paragraphs[paragraph_id.full()]
+        assert paragraph.table is not None, "should already be set"
+        paragraph.table.page_preview_ref = preview_id
+
+
+class Hydrator:
+    def __init__(self, kbid: str, config: Hydration):
+        self.kbid = kbid
+        self.config = config
+        self.hydrated = HydratedBuilder()
+
+        # cached paragraphs per field
+        self.field_paragraphs: dict[FieldId, ParagraphIndex] = {}
+
+        self.max_ops = asyncio.Semaphore(50)
+
+    async def hydrate(self, paragraph_ids: list[str]) -> Hydrated:
+        paragraph_tasks = {}
+        field_tasks = {}
+        resource_tasks = {}
+
+        unique_paragraph_ids = set(paragraph_ids)
+        for user_paragraph_id in unique_paragraph_ids:
+            try:
+                paragraph_id = ParagraphId.from_string(user_paragraph_id)
+            except ValueError:
+                # skip paragraphs with invalid format
+                continue
+
+            field_id = paragraph_id.field_id
+            rid = paragraph_id.rid
+
+            resource = await cache.get_resource(self.kbid, rid)
+            if resource is None:
+                # skip resources that aren't in the DB
+                continue
+
+            field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
+            if not (await resource.field_exists(field_type_pb, field_id.key)):
+                # skip a fields that aren't in the DB
+                continue
+            field = await resource.get_field(field_id.key, field_id.pb_type)
+
+            if field_id not in self.field_paragraphs:
+                field_paragraphs_index = ParagraphIndex(field_id)
+                self.field_paragraphs[field_id] = field_paragraphs_index
+            field_paragraphs_index = self.field_paragraphs[field_id]
+
+            paragraph_tasks[paragraph_id] = asyncio.create_task(
+                self._limited_concurrency(
+                    hydrate_paragraph(
+                        resource, field, paragraph_id, self.config.paragraph, field_paragraphs_index
+                    ),
+                )
+            )
+
+            if field_id not in field_tasks:
+                field_tasks[field_id] = asyncio.create_task(
+                    self._limited_concurrency(hydrate_field(resource, field_id, self.config.field))
+                )
+
+            if rid not in resource_tasks:
+                if self.config.resource is not None:
+                    resource_tasks[rid] = asyncio.create_task(
+                        self._limited_concurrency(hydrate_resource(resource, rid, self.config.resource))
+                    )
+
+        ops = [
+            *paragraph_tasks.values(),
+            *field_tasks.values(),
+            *resource_tasks.values(),
+        ]
+        results = await asyncio.gather(*ops)
+        hydrated_paragraphs = results[: len(paragraph_tasks)]
+        hydrated_fields = results[len(paragraph_tasks) : len(paragraph_tasks) + len(field_tasks)]
+        hydrated_resources = results[
+            len(paragraph_tasks) + len(field_tasks) : len(paragraph_tasks)
+            + len(field_tasks)
+            + len(resource_tasks)
+        ]
+
+        for rid, hydrated_resource in zip(resource_tasks.keys(), hydrated_resources):
+            self.hydrated.add_resource(rid, hydrated_resource)
+
+        for field_id, hydrated_field in zip(field_tasks.keys(), hydrated_fields):
+            if hydrated_field is not None:
+                self.hydrated.add_field(field_id, hydrated_field)
+
+        for paragraph_id, (hydrated_paragraph, extra) in zip(
+            paragraph_tasks.keys(), hydrated_paragraphs
+        ):
+            self.hydrated.add_paragraph(paragraph_id, hydrated_paragraph)
+
+            for related_paragraph_id in extra.related_paragraph_ids:
+                field_id = related_paragraph_id.field_id
+                rid = related_paragraph_id.rid
+
+                resource = await cache.get_resource(self.kbid, rid)
+                if resource is None:
+                    # skip resources that aren't in the DB
+                    continue
+
+                field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
+                if not (await resource.field_exists(field_type_pb, field_id.key)):
+                    # skip a fields that aren't in the DB
+                    continue
+                field = await resource.get_field(field_id.key, field_id.pb_type)
+
+                if field_id not in self.field_paragraphs:
+                    field_paragraphs_index = ParagraphIndex(field_id)
+                    self.field_paragraphs[field_id] = field_paragraphs_index
+                field_paragraphs_index = self.field_paragraphs[field_id]
+
+                (hydrated_paragraph, _) = await hydrate_paragraph(
+                    resource,
+                    field,
+                    related_paragraph_id,
+                    ParagraphHydration(
+                        text=self.config.paragraph.text, image=None, table=None, page=None, related=None
+                    ),
+                    field_paragraphs_index,
+                )
+                self.hydrated.add_paragraph(related_paragraph_id, hydrated_paragraph)
+
+            if self.hydrated.has_field(field_id):
+                # we only hydrate page and table previews for fields the user
+                # allowed hydration, skipping fields with explicitly disabled
+                # hydration
+
+                if extra.field_page is not None:
+                    page_number = extra.field_page
+                    preview = await self.cached_download_page_preview(field, page_number)
+                    if preview is not None:
+                        self.hydrated.add_page_preview(paragraph_id, page_number, preview)
+
+                if extra.field_table_page is not None:
+                    page_number = extra.field_table_page
+                    preview = await self.cached_download_page_preview(field, page_number)
+                    if preview is not None:
+                        self.hydrated.add_table_page_preview(paragraph_id, page_number, preview)
+
+        return self.hydrated.build()
+
+    # TODO: proper typing
+    async def _limited_concurrency(self, aw: Awaitable):
+        async with self.max_ops:
+            return await aw
+
+    @alru_cache(maxsize=None)
+    async def cached_download_page_preview(self, field: Field, page: int) -> Optional[Image]:
+        return await download_page_preview(field, page)
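Note: the new Hydrator above fans out one task per paragraph, field and resource, but wraps every awaitable in a semaphore (max_ops = asyncio.Semaphore(50)) before handing the tasks to asyncio.gather. A standalone, runnable sketch of that concurrency-limiting pattern (the names below are illustrative, not part of the package):

import asyncio

async def fetch(i: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for a storage or DB call
    return i

async def main() -> None:
    max_ops = asyncio.Semaphore(50)  # same bound the Hydrator uses

    async def limited(aw):
        # mirrors Hydrator._limited_concurrency: at most 50 awaitables run at once
        async with max_ops:
            return await aw

    tasks = [asyncio.create_task(limited(fetch(i))) for i in range(200)]
    results = await asyncio.gather(*tasks)
    print(len(results))  # 200

asyncio.run(main())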
nucliadb/search/search/cache.py
CHANGED
@@ -21,8 +21,6 @@ import contextlib
 import logging
 from typing import Optional

-import backoff
-
 from nucliadb.common.cache import (
     extracted_text_cache,
     get_extracted_text_cache,
@@ -74,23 +72,6 @@ async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
     return extracted_text


-@backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
-async def field_get_extracted_text(field: Field) -> Optional[ExtractedText]:
-    try:
-        return await field.get_extracted_text()
-    except Exception:
-        logger.warning(
-            "Error getting extracted text for field. Retrying",
-            exc_info=True,
-            extra={
-                "kbid": field.kbid,
-                "resource_id": field.resource.uuid,
-                "field": f"{field.type}/{field.id}",
-            },
-        )
-        raise
-
-
 async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
     rid = field.rid
     orm_resource = await get_resource(kbid, rid)
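Note: cache.py drops the backoff-wrapped field_get_extracted_text() helper along with the backoff import; the FieldId-based helper below it remains. A hedged usage sketch of that remaining helper, run inside the per-request caches (the identifiers are illustrative):

from nucliadb.common.ids import FieldId
from nucliadb.search.search.cache import get_extracted_text_from_field_id, request_caches

async def field_text(kbid: str) -> None:
    field = FieldId.from_string("rid/u/my-link")  # illustrative field id
    with request_caches():  # per-request resource/extracted-text caches
        extracted = await get_extracted_text_from_field_id(kbid, field)
        if extracted is not None:
            print(extracted.text)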
nucliadb/search/search/{hydrator.py → hydrator/__init__.py}
RENAMED
@@ -28,7 +28,8 @@ from nucliadb.common.external_index_providers.base import TextBlockMatch
 from nucliadb.common.ids import FieldId
 from nucliadb.common.maindb.utils import get_driver
 from nucliadb.ingest.serialize import managed_serialize
-from nucliadb.search.search import cache
+from nucliadb.search.search import cache
+from nucliadb.search.search.paragraphs import get_paragraph_text
 from nucliadb_models.common import FieldTypeName
 from nucliadb_models.resource import ExtractedDataTypeName, Resource
 from nucliadb_models.search import (
@@ -170,7 +171,7 @@ async def hydrate_text_block(
     if concurrency_control is not None:
         await stack.enter_async_context(concurrency_control)

-    text_block.text = await
+    text_block.text = await get_paragraph_text(
        kbid=kbid,
        paragraph_id=text_block.paragraph_id,
        highlight=options.highlight,
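Note: after the rename, text-block hydration calls get_paragraph_text directly from nucliadb.search.search.paragraphs. A hedged call sketch using only the keyword arguments visible in the hunk (the id is illustrative and the function may accept further optional parameters not shown here):

from nucliadb.common.ids import ParagraphId
from nucliadb.search.search.paragraphs import get_paragraph_text

async def paragraph_text(kbid: str) -> str:
    paragraph_id = ParagraphId.from_string("rid/u/my-link/0-10")  # illustrative
    return await get_paragraph_text(
        kbid=kbid,
        paragraph_id=paragraph_id,
        highlight=False,
    )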