nucliadb 6.3.5.post3985__py3-none-any.whl → 6.3.5.post3995__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +3 -2
- nucliadb/common/cluster/rollover.py +3 -3
- nucliadb/common/cluster/utils.py +8 -4
- nucliadb/common/external_index_providers/pinecone.py +7 -44
- nucliadb/ingest/fields/exceptions.py +4 -0
- nucliadb/ingest/orm/brain_v2.py +782 -0
- nucliadb/ingest/orm/index_message.py +409 -0
- nucliadb/ingest/orm/metrics.py +1 -1
- nucliadb/ingest/orm/processor/data_augmentation.py +2 -2
- nucliadb/ingest/orm/processor/pgcatalog.py +3 -2
- nucliadb/ingest/orm/processor/processor.py +61 -47
- nucliadb/ingest/orm/resource.py +70 -50
- nucliadb/ingest/orm/utils.py +1 -2
- nucliadb/ingest/processing.py +2 -54
- nucliadb/ingest/service/writer.py +2 -2
- nucliadb/models/internal/__init__.py +19 -0
- nucliadb/models/internal/processing.py +160 -0
- nucliadb/writer/api/v1/field.py +1 -1
- nucliadb/writer/api/v1/resource.py +2 -1
- nucliadb/writer/api/v1/upload.py +1 -1
- nucliadb/writer/resource/basic.py +2 -3
- nucliadb/writer/resource/field.py +13 -14
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/METADATA +6 -6
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/RECORD +27 -23
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/WHEEL +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/resource.py
CHANGED
@@ -68,7 +68,9 @@ from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
|
|
68
68
|
from nucliadb_protos.resources_pb2 import Origin as PBOrigin
|
69
69
|
from nucliadb_protos.resources_pb2 import Relations as PBRelations
|
70
70
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
71
|
+
from nucliadb_utils import const
|
71
72
|
from nucliadb_utils.storages.storage import Storage
|
73
|
+
from nucliadb_utils.utilities import has_feature
|
72
74
|
|
73
75
|
if TYPE_CHECKING: # pragma: no cover
|
74
76
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
@@ -156,6 +158,14 @@ class Resource:
|
|
156
158
|
if basic_in_payload.HasField("metadata") and basic_in_payload.metadata.useful:
|
157
159
|
current_basic.metadata.status = basic_in_payload.metadata.status
|
158
160
|
|
161
|
+
def has_index_message_v2_feature(self) -> bool:
|
162
|
+
return has_feature(
|
163
|
+
const.Features.INDEX_MESSAGE_GENERATION_V2,
|
164
|
+
context={
|
165
|
+
"kbid": self.kb.kbid,
|
166
|
+
},
|
167
|
+
)
|
168
|
+
|
159
169
|
@processor_observer.wrap({"type": "set_basic"})
|
160
170
|
async def set_basic(
|
161
171
|
self,
|
@@ -208,27 +218,29 @@ class Resource:
|
|
208
218
|
del self.basic.fieldmetadata[:]
|
209
219
|
self.basic.fieldmetadata.extend(updated)
|
210
220
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
if field_metadata is not None:
|
220
|
-
page_positions: Optional[FilePagePositions] = None
|
221
|
-
if isinstance(field_obj, File):
|
222
|
-
page_positions = await get_file_page_positions(field_obj)
|
223
|
-
|
224
|
-
self.indexer.apply_field_metadata(
|
225
|
-
field_id,
|
226
|
-
field_metadata,
|
227
|
-
page_positions=page_positions,
|
228
|
-
extracted_text=await field_obj.get_extracted_text(),
|
229
|
-
basic_user_field_metadata=user_field_metadata,
|
230
|
-
replace_field=True,
|
221
|
+
if not self.has_index_message_v2_feature():
|
222
|
+
# TODO: Remove this when the old indexer is removed
|
223
|
+
# All modified field metadata should be indexed
|
224
|
+
# TODO: could be improved to only index the diff
|
225
|
+
for user_field_metadata in self.basic.fieldmetadata:
|
226
|
+
field_id = self.generate_field_id(fieldmetadata.field)
|
227
|
+
field_obj = await self.get_field(
|
228
|
+
fieldmetadata.field.field, fieldmetadata.field.field_type
|
231
229
|
)
|
230
|
+
field_metadata = await field_obj.get_field_metadata()
|
231
|
+
if field_metadata is not None:
|
232
|
+
page_positions: Optional[FilePagePositions] = None
|
233
|
+
if isinstance(field_obj, File):
|
234
|
+
page_positions = await get_file_page_positions(field_obj)
|
235
|
+
|
236
|
+
self.indexer.apply_field_metadata(
|
237
|
+
field_id,
|
238
|
+
field_metadata,
|
239
|
+
page_positions=page_positions,
|
240
|
+
extracted_text=await field_obj.get_extracted_text(),
|
241
|
+
basic_user_field_metadata=user_field_metadata,
|
242
|
+
replace_field=True,
|
243
|
+
)
|
232
244
|
|
233
245
|
# Some basic fields are computed off field metadata.
|
234
246
|
# This means we need to recompute upon field deletions.
|
@@ -300,7 +312,7 @@ class Resource:
|
|
300
312
|
self.modified = True
|
301
313
|
self.user_relations = payload
|
302
314
|
|
303
|
-
@processor_observer.wrap({"type": "
|
315
|
+
@processor_observer.wrap({"type": "generate_index_message_old"})
|
304
316
|
async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
|
305
317
|
brain = ResourceBrain(rid=self.uuid)
|
306
318
|
basic = await self.get_basic()
|
@@ -432,8 +444,10 @@ class Resource:
|
|
432
444
|
if field in self.all_fields_keys:
|
433
445
|
self.all_fields_keys.remove(field)
|
434
446
|
|
435
|
-
|
436
|
-
self.
|
447
|
+
# TODO: Remove this when we remove the old indexer
|
448
|
+
if not self.has_index_message_v2_feature():
|
449
|
+
field_key = self.generate_field_id(FieldID(field_type=type, field=key))
|
450
|
+
self.indexer.delete_field(field_key=field_key)
|
437
451
|
|
438
452
|
await field_obj.delete()
|
439
453
|
|
@@ -814,34 +828,37 @@ class Resource:
|
|
814
828
|
load=False,
|
815
829
|
)
|
816
830
|
metadata = await field_obj.set_field_metadata(field_metadata)
|
817
|
-
field_key = self.generate_field_id(field_metadata.field)
|
818
831
|
|
819
|
-
|
820
|
-
if
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
(
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
+
# TODO: Remove this when we remove the old indexer
|
833
|
+
if not self.has_index_message_v2_feature():
|
834
|
+
field_key = self.generate_field_id(field_metadata.field)
|
835
|
+
|
836
|
+
page_positions: Optional[FilePagePositions] = None
|
837
|
+
if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
|
838
|
+
page_positions = await get_file_page_positions(field_obj)
|
839
|
+
|
840
|
+
user_field_metadata = next(
|
841
|
+
(
|
842
|
+
fm
|
843
|
+
for fm in self.basic.fieldmetadata
|
844
|
+
if fm.field.field == field_metadata.field.field
|
845
|
+
and fm.field.field_type == field_metadata.field.field_type
|
846
|
+
),
|
847
|
+
None,
|
848
|
+
)
|
832
849
|
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
850
|
+
extracted_text = await field_obj.get_extracted_text()
|
851
|
+
apply_field_metadata = partial(
|
852
|
+
self.indexer.apply_field_metadata,
|
853
|
+
field_key,
|
854
|
+
metadata,
|
855
|
+
page_positions=page_positions,
|
856
|
+
extracted_text=extracted_text,
|
857
|
+
basic_user_field_metadata=user_field_metadata,
|
858
|
+
replace_field=True,
|
859
|
+
)
|
860
|
+
loop = asyncio.get_running_loop()
|
861
|
+
await loop.run_in_executor(_executor, apply_field_metadata)
|
845
862
|
|
846
863
|
maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
|
847
864
|
|
@@ -895,8 +912,11 @@ class Resource:
|
|
895
912
|
if vo is None:
|
896
913
|
raise AttributeError("Vector object not found on set_vectors")
|
897
914
|
|
898
|
-
|
915
|
+
if self.has_index_message_v2_feature():
|
916
|
+
continue
|
899
917
|
|
918
|
+
# TODO: Remove this when we remove the old indexer
|
919
|
+
# Prepare vectors to be indexed
|
900
920
|
field_key = self.generate_field_id(field_vectors.field)
|
901
921
|
dimension = vectorset.vectorset_index_config.vector_dimension
|
902
922
|
if not dimension:
|
nucliadb/ingest/orm/utils.py
CHANGED
@@ -20,8 +20,7 @@
|
|
20
20
|
import urllib.parse
|
21
21
|
from typing import Sequence
|
22
22
|
|
23
|
-
from nucliadb.
|
24
|
-
from nucliadb_models.text import PushTextFormat, Text
|
23
|
+
from nucliadb.models.internal.processing import PushPayload, PushTextFormat, Text
|
25
24
|
from nucliadb_protos.resources_pb2 import (
|
26
25
|
ExtractedTextWrapper,
|
27
26
|
FieldComputedMetadataWrapper,
|
nucliadb/ingest/processing.py
CHANGED
@@ -25,15 +25,14 @@ import uuid
|
|
25
25
|
from collections import defaultdict
|
26
26
|
from contextlib import AsyncExitStack
|
27
27
|
from enum import Enum
|
28
|
-
from typing import
|
28
|
+
from typing import Any, Optional, TypeVar
|
29
29
|
|
30
30
|
import aiohttp
|
31
31
|
import backoff
|
32
32
|
import jwt
|
33
|
-
from pydantic import BaseModel, Field
|
34
33
|
|
35
34
|
import nucliadb_models as models
|
36
|
-
from
|
35
|
+
from nucliadb.models.internal.processing import ClassificationLabel, ProcessingInfo, PushPayload
|
37
36
|
from nucliadb_models.resource import QueueType
|
38
37
|
from nucliadb_protos.resources_pb2 import CloudFile
|
39
38
|
from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
|
@@ -52,10 +51,6 @@ logger = logging.getLogger(__name__)
|
|
52
51
|
|
53
52
|
_T = TypeVar("_T")
|
54
53
|
|
55
|
-
if TYPE_CHECKING: # pragma: no cover
|
56
|
-
SourceValue = CloudFile.Source.V
|
57
|
-
else:
|
58
|
-
SourceValue = int
|
59
54
|
|
60
55
|
RETRIABLE_EXCEPTIONS = (aiohttp.client_exceptions.ClientConnectorError,)
|
61
56
|
MAX_TRIES = 4
|
@@ -71,53 +66,6 @@ processing_observer = metrics.Observer(
|
|
71
66
|
)
|
72
67
|
|
73
68
|
|
74
|
-
class Source(SourceValue, Enum): # type: ignore
|
75
|
-
HTTP = 0
|
76
|
-
INGEST = 1
|
77
|
-
|
78
|
-
|
79
|
-
class ProcessingInfo(BaseModel):
|
80
|
-
seqid: Optional[int] = None
|
81
|
-
account_seq: Optional[int] = None
|
82
|
-
queue: Optional[QueueType] = None
|
83
|
-
|
84
|
-
|
85
|
-
class PushPayload(BaseModel):
|
86
|
-
# There are multiple options of payload
|
87
|
-
uuid: str
|
88
|
-
slug: Optional[str] = None
|
89
|
-
kbid: str
|
90
|
-
source: Optional[Source] = None
|
91
|
-
userid: str
|
92
|
-
|
93
|
-
title: Optional[str] = None
|
94
|
-
|
95
|
-
genericfield: dict[str, models.Text] = {}
|
96
|
-
|
97
|
-
# New File
|
98
|
-
filefield: dict[str, str] = Field(
|
99
|
-
default={},
|
100
|
-
description="Map of each file field to the jwt token computed in ProcessingEngine methods",
|
101
|
-
)
|
102
|
-
|
103
|
-
# New Link
|
104
|
-
linkfield: dict[str, models.LinkUpload] = {}
|
105
|
-
|
106
|
-
# Diff on Text Field
|
107
|
-
textfield: dict[str, models.Text] = {}
|
108
|
-
|
109
|
-
# New conversations to process
|
110
|
-
conversationfield: dict[str, models.PushConversation] = {}
|
111
|
-
|
112
|
-
# Only internal
|
113
|
-
partition: int
|
114
|
-
|
115
|
-
# List of available processing options (with default values)
|
116
|
-
processing_options: Optional[models.PushProcessingOptions] = Field(
|
117
|
-
default_factory=models.PushProcessingOptions
|
118
|
-
)
|
119
|
-
|
120
|
-
|
121
69
|
async def start_processing_engine():
|
122
70
|
processing_engine = get_utility(Utility.PROCESSING)
|
123
71
|
if processing_engine is not None:
|
@@ -34,6 +34,7 @@ from nucliadb.ingest import SERVICE_NAME, logger
|
|
34
34
|
from nucliadb.ingest.orm.broker_message import generate_broker_message
|
35
35
|
from nucliadb.ingest.orm.entities import EntitiesManager
|
36
36
|
from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
|
37
|
+
from nucliadb.ingest.orm.index_message import get_resource_index_message
|
37
38
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
|
38
39
|
from nucliadb.ingest.orm.processor import Processor, sequence_manager
|
39
40
|
from nucliadb.ingest.orm.resource import Resource as ResourceORM
|
@@ -444,9 +445,8 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
|
|
444
445
|
kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
|
445
446
|
resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
|
446
447
|
resobj.disable_vectors = not request.reindex_vectors
|
447
|
-
|
448
|
+
index_message = await get_resource_index_message(resobj, reindex=True)
|
448
449
|
shard = await self.proc.get_or_assign_resource_shard(txn, kbobj, request.rid)
|
449
|
-
index_message = brain.brain
|
450
450
|
external_index_manager = await get_external_index_manager(kbid=request.kbid)
|
451
451
|
if external_index_manager is not None:
|
452
452
|
await self.proc.external_index_add_resource(
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
|
22
|
+
# Processing classes (those sent to the push endpoints)
|
23
|
+
|
24
|
+
|
25
|
+
from datetime import datetime
|
26
|
+
from enum import Enum
|
27
|
+
from typing import TYPE_CHECKING, Optional
|
28
|
+
|
29
|
+
from pydantic import BaseModel, Field
|
30
|
+
|
31
|
+
from nucliadb_models.processing import PushProcessingOptions
|
32
|
+
from nucliadb_models.resource import QueueType
|
33
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
34
|
+
|
35
|
+
if TYPE_CHECKING: # pragma: no cover
|
36
|
+
SourceValue = CloudFile.Source.V
|
37
|
+
else:
|
38
|
+
SourceValue = int
|
39
|
+
|
40
|
+
|
41
|
+
class ClassificationLabel(BaseModel):
|
42
|
+
"""
|
43
|
+
NOTE: This model is used to send the labels of each field in the processing requests.
|
44
|
+
It is not meant to be used by API users.
|
45
|
+
"""
|
46
|
+
|
47
|
+
labelset: str
|
48
|
+
label: str
|
49
|
+
|
50
|
+
def __hash__(self):
|
51
|
+
return hash((self.labelset, self.label))
|
52
|
+
|
53
|
+
|
54
|
+
class PushTextFormat(int, Enum):
|
55
|
+
PLAIN = 0
|
56
|
+
HTML = 1
|
57
|
+
MARKDOWN = 2
|
58
|
+
RST = 3
|
59
|
+
JSON = 4
|
60
|
+
KEEP_MARKDOWN = 5
|
61
|
+
JSONL = 6
|
62
|
+
PLAIN_BLANKLINE_SPLIT = 7
|
63
|
+
|
64
|
+
|
65
|
+
class Text(BaseModel):
|
66
|
+
body: str
|
67
|
+
format: PushTextFormat
|
68
|
+
extract_strategy: Optional[str] = None
|
69
|
+
classification_labels: list[ClassificationLabel] = []
|
70
|
+
|
71
|
+
|
72
|
+
class LinkUpload(BaseModel):
|
73
|
+
link: str
|
74
|
+
headers: dict[str, str] = {}
|
75
|
+
cookies: dict[str, str] = {}
|
76
|
+
localstorage: dict[str, str] = {}
|
77
|
+
css_selector: Optional[str] = Field(
|
78
|
+
None,
|
79
|
+
title="Css selector",
|
80
|
+
description="Css selector to parse the link",
|
81
|
+
)
|
82
|
+
xpath: Optional[str] = Field(
|
83
|
+
None,
|
84
|
+
title="Xpath",
|
85
|
+
description="Xpath to parse the link",
|
86
|
+
)
|
87
|
+
extract_strategy: Optional[str] = None
|
88
|
+
classification_labels: list[ClassificationLabel] = []
|
89
|
+
|
90
|
+
|
91
|
+
class PushMessageFormat(int, Enum):
|
92
|
+
PLAIN = 0
|
93
|
+
HTML = 1
|
94
|
+
MARKDOWN = 2
|
95
|
+
RST = 3
|
96
|
+
JSON = 4
|
97
|
+
|
98
|
+
|
99
|
+
class PushMessageContent(BaseModel):
|
100
|
+
text: Optional[str] = None
|
101
|
+
format: PushMessageFormat
|
102
|
+
attachments: list[str] = []
|
103
|
+
|
104
|
+
|
105
|
+
class PushMessage(BaseModel):
|
106
|
+
timestamp: Optional[datetime] = None
|
107
|
+
who: Optional[str] = None
|
108
|
+
to: list[str] = []
|
109
|
+
content: PushMessageContent
|
110
|
+
ident: str
|
111
|
+
|
112
|
+
|
113
|
+
class PushConversation(BaseModel):
|
114
|
+
messages: list[PushMessage] = []
|
115
|
+
extract_strategy: Optional[str] = None
|
116
|
+
classification_labels: list[ClassificationLabel] = []
|
117
|
+
|
118
|
+
|
119
|
+
class Source(SourceValue, Enum): # type: ignore
|
120
|
+
HTTP = 0
|
121
|
+
INGEST = 1
|
122
|
+
|
123
|
+
|
124
|
+
class ProcessingInfo(BaseModel):
|
125
|
+
seqid: Optional[int] = None
|
126
|
+
account_seq: Optional[int] = None
|
127
|
+
queue: Optional[QueueType] = None
|
128
|
+
|
129
|
+
|
130
|
+
class PushPayload(BaseModel):
|
131
|
+
uuid: str
|
132
|
+
slug: Optional[str] = None
|
133
|
+
kbid: str
|
134
|
+
source: Optional[Source] = None
|
135
|
+
userid: str
|
136
|
+
|
137
|
+
title: Optional[str] = None
|
138
|
+
|
139
|
+
genericfield: dict[str, Text] = {}
|
140
|
+
|
141
|
+
# New File
|
142
|
+
filefield: dict[str, str] = Field(
|
143
|
+
default={},
|
144
|
+
description="Map of each file field to the jwt token computed in ProcessingEngine methods",
|
145
|
+
)
|
146
|
+
|
147
|
+
# New Link
|
148
|
+
linkfield: dict[str, LinkUpload] = {}
|
149
|
+
|
150
|
+
# Diff on Text Field
|
151
|
+
textfield: dict[str, Text] = {}
|
152
|
+
|
153
|
+
# New conversations to process
|
154
|
+
conversationfield: dict[str, PushConversation] = {}
|
155
|
+
|
156
|
+
# Only internal
|
157
|
+
partition: int
|
158
|
+
|
159
|
+
# List of available processing options (with default values)
|
160
|
+
processing_options: Optional[PushProcessingOptions] = Field(default_factory=PushProcessingOptions)
|
nucliadb/writer/api/v1/field.py
CHANGED
@@ -27,7 +27,7 @@ from starlette.requests import Request
|
|
27
27
|
import nucliadb_models as models
|
28
28
|
from nucliadb.common.maindb.utils import get_driver
|
29
29
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
30
|
-
from nucliadb.
|
30
|
+
from nucliadb.models.internal.processing import PushPayload, Source
|
31
31
|
from nucliadb.writer import SERVICE_NAME
|
32
32
|
from nucliadb.writer.api.constants import (
|
33
33
|
X_FILE_PASSWORD,
|
@@ -33,7 +33,7 @@ from nucliadb.common.maindb.driver import Driver
|
|
33
33
|
from nucliadb.common.maindb.exceptions import ConflictError, NotFoundError
|
34
34
|
from nucliadb.common.maindb.utils import get_driver
|
35
35
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
36
|
-
from nucliadb.
|
36
|
+
from nucliadb.models.internal.processing import ProcessingInfo, PushPayload, Source
|
37
37
|
from nucliadb.writer import SERVICE_NAME, logger
|
38
38
|
from nucliadb.writer.api.constants import X_NUCLIADB_USER, X_SKIP_STORE
|
39
39
|
from nucliadb.writer.api.v1 import transaction
|
@@ -616,6 +616,7 @@ def needs_resource_reindex(item: UpdateResourcePayload) -> bool:
|
|
616
616
|
or item.origin.metadata is not None
|
617
617
|
)
|
618
618
|
)
|
619
|
+
or item.security is not None
|
619
620
|
)
|
620
621
|
|
621
622
|
|
nucliadb/writer/api/v1/upload.py
CHANGED
@@ -33,7 +33,7 @@ from starlette.requests import Request as StarletteRequest
|
|
33
33
|
|
34
34
|
from nucliadb.common import datamanagers
|
35
35
|
from nucliadb.ingest.orm.utils import set_title
|
36
|
-
from nucliadb.
|
36
|
+
from nucliadb.models.internal.processing import PushPayload, Source
|
37
37
|
from nucliadb.models.responses import HTTPClientError
|
38
38
|
from nucliadb.writer import SERVICE_NAME
|
39
39
|
from nucliadb.writer.api.constants import X_EXTRACT_STRATEGY, X_FILENAME, X_LANGUAGE, X_MD5, X_PASSWORD
|
@@ -28,16 +28,15 @@ from nucliadb.common.models_utils.from_proto import (
|
|
28
28
|
RelationTypeMap,
|
29
29
|
)
|
30
30
|
from nucliadb.ingest.orm.utils import set_title
|
31
|
-
from nucliadb.
|
31
|
+
from nucliadb.models.internal.processing import ClassificationLabel, PushPayload, PushTextFormat, Text
|
32
32
|
from nucliadb_models.content_types import GENERIC_MIME_TYPE
|
33
33
|
from nucliadb_models.file import FileField
|
34
|
-
from nucliadb_models.labels import ClassificationLabel
|
35
34
|
from nucliadb_models.link import LinkField
|
36
35
|
from nucliadb_models.metadata import (
|
37
36
|
ParagraphAnnotation,
|
38
37
|
QuestionAnswerAnnotation,
|
39
38
|
)
|
40
|
-
from nucliadb_models.text import TEXT_FORMAT_TO_MIMETYPE
|
39
|
+
from nucliadb_models.text import TEXT_FORMAT_TO_MIMETYPE
|
41
40
|
from nucliadb_models.writer import (
|
42
41
|
ComingResourcePayload,
|
43
42
|
CreateResourcePayload,
|
@@ -29,13 +29,12 @@ from nucliadb.common.maindb.driver import Transaction
|
|
29
29
|
from nucliadb.common.models_utils import from_proto, to_proto
|
30
30
|
from nucliadb.ingest.fields.conversation import Conversation
|
31
31
|
from nucliadb.ingest.orm.resource import Resource as ORMResource
|
32
|
-
from nucliadb.
|
32
|
+
from nucliadb.models.internal import processing as processing_models
|
33
|
+
from nucliadb.models.internal.processing import ClassificationLabel, PushConversation, PushPayload
|
33
34
|
from nucliadb.writer import SERVICE_NAME
|
34
35
|
from nucliadb.writer.utilities import get_processing
|
35
36
|
from nucliadb_models.common import FieldTypeName
|
36
37
|
from nucliadb_models.content_types import GENERIC_MIME_TYPE
|
37
|
-
from nucliadb_models.conversation import PushConversation
|
38
|
-
from nucliadb_models.labels import ClassificationLabel
|
39
38
|
from nucliadb_models.writer import (
|
40
39
|
CreateResourcePayload,
|
41
40
|
UpdateResourcePayload,
|
@@ -134,7 +133,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
134
133
|
including_default_value_fields=True,
|
135
134
|
)
|
136
135
|
parsed_link["link"] = parsed_link.pop("uri", None)
|
137
|
-
toprocess.linkfield[field_id] =
|
136
|
+
toprocess.linkfield[field_id] = processing_models.LinkUpload(**parsed_link)
|
138
137
|
toprocess.linkfield[field_id].classification_labels = classif_labels
|
139
138
|
|
140
139
|
if field_type_name is FieldTypeName.TEXT:
|
@@ -143,8 +142,8 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
143
142
|
preserving_proto_field_name=True,
|
144
143
|
including_default_value_fields=True,
|
145
144
|
)
|
146
|
-
parsed_text["format"] =
|
147
|
-
toprocess.textfield[field_id] =
|
145
|
+
parsed_text["format"] = processing_models.PushTextFormat[parsed_text["format"]]
|
146
|
+
toprocess.textfield[field_id] = processing_models.Text(**parsed_text)
|
148
147
|
toprocess.textfield[field_id].classification_labels = classif_labels
|
149
148
|
|
150
149
|
if field_type_name is FieldTypeName.CONVERSATION and isinstance(field, Conversation):
|
@@ -174,7 +173,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
174
173
|
parsed_message["content"]["format"] = resources_pb2.MessageContent.Format.Value(
|
175
174
|
parsed_message["content"]["format"]
|
176
175
|
)
|
177
|
-
full_conversation.messages.append(
|
176
|
+
full_conversation.messages.append(processing_models.PushMessage(**parsed_message))
|
178
177
|
toprocess.conversationfield[field_id] = full_conversation
|
179
178
|
toprocess.conversationfield[field_id].classification_labels = classif_labels
|
180
179
|
|
@@ -247,9 +246,9 @@ def parse_text_field(
|
|
247
246
|
etw.field.field_type = resources_pb2.FieldType.TEXT
|
248
247
|
etw.body.text = text_field.body
|
249
248
|
writer.extracted_text.append(etw)
|
250
|
-
toprocess.textfield[key] =
|
249
|
+
toprocess.textfield[key] = processing_models.Text(
|
251
250
|
body=text_field.body,
|
252
|
-
format=getattr(
|
251
|
+
format=getattr(processing_models.PushTextFormat, text_field.format.value),
|
253
252
|
extract_strategy=text_field.extract_strategy,
|
254
253
|
classification_labels=classif_labels,
|
255
254
|
)
|
@@ -393,7 +392,7 @@ def parse_link_field(
|
|
393
392
|
if link_field.extract_strategy is not None:
|
394
393
|
writer.links[key].extract_strategy = link_field.extract_strategy
|
395
394
|
|
396
|
-
toprocess.linkfield[key] =
|
395
|
+
toprocess.linkfield[key] = processing_models.LinkUpload(
|
397
396
|
link=link_field.uri,
|
398
397
|
headers=link_field.headers or {},
|
399
398
|
cookies=link_field.cookies or {},
|
@@ -424,7 +423,7 @@ async def parse_conversation_field(
|
|
424
423
|
storage = await get_storage(service_name=SERVICE_NAME)
|
425
424
|
processing = get_processing()
|
426
425
|
field_value = resources_pb2.Conversation()
|
427
|
-
convs =
|
426
|
+
convs = processing_models.PushConversation()
|
428
427
|
for message in conversation_field.messages:
|
429
428
|
cm = resources_pb2.Message()
|
430
429
|
if message.timestamp:
|
@@ -437,9 +436,9 @@ async def parse_conversation_field(
|
|
437
436
|
if message.type_ is not None:
|
438
437
|
cm.type = resources_pb2.Message.MessageType.Value(message.type_.value)
|
439
438
|
|
440
|
-
processing_message_content =
|
439
|
+
processing_message_content = processing_models.PushMessageContent(
|
441
440
|
text=message.content.text,
|
442
|
-
format=getattr(
|
441
|
+
format=getattr(processing_models.PushMessageFormat, message.content.format.value),
|
443
442
|
)
|
444
443
|
|
445
444
|
cm.content.text = message.content.text
|
@@ -472,7 +471,7 @@ async def parse_conversation_field(
|
|
472
471
|
await processing.convert_internal_cf_to_str(cf_conv_field, storage)
|
473
472
|
)
|
474
473
|
|
475
|
-
processing_message =
|
474
|
+
processing_message = processing_models.PushMessage(
|
476
475
|
timestamp=message.timestamp,
|
477
476
|
content=processing_message_content,
|
478
477
|
ident=message.ident,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.3.5.post3985
|
3
|
+
Version: 6.3.5.post3995
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License: AGPL
|
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
21
21
|
Requires-Python: <4,>=3.9
|
22
22
|
Description-Content-Type: text/markdown
|
23
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.3.5.
|
24
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.
|
25
|
-
Requires-Dist: nucliadb-protos>=6.3.5.
|
26
|
-
Requires-Dist: nucliadb-models>=6.3.5.
|
27
|
-
Requires-Dist: nidx-protos>=6.3.5.
|
23
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.3.5.post3995
|
24
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.post3995
|
25
|
+
Requires-Dist: nucliadb-protos>=6.3.5.post3995
|
26
|
+
Requires-Dist: nucliadb-models>=6.3.5.post3995
|
27
|
+
Requires-Dist: nidx-protos>=6.3.5.post3995
|
28
28
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
29
29
|
Requires-Dist: nuclia-models>=0.24.2
|
30
30
|
Requires-Dist: uvicorn[standard]
|