nucliadb 6.9.0.post5076__py3-none-any.whl → 6.9.0.post5085__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nucliadb might be problematic.
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- nucliadb/ingest/fields/conversation.py +45 -1
- nucliadb/ingest/orm/brain_v2.py +20 -2
- nucliadb/ingest/orm/index_message.py +51 -1
- nucliadb/writer/api/v1/field.py +10 -5
- nucliadb/writer/resource/field.py +29 -14
- {nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/METADATA +6 -6
- {nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/RECORD +11 -10
- {nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/top_level.txt +0 -0
migrations/0039_backfill_converation_splits_metadata.py
ADDED
@@ -0,0 +1,106 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+"""Migration #39
+
+Backfill splits metadata on conversation fields
+
+"""
+
+import logging
+from typing import cast
+
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.common.maindb.pg import PGTransaction
+from nucliadb.ingest.fields.conversation import (
+    CONVERSATION_SPLITS_METADATA,
+    Conversation,
+)
+from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
+from nucliadb.migrator.context import ExecutionContext
+from nucliadb_protos import resources_pb2
+from nucliadb_protos.resources_pb2 import SplitMetadata, SplitsMetadata
+from nucliadb_utils.storages.storage import Storage
+
+logger = logging.getLogger(__name__)
+
+
+async def migrate(context: ExecutionContext) -> None: ...
+
+
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
+    BATCH_SIZE = 100
+    start = ""
+    while True:
+        to_fix: list[tuple[str, str]] = []
+        async with context.kv_driver.rw_transaction() as txn:
+            txn = cast(PGTransaction, txn)
+            async with txn.connection.cursor() as cur:
+                # Retrieve a bunch of conversation fields
+                await cur.execute(
+                    """
+                    SELECT key FROM resources
+                    WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
+                    AND key > %s
+                    ORDER BY key
+                    LIMIT %s""",
+                    (kbid, start, BATCH_SIZE),
+                )
+                rows = await cur.fetchall()
+                if len(rows) == 0:
+                    return
+                for row in rows:
+                    key = row[0]
+                    start = key
+                    rid = key.split("/")[4]
+                    field_id = key.split("/")[7]
+                    to_fix.append((rid, field_id))
+
+        for rid, field_id in to_fix:
+            async with context.kv_driver.rw_transaction() as txn2:
+                splits_metadata = await build_splits_metadata(
+                    txn2, context.blob_storage, kbid, rid, field_id
+                )
+                splits_metadata_key = CONVERSATION_SPLITS_METADATA.format(
+                    kbid=kbid, uuid=rid, type="c", field=field_id
+                )
+                await txn2.set(splits_metadata_key, splits_metadata.SerializeToString())
+                await txn2.commit()
+
+
+async def build_splits_metadata(
+    txn: Transaction, storage: Storage, kbid: str, rid: str, field_id: str
+) -> SplitsMetadata:
+    splits_metadata = SplitsMetadata()
+    kb_orm = KnowledgeBoxORM(txn, storage, kbid)
+    resource_obj = await kb_orm.get(rid)
+    if resource_obj is None:
+        return splits_metadata
+    field_obj: Conversation = await resource_obj.get_field(
+        field_id, resources_pb2.FieldType.CONVERSATION, load=False
+    )
+    conv_metadata = await field_obj.get_metadata()
+    for i in range(1, conv_metadata.pages + 1):
+        page = await field_obj.get_value(page=i)
+        if page is None:
+            continue
+        for message in page.messages:
+            splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
+    return splits_metadata
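The backfill walks the PG maindb keyspace directly and derives the resource and field ids from the key path. A minimal sketch of that parsing; "my-kb", "res-1" and "chat1" are made-up identifiers, and the key layout is the one matched by the SQL regex above (/kbs/{kbid}/r/{uuid}/f/c/{field}):

# Splitting on "/" yields ['', 'kbs', kbid, 'r', rid, 'f', 'c', field_id],
# so index 4 is the resource uuid and index 7 the conversation field id.
key = "/kbs/my-kb/r/res-1/f/c/chat1"
parts = key.split("/")
rid, field_id = parts[4], parts[7]
assert (rid, field_id) == ("res-1", "chat1")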
nucliadb/ingest/fields/conversation.py
CHANGED
@@ -21,7 +21,7 @@ import uuid
 from typing import Any, Optional

 from nucliadb.ingest.fields.base import Field
-from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
+from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitMetadata, SplitsMetadata
 from nucliadb_protos.resources_pb2 import Conversation as PBConversation
 from nucliadb_utils.storages.storage import StorageField

@@ -30,6 +30,7 @@ MAX_CONVERSATION_MESSAGES = 50 * 1024
 PAGE_SIZE = 200

 CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
+CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"
 CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"


@@ -54,9 +55,22 @@ class Conversation(Field[PBConversation]):
     ):
         super(Conversation, self).__init__(id, resource, pb, value)
         self.value = {}
+        self._splits_metadata: Optional[SplitsMetadata] = None
         self.metadata = None

+    async def delete_value(self):
+        await self.resource.txn.delete_by_prefix(
+            CONVERSATION_METADATA.format(kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id)
+        )
+        self._splits_metadata = None
+        self.metadata = None
+        self.value.clear()
+
     async def set_value(self, payload: PBConversation):
+        if payload.replace_field:
+            # As we need to overwrite the value of the conversation, first delete any previous data.
+            await self.delete_value()
+
         metadata = await self.get_metadata()
         metadata.extract_strategy = payload.extract_strategy
         metadata.split_strategy = payload.split_strategy
@@ -72,10 +86,13 @@ class Conversation(Field[PBConversation]):
             last_page = PBConversation()
             metadata.pages += 1

+        self._splits_metadata = await self.get_splits_metadata()
+
         # Make sure message attachment files are on our region. This is needed
         # to support the hybrid-onprem deployment as the attachments must be stored
         # at the storage services of the client's premises.
         for message in payload.messages:
+            self._splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
             new_message_files = []
             for idx, file in enumerate(message.content.attachments):
                 if self.storage.needs_move(file, self.kbid):
@@ -119,6 +136,7 @@ class Conversation(Field[PBConversation]):

         # Finally, set the metadata
         await self.db_set_metadata(metadata)
+        await self.set_splits_metadata(self._splits_metadata)

     async def get_value(self, page: Optional[int] = None) -> Optional[PBConversation]:
         # If no page was requested, force fetch of metadata
@@ -205,3 +223,29 @@ class Conversation(Field[PBConversation]):
         self.metadata = payload
         self.resource.modified = True
         self._created = False
+
+    async def get_splits_metadata(self) -> SplitsMetadata:
+        if self._splits_metadata is None:
+            field_key = CONVERSATION_SPLITS_METADATA.format(
+                kbid=self.kbid,
+                uuid=self.uuid,
+                type=self.type,
+                field=self.id,
+            )
+            payload = await self.resource.txn.get(field_key)
+            if payload is None:
+                return SplitsMetadata()
+            self._splits_metadata = SplitsMetadata()
+            self._splits_metadata.ParseFromString(payload)
+        return self._splits_metadata
+
+    async def set_splits_metadata(self, payload: SplitsMetadata) -> None:
+        key = CONVERSATION_SPLITS_METADATA.format(
+            kbid=self.kbid,
+            uuid=self.uuid,
+            type=self.type,
+            field=self.id,
+        )
+        await self.resource.txn.set(key, payload.SerializeToString())
+        self._splits_metadata = payload
+        self.resource.modified = True
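Worth noting: delete_value() relies on all per-field keys sharing the CONVERSATION_METADATA prefix, so a single delete_by_prefix removes the page values and the new splits_metadata key in one pass. A small self-contained sketch of that key layout; "kb1", "r1" and "conv1" are made-up identifiers, the templates are copied from the hunk above:

# The metadata key is a strict prefix of both the page keys and the
# splits_metadata key, which is what makes the prefix delete sufficient.
CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"
CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"

prefix = CONVERSATION_METADATA.format(kbid="kb1", uuid="r1", type="c", field="conv1")
page_key = CONVERSATION_PAGE_VALUE.format(kbid="kb1", uuid="r1", type="c", field="conv1", page=1)
splits_key = CONVERSATION_SPLITS_METADATA.format(kbid="kb1", uuid="r1", type="c", field="conv1")
assert page_key.startswith(prefix) and splits_key.startswith(prefix)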
nucliadb/ingest/orm/brain_v2.py
CHANGED
@@ -217,6 +217,7 @@ class ResourceBrain:
         replace_field: bool,
         skip_paragraphs_index: Optional[bool],
         skip_texts_index: Optional[bool],
+        append_splits: Optional[set[str]] = None,
     ) -> None:
         # We need to add the extracted text to the texts section of the Resource so that
         # the paragraphs can be indexed
@@ -234,6 +235,7 @@ class ResourceBrain:
             user_field_metadata,
             replace_field=replace_field,
             skip_paragraphs=skip_paragraphs_index,
+            append_splits=append_splits,
         )

     @observer.wrap({"type": "apply_field_paragraphs"})
@@ -246,6 +248,7 @@ class ResourceBrain:
         user_field_metadata: Optional[UserFieldMetadata],
         replace_field: bool,
         skip_paragraphs: Optional[bool],
+        append_splits: Optional[set[str]] = None,
     ) -> None:
         if skip_paragraphs is not None:
             self.brain.skip_paragraphs = skip_paragraphs
@@ -254,7 +257,12 @@ class ResourceBrain:
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
         # Splits of the field
         for subfield, field_metadata in field_computed_metadata.split_metadata.items():
-            extracted_text_str = extracted_text.split_text[subfield]
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
+            if subfield not in extracted_text.split_text:
+                # No extracted text for this split
+                continue
+            extracted_text_str = extracted_text.split_text[subfield]
             for idx, paragraph in enumerate(field_metadata.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                 denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -308,7 +316,7 @@ class ResourceBrain:
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)

         # Main field
-        extracted_text_str = extracted_text.text
+        extracted_text_str = extracted_text.text
         for idx, paragraph in enumerate(field_computed_metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
             denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
@@ -496,9 +504,12 @@ class ResourceBrain:
         replace_field: bool = False,
         # cut to specific dimension if specified
         vector_dimension: Optional[int] = None,
+        append_splits: Optional[set[str]] = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
             _field_id = ids.FieldId(
                 rid=fid.rid,
                 type=fid.type,
@@ -792,3 +803,10 @@ class ParagraphPages:
         if len(self._materialized) > 0:
             return self._materialized[-1]
         return 0
+
+
+def should_skip_split_indexing(
+    split: str, replace_field: bool, append_splits: Optional[set[str]]
+) -> bool:
+    # When replacing the whole field, reindex all splits. Otherwise, we're only indexing the splits that are appended
+    return not replace_field and append_splits is not None and split not in append_splits
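The new helper's behaviour is easiest to see as a truth table. This sketch copies should_skip_split_indexing verbatim from the hunk above and exercises it with made-up split ids:

from typing import Optional

def should_skip_split_indexing(
    split: str, replace_field: bool, append_splits: Optional[set[str]]
) -> bool:
    return not replace_field and append_splits is not None and split not in append_splits

# Replacing the whole field: nothing is skipped.
assert should_skip_split_indexing("m1", True, {"m2"}) is False
# Appending {"m2"}: the previously indexed split "m1" is skipped, "m2" is indexed.
assert should_skip_split_indexing("m1", False, {"m2"}) is True
assert should_skip_split_indexing("m2", False, {"m2"}) is False
# No append information available: index everything.
assert should_skip_split_indexing("m1", False, None) is False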
nucliadb/ingest/orm/index_message.py
CHANGED
@@ -25,6 +25,7 @@ from typing import Optional, Sequence
 from nidx_protos.noderesources_pb2 import Resource as IndexMessage

 from nucliadb.common import datamanagers
+from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.orm.brain_v2 import ResourceBrain
@@ -32,6 +33,7 @@ from nucliadb.ingest.orm.metrics import index_message_observer as observer
 from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
+from nucliadb_protos.utils_pb2 import ExtractedText
 from nucliadb_protos.writer_pb2 import BrokerMessage


@@ -69,6 +71,7 @@ class IndexMessageBuilder:
         relations: bool = True,
         replace: bool = True,
         vectorset_configs: Optional[list[VectorSetConfig]] = None,
+        append_splits: Optional[set[str]] = None,
     ):
         field = await self.resource.get_field(fieldid.field, fieldid.field_type)
         extracted_text = await field.get_extracted_text()
@@ -120,6 +123,7 @@ class IndexMessageBuilder:
                 replace_field=replace_paragraphs,
                 skip_paragraphs_index=skip_paragraphs_index,
                 skip_texts_index=skip_texts_index,
+                append_splits=append_splits,
             )
         if vectors:
             assert vectorset_configs is not None
@@ -137,6 +141,7 @@ class IndexMessageBuilder:
                     vectorset=vectorset_config.vectorset_id,
                     replace_field=replace,
                     vector_dimension=dimension,
+                    append_splits=append_splits,
                 )
         if relations:
             await asyncio.to_thread(
@@ -214,6 +219,19 @@ class IndexMessageBuilder:
         for fieldid in fields_to_index:
             if fieldid in message.delete_fields:
                 continue
+
+            # For conversation fields, we only replace the full field if it is not an append messages operation.
+            # All other fields are always replaced upon modification.
+            replace_field = True
+            modified_splits = None
+            if fieldid.field_type == FieldType.CONVERSATION:
+                modified_splits = await get_bm_modified_split_ids(fieldid, message, self.resource)
+                stored_splits = await get_stored_split_ids(fieldid, self.resource)
+                is_append_messages_op = modified_splits.issubset(stored_splits) and 0 < len(
+                    modified_splits
+                ) < len(stored_splits)
+                replace_field = not is_append_messages_op
+
             await self._apply_field_index_data(
                 self.brain,
                 fieldid,
@@ -222,8 +240,9 @@ class IndexMessageBuilder:
                 paragraphs=needs_paragraphs_update(fieldid, message),
                 relations=needs_relations_update(fieldid, message),
                 vectors=needs_vectors_update(fieldid, message),
-                replace=
+                replace=replace_field,
                 vectorset_configs=vectorsets_configs,
+                append_splits=modified_splits,
             )
         return self.brain.brain

@@ -354,6 +373,37 @@ def needs_vectors_update(
     return any(field_vectors.field == field_id for field_vectors in message.field_vectors)


+async def get_bm_modified_split_ids(
+    conversation_field_id: FieldID,
+    message: BrokerMessage,
+    resource: Resource,
+) -> set[str]:
+    message_etw = next(
+        (etw for etw in message.extracted_text if etw.field == conversation_field_id), None
+    )
+    if message_etw is None:
+        return set()
+    storage = resource.storage
+    if message_etw.HasField("file"):
+        raw_payload = await storage.downloadbytescf(message_etw.file)
+        message_extracted_text = ExtractedText()
+        message_extracted_text.ParseFromString(raw_payload.read())
+        raw_payload.flush()
+    else:
+        message_extracted_text = message_etw.body
+    return set(message_extracted_text.split_text.keys())
+
+
+async def get_stored_split_ids(
+    conversation_field_id: FieldID,
+    resource: Resource,
+) -> set[str]:
+    fid = conversation_field_id
+    conv: Conversation = await resource.get_field(fid.field, fid.field_type, load=False)
+    splits_metadata = await conv.get_splits_metadata()
+    return set(splits_metadata.metadata)
+
+
 def needs_relations_update(
     field_id: FieldID,
     message: BrokerMessage,
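The append detection above reduces to a set predicate: the broker message touches a non-empty, strict subset of the splits already stored for the field. A standalone sketch of that rule with made-up split ids (modified_splits/stored_splits stand in for the results of the two helpers):

def is_append_messages_op(modified_splits: set[str], stored_splits: set[str]) -> bool:
    # Non-empty strict subset of the stored splits => append, not a full replace.
    return modified_splits.issubset(stored_splits) and 0 < len(modified_splits) < len(stored_splits)

assert is_append_messages_op({"m3"}, {"m1", "m2", "m3"}) is True   # append of m3
assert is_append_messages_op({"m1", "m2"}, {"m1", "m2"}) is False  # full replace
assert is_append_messages_op(set(), {"m1"}) is False               # nothing modified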
nucliadb/writer/api/v1/field.py
CHANGED
@@ -249,9 +249,10 @@ async def parse_conversation_field_adapter(
     writer: BrokerMessage,
     toprocess: PushPayload,
     resource_classifications: ResourceClassifications,
+    replace_field: bool = False,
 ):
     return await parse_conversation_field(
-        field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications
+        field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications, replace_field
     )


@@ -380,7 +381,9 @@ async def add_resource_field_conversation_rslug_prefix(
     field_id: FieldIdString,
     field_payload: models.InputConversationField,
 ) -> ResourceFieldAdded:
-    return await add_field_to_resource_by_slug(
+    return await add_field_to_resource_by_slug(
+        request, kbid, rslug, field_id, field_payload, replace_field=True
+    )


 @api.put(
@@ -399,7 +402,7 @@ async def add_resource_field_conversation_rid_prefix(
     field_id: FieldIdString,
     field_payload: models.InputConversationField,
 ) -> ResourceFieldAdded:
-    return await add_field_to_resource(request, kbid, rid, field_id, field_payload)
+    return await add_field_to_resource(request, kbid, rid, field_id, field_payload, replace_field=True)


 @api.put(
@@ -466,7 +469,9 @@ async def append_messages_to_conversation_field_rslug_prefix(
         field = models.InputConversationField(messages=messages)
     except pydantic.ValidationError as e:
         raise HTTPException(status_code=422, detail=str(e))
-    return await add_field_to_resource_by_slug(
+    return await add_field_to_resource_by_slug(
+        request, kbid, rslug, field_id, field, replace_field=False
+    )


 @api.put(
@@ -489,7 +494,7 @@ async def append_messages_to_conversation_field_rid_prefix(
         field = models.InputConversationField(messages=messages)
     except pydantic.ValidationError as e:
         raise HTTPException(status_code=422, detail=str(e))
-    return await add_field_to_resource(request, kbid, rid, field_id, field)
+    return await add_field_to_resource(request, kbid, rid, field_id, field, replace_field=False)


 @api.delete(
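At the HTTP level the split is: the PUT conversation-field endpoints replace the whole field (replace_field=True), while the append-messages endpoints only add to it (replace_field=False). A hedged client sketch; the URLs and message schema are assumptions inferred from the handler names above, not shown in this diff:

import requests

BASE = "http://localhost:8080/api/v1/kb/KBID/resource/RID/conversation/chat1"
msg = {"ident": "m1", "content": {"text": "hello"}}

requests.put(BASE, json={"messages": [msg]})  # replaces the whole field
requests.put(f"{BASE}/messages", json=[msg])  # appends; duplicate idents are rejected with 422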
nucliadb/writer/resource/field.py
CHANGED
@@ -228,6 +228,7 @@ async def parse_fields(
                 kbid,
                 uuid,
                 resource_classifications,
+                replace_field=True,
             )


@@ -431,19 +432,15 @@ async def parse_conversation_field(
     kbid: str,
     uuid: str,
     resource_classifications: ResourceClassifications,
+    replace_field: bool,
 ) -> None:
-
-
-
-    raise HTTPException(
-        status_code=422,
-        detail=f"Conversation fields cannot have more than {MAX_CONVERSATION_MESSAGES} messages.",
-    )
-
+    if not replace_field:
+        # Appending messages to conversation
+        await _conversation_append_checks(kbid, uuid, key, conversation_field)
     classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.CONVERSATION)
     storage = await get_storage(service_name=SERVICE_NAME)
     processing = get_processing()
-    field_value = resources_pb2.Conversation()
+    field_value = resources_pb2.Conversation(replace_field=replace_field)
     convs = processing_models.PushConversation()
     for message in conversation_field.messages:
         cm = resources_pb2.Message()
@@ -554,13 +551,31 @@ async def get_stored_resource_classifications(
     return rc


-async def
+async def _conversation_append_checks(
+    kbid: str, rid: str, field_id: str, input: models.InputConversationField
+):
     async with datamanagers.with_ro_transaction() as txn:
         resource_obj = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
         if resource_obj is None:
-            return
-
+            return
+        conv: Conversation = await resource_obj.get_field(
             field_id, resources_pb2.FieldType.CONVERSATION, load=False
         )
-
-
+
+        # Make sure that the max number of messages is not exceeded
+        current_message_count = (await conv.get_metadata()).total
+        if len(input.messages) + current_message_count > MAX_CONVERSATION_MESSAGES:
+            raise HTTPException(
+                status_code=422,
+                detail=f"Conversation fields cannot have more than {MAX_CONVERSATION_MESSAGES} messages.",
+            )
+
+        # Make sure input messages use unique idents
+        existing_message_ids = set((await conv.get_splits_metadata()).metadata.keys())
+        input_message_ids = {message.ident for message in input.messages}
+        intersection = input_message_ids.intersection(existing_message_ids)
+        if intersection != set():
+            raise HTTPException(
+                status_code=422,
+                detail=f"Message identifiers must be unique field={field_id}: {list(intersection)[:50]}",
+            )
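The two append-time validations are independent of the ORM. A minimal sketch where message_count and existing_idents stand in for conv.get_metadata().total and the stored splits-metadata keys, ValueError stands in for the HTTP 422, and MAX_CONVERSATION_MESSAGES = 50 * 1024 as in conversation.py:

MAX_CONVERSATION_MESSAGES = 50 * 1024

def check_append(new_idents: list[str], existing_idents: set[str], message_count: int) -> None:
    # Reject appends that would exceed the per-field message cap.
    if len(new_idents) + message_count > MAX_CONVERSATION_MESSAGES:
        raise ValueError("too many messages")
    # Reject appends that reuse idents already stored for the field.
    duplicated = set(new_idents) & existing_idents
    if duplicated:
        raise ValueError(f"duplicate message idents: {sorted(duplicated)[:50]}")

check_append(["m3"], {"m1", "m2"}, 2)      # ok: new ident, under the cap
try:
    check_append(["m1"], {"m1", "m2"}, 2)  # duplicate ident -> rejected
except ValueError as e:
    print(e)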
{nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.9.0.post5076
+Version: 6.9.0.post5085
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.9.0.post5076
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.0.post5076
-Requires-Dist: nucliadb-protos>=6.9.0.post5076
-Requires-Dist: nucliadb-models>=6.9.0.post5076
-Requires-Dist: nidx-protos>=6.9.0.post5076
+Requires-Dist: nucliadb-telemetry[all]>=6.9.0.post5085
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.0.post5085
+Requires-Dist: nucliadb-protos>=6.9.0.post5085
+Requires-Dist: nucliadb-models>=6.9.0.post5085
+Requires-Dist: nidx-protos>=6.9.0.post5085
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.50.0
 Requires-Dist: uvicorn[standard]
{nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/RECORD
CHANGED
@@ -34,6 +34,7 @@ migrations/0035_rollover_nidx_texts_4.py,sha256=W0_AUd01pjMpYMDC3yqF6HzDLgcnnPpr
 migrations/0036_backfill_catalog_slug.py,sha256=toYqxH_EfUFqoVn_cOdR5Fg8bWZU5BoFMfPBSf74LKU,2957
 migrations/0037_backfill_catalog_facets.py,sha256=IH7H4OZ4tzws6xEh7Qro0bPDHDYOoVViEUj-JwPPe1U,2791
 migrations/0038_backfill_catalog_field_labels.py,sha256=F519nYngJDb1Mtwf-OQpweDPWKPxAlqdxy5E-DyQrhA,3492
+migrations/0039_backfill_converation_splits_metadata.py,sha256=NtL9S6Kx8mbSjNJLjIsc-6vVNymD0YKlF9vEkGUEDds,3958
 migrations/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 migrations/pg/0001_bootstrap.py,sha256=3O_P17l0d0h48nebN6VQLXzM_B7S7zvDpaLR0koVgWE,1274
 migrations/pg/0002_catalog.py,sha256=Rsleecu351Ty19kYZgOpqX5G3MEAY8nMxCJrAeuS2Mw,1690
@@ -154,18 +155,18 @@ nucliadb/ingest/consumer/shard_creator.py,sha256=UKIk0yaS_jC_nGQqymn9NGJWzwZEqhI
 nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
 nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/ingest/fields/base.py,sha256=D8NzawonF7hivDW9zvQBbV938TKA6e2OCGqV4kS96RU,24405
-nucliadb/ingest/fields/conversation.py,sha256=
+nucliadb/ingest/fields/conversation.py,sha256=KkOvNM1rZFQRg2RsfGd3Jrz3lpx0HpGpN1cmlpz_mZw,9563
 nucliadb/ingest/fields/exceptions.py,sha256=sZBk21BSrXFdOdo1qUdCAyD-9YMYakSLdn4_WdIPCIQ,1217
 nucliadb/ingest/fields/file.py,sha256=1v4jLg3balUua2VmSV8hHkAwPFShTUCOzufZvIUQcQw,4740
 nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54Y4Ig,1547
 nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJs,4172
 nucliadb/ingest/fields/text.py,sha256=2grxo8twWbpXEd_iwUMBw9q0dWorVmlPONmY5d1ThwQ,1684
 nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/ingest/orm/brain_v2.py,sha256=
+nucliadb/ingest/orm/brain_v2.py,sha256=mBze0LBft9pPiEtTIlfwCDVk597RSBXsx0Pte4ciTTg,34456
 nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjUJNftfoVw,7514
 nucliadb/ingest/orm/entities.py,sha256=kXyeF6XOpFKhEsGLcY-GLIk21Exp0cJst4XQQ9jJoug,14791
 nucliadb/ingest/orm/exceptions.py,sha256=gsp7TtVNQPiIEh-zf_UEJClwuFU0iu-5vzj0OrKMScg,1550
-nucliadb/ingest/orm/index_message.py,sha256=
+nucliadb/ingest/orm/index_message.py,sha256=mWlpQ0-KChSVIbHewVE8sXCe-7LiPIIh0cBqr3axU8o,16554
 nucliadb/ingest/orm/knowledgebox.py,sha256=OG9dmfklYf1PgTHwQd_iFZOociLEvUSMMv1ZKeUgecE,23910
 nucliadb/ingest/orm/metrics.py,sha256=OiuggTh-n3kZHA2G73NEUdIlh8c3yFrbusI88DK-Mko,1273
 nucliadb/ingest/orm/resource.py,sha256=zQeZyZ-tCxr-DhonLobfZRkz_iEew0Y-cGfXeNNIHG0,40432
@@ -360,7 +361,7 @@ nucliadb/writer/api/constants.py,sha256=SCdqGDbEmpdczQdTfbTlpHzVjbLqccPtMQ25MPIF
 nucliadb/writer/api/utils.py,sha256=wIQHlU8RQiIGVLI72suvyVIKlCU44Unh0Ae0IiN6Qwo,1313
 nucliadb/writer/api/v1/__init__.py,sha256=akI9A_jloNLb0dU4T5zjfdyvmSAiDeIdjAlzNx74FlU,1128
 nucliadb/writer/api/v1/export_import.py,sha256=v0sU55TtRSqDzwkDgcwv2uSaqKCuQTtGcMpYoHQYBQA,8192
-nucliadb/writer/api/v1/field.py,sha256=
+nucliadb/writer/api/v1/field.py,sha256=nO3IEV6v5hokdIo5HoaecdwDqvr1PzCJlh5DafzcNTw,19130
 nucliadb/writer/api/v1/knowledgebox.py,sha256=kioqjD3yN-y1cDTgmXAAOwivXHX9NXxwblcSzGqJup0,9533
 nucliadb/writer/api/v1/learning_config.py,sha256=DTLEzKJ3dHvi8pbZscjElUqCH_ZvLc6WZgvalFqHo10,4450
 nucliadb/writer/api/v1/resource.py,sha256=IfcT6HXnR5sC5wSnQSuKmFzEWcLTh7OzZEAV4hYmXnA,20442
@@ -373,7 +374,7 @@ nucliadb/writer/api/v1/vectorsets.py,sha256=F3iMViL5G95_Tns4aO2SOA0DwAzxK2_P8MXx
 nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
 nucliadb/writer/resource/basic.py,sha256=44GK8M9EEVoAUfGiabdLrrpENqeFwNn7qwxF2AHhQGg,10504
-nucliadb/writer/resource/field.py,sha256=
+nucliadb/writer/resource/field.py,sha256=kJFxOgmizGbEuTRPb5o0cNqonZ8sa9ehVlSfRk-ektY,22866
 nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
 nucliadb/writer/tus/__init__.py,sha256=Kera0BtxoDX0ngPftXiMjNgjrhtQ3l2XFc5nJqSBOJY,5498
 nucliadb/writer/tus/azure.py,sha256=yxoRi4PhGDikTqVK3PiuVyguy8H9DOS66JpZCY4hpUY,4177
@@ -384,8 +385,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vu1BGg4VqJ_x2P1u2BxqPKlSfw5orT_a3R-Ln5oPUpU,8483
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.9.0.post5076.dist-info/METADATA,sha256=
-nucliadb-6.9.0.post5076.dist-info/WHEEL,sha256=
-nucliadb-6.9.0.post5076.dist-info/entry_points.txt,sha256=
-nucliadb-6.9.0.post5076.dist-info/top_level.txt,sha256=
-nucliadb-6.9.0.post5076.dist-info/RECORD,,
+nucliadb-6.9.0.post5085.dist-info/METADATA,sha256=hSS20ZkqaZcgSta1Oq5me923rWzrspnZ2gamC_J_c84,4158
+nucliadb-6.9.0.post5085.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nucliadb-6.9.0.post5085.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.9.0.post5085.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.9.0.post5085.dist-info/RECORD,,
{nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/WHEEL
File without changes
{nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/entry_points.txt
File without changes
{nucliadb-6.9.0.post5076.dist-info → nucliadb-6.9.0.post5085.dist-info}/top_level.txt
File without changes