nucliadb 6.2.1.post2864__py3-none-any.whl → 6.2.1.post2871__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/ingest/fields/base.py +43 -18
- nucliadb/ingest/orm/brain.py +11 -21
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/knowledgebox.py +15 -4
- nucliadb/ingest/orm/resource.py +62 -396
- nucliadb/ingest/serialize.py +13 -2
- nucliadb/ingest/service/writer.py +4 -0
- nucliadb/purge/__init__.py +32 -12
- nucliadb/train/nodes.py +13 -7
- nucliadb/train/resource.py +380 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2871.dist-info}/METADATA +5 -5
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2871.dist-info}/RECORD +17 -16
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2871.dist-info}/WHEEL +0 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2871.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2871.dist-info}/top_level.txt +0 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2871.dist-info}/zip-safe +0 -0
nucliadb/train/nodes.py
CHANGED
@@ -28,6 +28,12 @@ from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG_BASE
|
|
28
28
|
from nucliadb.common.maindb.driver import Driver, Transaction
|
29
29
|
from nucliadb.ingest.orm.entities import EntitiesManager
|
30
30
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
31
|
+
from nucliadb.train.resource import (
|
32
|
+
generate_train_resource,
|
33
|
+
iterate_fields,
|
34
|
+
iterate_paragraphs,
|
35
|
+
iterate_sentences,
|
36
|
+
)
|
31
37
|
from nucliadb_protos.train_pb2 import (
|
32
38
|
GetFieldsRequest,
|
33
39
|
GetParagraphsRequest,
|
@@ -87,11 +93,11 @@ class TrainShardManager(manager.KBShardManager):
|
|
87
93
|
# Filter by uuid
|
88
94
|
resource = await kb.get(request.uuid)
|
89
95
|
if resource:
|
90
|
-
async for sentence in
|
96
|
+
async for sentence in iterate_sentences(resource, request.metadata):
|
91
97
|
yield sentence
|
92
98
|
else:
|
93
99
|
async for resource in kb.iterate_resources():
|
94
|
-
async for sentence in
|
100
|
+
async for sentence in iterate_sentences(resource, request.metadata):
|
95
101
|
yield sentence
|
96
102
|
|
97
103
|
async def kb_paragraphs(self, request: GetParagraphsRequest) -> AsyncIterator[TrainParagraph]:
|
@@ -101,11 +107,11 @@ class TrainShardManager(manager.KBShardManager):
|
|
101
107
|
# Filter by uuid
|
102
108
|
resource = await kb.get(request.uuid)
|
103
109
|
if resource:
|
104
|
-
async for paragraph in
|
110
|
+
async for paragraph in iterate_paragraphs(resource, request.metadata):
|
105
111
|
yield paragraph
|
106
112
|
else:
|
107
113
|
async for resource in kb.iterate_resources():
|
108
|
-
async for paragraph in
|
114
|
+
async for paragraph in iterate_paragraphs(resource, request.metadata):
|
109
115
|
yield paragraph
|
110
116
|
|
111
117
|
async def kb_fields(self, request: GetFieldsRequest) -> AsyncIterator[TrainField]:
|
@@ -115,11 +121,11 @@ class TrainShardManager(manager.KBShardManager):
|
|
115
121
|
# Filter by uuid
|
116
122
|
resource = await kb.get(request.uuid)
|
117
123
|
if resource:
|
118
|
-
async for field in
|
124
|
+
async for field in iterate_fields(resource, request.metadata):
|
119
125
|
yield field
|
120
126
|
else:
|
121
127
|
async for resource in kb.iterate_resources():
|
122
|
-
async for field in
|
128
|
+
async for field in iterate_fields(resource, request.metadata):
|
123
129
|
yield field
|
124
130
|
|
125
131
|
async def kb_resources(self, request: GetResourcesRequest) -> AsyncIterator[TrainResource]:
|
@@ -132,4 +138,4 @@ class TrainShardManager(manager.KBShardManager):
|
|
132
138
|
if rid is not None:
|
133
139
|
resource = await kb.get(rid.decode())
|
134
140
|
if resource is not None:
|
135
|
-
yield await
|
141
|
+
yield await generate_train_resource(resource, request.metadata)
|
@@ -0,0 +1,380 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from __future__ import annotations
|
21
|
+
|
22
|
+
from typing import AsyncIterator, MutableMapping, Optional
|
23
|
+
|
24
|
+
from nucliadb.common import datamanagers
|
25
|
+
from nucliadb.ingest.orm.resource import Resource
|
26
|
+
from nucliadb_protos.resources_pb2 import (
|
27
|
+
FieldID,
|
28
|
+
FieldMetadata,
|
29
|
+
ParagraphAnnotation,
|
30
|
+
)
|
31
|
+
from nucliadb_protos.train_pb2 import (
|
32
|
+
EnabledMetadata,
|
33
|
+
TrainField,
|
34
|
+
TrainMetadata,
|
35
|
+
TrainParagraph,
|
36
|
+
TrainResource,
|
37
|
+
TrainSentence,
|
38
|
+
)
|
39
|
+
from nucliadb_protos.train_pb2 import Position as TrainPosition
|
40
|
+
|
41
|
+
|
42
|
+
async def iterate_sentences(
    resource: Resource,
    enabled_metadata: EnabledMetadata,
) -> AsyncIterator[TrainSentence]:  # pragma: no cover
    """
    Yield one TrainSentence for every sentence listed in the field metadata of `resource`.

    All fields of the resource are visited (main metadata first, then each
    split/subfield metadata). `enabled_metadata` selects what is attached to
    each sentence: `labels` (resource, field and paragraph classifications,
    including user paragraph annotations), `entities` (only the entities
    whose text appears in the sentence) and `vector`. Sentence text is
    attached whenever the field has extracted text.

    A single TrainMetadata instance is reused and copied into every yielded
    message, so each per-item part of it is reset before it is filled again.
    """
    fields = await resource.get_fields(force=True)
    metadata = TrainMetadata()
    userdefinedparagraphclass: dict[str, ParagraphAnnotation] = {}
    if enabled_metadata.labels:
        if resource.basic is None:
            resource.basic = await resource.get_basic()
        if resource.basic is not None:
            metadata.labels.resource.extend(resource.basic.usermetadata.classifications)
            # User paragraph annotations, indexed by the paragraph key they refer to
            for fieldmetadata in resource.basic.fieldmetadata:
                for annotationparagraph in fieldmetadata.paragraphs:
                    userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

    for (type_id, field_id), field in fields.items():
        fieldid = FieldID(field_type=type_id, field=field_id)
        field_key = resource.generate_field_id(fieldid)
        fm = await field.get_field_metadata()
        extracted_text = None
        vo = None
        text = None

        if enabled_metadata.vector:
            # XXX: Given that nobody requested any particular vectorset, we'll
            # return any
            vectorset_id = None
            async with datamanagers.with_ro_transaction() as txn:
                async for vectorset_id, vs in datamanagers.vectorsets.iter(
                    txn=txn, kbid=resource.kb.kbid
                ):
                    break
            assert vectorset_id is not None, "All KBs must have at least a vectorset"
            vo = await field.get_vectors(vectorset_id, vs.storage_key_kind)

        extracted_text = await field.get_extracted_text()

        if fm is None:
            continue

        # Main field metadata first (subfield None), then every split metadata
        field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
        field_metadatas.extend(fm.split_metadata.items())

        for subfield, field_metadata in field_metadatas:
            if enabled_metadata.labels:
                metadata.labels.ClearField("field")
                metadata.labels.field.extend(field_metadata.classifications)

            entities: dict[str, str] = {}
            if enabled_metadata.entities:
                _update_entities_dict(entities, field_metadata)

            # NOTE(review): these keys use the vector index within the whole
            # (sub)field while sentence keys below use the sentence index
            # within its paragraph -- confirm both numbering schemes agree.
            precomputed_vectors = {}
            if vo is not None:
                if subfield is not None:
                    vectors = vo.split_vectors[subfield]
                    base_vector_key = f"{resource.uuid}/{field_key}/{subfield}"
                else:
                    vectors = vo.vectors
                    base_vector_key = f"{resource.uuid}/{field_key}"
                for index, vector in enumerate(vectors.vectors):
                    vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
                    precomputed_vectors[vector_key] = vector.vector

            if extracted_text is not None:
                if subfield is not None:
                    text = extracted_text.split_text[subfield]
                else:
                    text = extracted_text.text

            for paragraph in field_metadata.paragraphs:
                if subfield is not None:
                    paragraph_key = (
                        f"{resource.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                    )
                else:
                    paragraph_key = f"{resource.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                if enabled_metadata.labels:
                    # Reset the *paragraph* labels: clearing "field" here would
                    # drop the field labels set above and let paragraph labels
                    # pile up from one paragraph to the next.
                    metadata.labels.ClearField("paragraph")
                    metadata.labels.paragraph.extend(paragraph.classifications)
                    if paragraph_key in userdefinedparagraphclass:
                        metadata.labels.paragraph.extend(
                            userdefinedparagraphclass[paragraph_key].classifications
                        )

                for index, sentence in enumerate(paragraph.sentences):
                    if subfield is not None:
                        sentence_key = f"{resource.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
                    else:
                        sentence_key = (
                            f"{resource.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
                        )

                    # Always reset per-sentence data so nothing leaks from a
                    # previous sentence or field when this one has no value.
                    metadata.ClearField("vector")
                    vector_tmp = precomputed_vectors.get(sentence_key)
                    if vector_tmp:
                        metadata.vector.extend(vector_tmp)

                    metadata.ClearField("entities")
                    metadata.ClearField("entity_positions")
                    if text is not None:
                        sentence_text = text[sentence.start : sentence.end]
                        metadata.text = sentence_text
                        if enabled_metadata.entities:
                            add_entities_to_metadata(entities, sentence_text, metadata)
                    else:
                        metadata.ClearField("text")

                    pb_sentence = TrainSentence()
                    pb_sentence.uuid = resource.uuid
                    pb_sentence.field.CopyFrom(fieldid)
                    pb_sentence.paragraph = paragraph_key
                    pb_sentence.sentence = sentence_key
                    pb_sentence.metadata.CopyFrom(metadata)
                    yield pb_sentence
|
161
|
+
|
162
|
+
|
163
|
+
async def iterate_paragraphs(
    resource: Resource, enabled_metadata: EnabledMetadata
) -> AsyncIterator[TrainParagraph]:
    """
    Yield one TrainParagraph for every paragraph listed in the field metadata of `resource`.

    All fields of the resource are visited (main metadata first, then each
    split/subfield metadata). `enabled_metadata` selects what is attached to
    each paragraph: `labels` (resource, field and paragraph classifications,
    including user paragraph annotations) and `entities` (only the entities
    whose text appears in the paragraph). Paragraph text is attached
    whenever the field has extracted text.

    A single TrainMetadata instance is reused and copied into every yielded
    message, so each per-item part of it is reset before it is filled again.
    """
    fields = await resource.get_fields(force=True)
    metadata = TrainMetadata()
    userdefinedparagraphclass: dict[str, ParagraphAnnotation] = {}
    if enabled_metadata.labels:
        if resource.basic is None:
            resource.basic = await resource.get_basic()
        if resource.basic is not None:
            metadata.labels.resource.extend(resource.basic.usermetadata.classifications)
            # User paragraph annotations, indexed by the paragraph key they refer to
            for fieldmetadata in resource.basic.fieldmetadata:
                for annotationparagraph in fieldmetadata.paragraphs:
                    userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

    for (type_id, field_id), field in fields.items():
        fieldid = FieldID(field_type=type_id, field=field_id)
        field_key = resource.generate_field_id(fieldid)
        fm = await field.get_field_metadata()
        text = None

        extracted_text = await field.get_extracted_text()

        if fm is None:
            continue

        # Main field metadata first (subfield None), then every split metadata
        field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
        field_metadatas.extend(fm.split_metadata.items())

        for subfield, field_metadata in field_metadatas:
            if enabled_metadata.labels:
                metadata.labels.ClearField("field")
                metadata.labels.field.extend(field_metadata.classifications)

            entities: dict[str, str] = {}
            if enabled_metadata.entities:
                _update_entities_dict(entities, field_metadata)

            if extracted_text is not None:
                if subfield is not None:
                    text = extracted_text.split_text[subfield]
                else:
                    text = extracted_text.text

            for paragraph in field_metadata.paragraphs:
                if subfield is not None:
                    paragraph_key = (
                        f"{resource.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                    )
                else:
                    paragraph_key = f"{resource.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                if enabled_metadata.labels:
                    metadata.labels.ClearField("paragraph")
                    metadata.labels.paragraph.extend(paragraph.classifications)

                # Always reset per-paragraph data so nothing leaks from a
                # previous paragraph or field when this one has no value.
                metadata.ClearField("entities")
                metadata.ClearField("entity_positions")
                if text is not None:
                    paragraph_text = text[paragraph.start : paragraph.end]
                    metadata.text = paragraph_text
                    if enabled_metadata.entities:
                        add_entities_to_metadata(entities, paragraph_text, metadata)
                else:
                    metadata.ClearField("text")

                # Only populated when labels are enabled (see above)
                if paragraph_key in userdefinedparagraphclass:
                    metadata.labels.paragraph.extend(
                        userdefinedparagraphclass[paragraph_key].classifications
                    )

                pb_paragraph = TrainParagraph()
                pb_paragraph.uuid = resource.uuid
                pb_paragraph.field.CopyFrom(fieldid)
                pb_paragraph.paragraph = paragraph_key
                pb_paragraph.metadata.CopyFrom(metadata)

                yield pb_paragraph
|
243
|
+
|
244
|
+
|
245
|
+
async def iterate_fields(
    resource: Resource, enabled_metadata: EnabledMetadata
) -> AsyncIterator[TrainField]:
    """
    Yield one TrainField per field metadata (main and each split) of `resource`.

    `enabled_metadata` selects what is attached to each yielded field:
    `labels` (resource and field classifications), `text` (the extracted
    text of the field or of the split) and `entities`.

    A single TrainMetadata instance is reused and copied into every yielded
    message, so each per-item part of it is reset before it is filled again.
    """
    fields = await resource.get_fields(force=True)
    metadata = TrainMetadata()
    if enabled_metadata.labels:
        if resource.basic is None:
            resource.basic = await resource.get_basic()
        if resource.basic is not None:
            metadata.labels.resource.extend(resource.basic.usermetadata.classifications)

    for (type_id, field_id), field in fields.items():
        fieldid = FieldID(field_type=type_id, field=field_id)
        fm = await field.get_field_metadata()
        extracted_text = None

        if enabled_metadata.text:
            extracted_text = await field.get_extracted_text()

        if fm is None:
            continue

        # Main field metadata first (subfield None), then every split metadata
        field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
        field_metadatas.extend(fm.split_metadata.items())

        for subfield, splitted_metadata in field_metadatas:
            if enabled_metadata.labels:
                metadata.labels.ClearField("field")
                metadata.labels.field.extend(splitted_metadata.classifications)

            if extracted_text is not None:
                if subfield is not None:
                    metadata.text = extracted_text.split_text[subfield]
                else:
                    metadata.text = extracted_text.text
            else:
                # Do not carry over the text of a previously yielded field
                metadata.ClearField("text")

            if enabled_metadata.entities:
                metadata.ClearField("entities")
                _update_entities_dict(metadata.entities, splitted_metadata)

            pb_field = TrainField()
            pb_field.uuid = resource.uuid
            pb_field.field.CopyFrom(fieldid)
            pb_field.metadata.CopyFrom(metadata)
            yield pb_field
|
291
|
+
|
292
|
+
|
293
|
+
async def generate_train_resource(
    resource: Resource, enabled_metadata: EnabledMetadata
) -> TrainResource:
    """
    Build the TrainResource message describing `resource` as a whole.

    Depending on `enabled_metadata`, the returned metadata aggregates over
    every field: `labels` (resource classifications plus the classifications
    of every field and split), `text` (extracted texts appended one after
    another, each split text preceded by a space) and `entities`.
    Title, icon, slug and timestamps are copied when the resource has
    `basic` loaded.
    """
    fields = await resource.get_fields(force=True)
    metadata = TrainMetadata()
    if enabled_metadata.labels:
        if resource.basic is None:
            resource.basic = await resource.get_basic()
        if resource.basic is not None:
            metadata.labels.resource.extend(resource.basic.usermetadata.classifications)

    metadata.labels.ClearField("field")
    metadata.ClearField("entities")

    for field in fields.values():
        fm = await field.get_field_metadata()

        extracted_text = None
        if enabled_metadata.text:
            extracted_text = await field.get_extracted_text()

        if extracted_text is not None:
            # Field text followed by every split text, each prefixed by a space
            pieces = [extracted_text.text]
            pieces.extend(f" {text}" for text in extracted_text.split_text.values())
            metadata.text += "".join(pieces)

        if fm is None:
            continue

        # Main field metadata first, then every split metadata
        for splitted_metadata in (fm.metadata, *fm.split_metadata.values()):
            if enabled_metadata.labels:
                metadata.labels.field.extend(splitted_metadata.classifications)

            if enabled_metadata.entities:
                _update_entities_dict(metadata.entities, splitted_metadata)

    pb_resource = TrainResource()
    pb_resource.uuid = resource.uuid
    basic = resource.basic
    if basic is not None:
        pb_resource.title = basic.title
        pb_resource.icon = basic.icon
        pb_resource.slug = basic.slug
        pb_resource.modified.CopyFrom(basic.modified)
        pb_resource.created.CopyFrom(basic.created)
    pb_resource.metadata.CopyFrom(metadata)
    return pb_resource
|
343
|
+
|
344
|
+
|
345
|
+
def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
    """
    Record in `metadata` the entities of `entities` that occur in `local_text`.

    `entities` maps entity text to its label. For every entity text found in
    `local_text`, `metadata.entities[text]` is set to the label and every
    non-overlapping occurrence (scanning left to right) is appended to
    `metadata.entity_positions["<label>/<text>"]` as a start/end pair
    relative to `local_text`. Entities not present in the text are skipped.
    """
    for entity_key, entity_value in entities.items():
        # An empty key is a substring of any text: it would be reported for
        # every text with meaningless zero-width positions, so ignore it.
        if not entity_key or entity_key not in local_text:
            # Add the entity only if found in text
            continue
        metadata.entities[entity_key] = entity_value

        # Add positions for the entity relative to the local text
        poskey = f"{entity_value}/{entity_key}"
        metadata.entity_positions[poskey].entity = entity_key
        start = local_text.find(entity_key)
        while start != -1:
            end = start + len(entity_key)
            metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
            # Resume after this occurrence so matches never overlap
            start = local_text.find(entity_key, end)
|
361
|
+
|
362
|
+
|
363
|
+
def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
|
364
|
+
"""
|
365
|
+
Update the entities dict with the entities from the field metadata.
|
366
|
+
Method created to ease the transition from legacy ner field to new entities field.
|
367
|
+
"""
|
368
|
+
# Data Augmentation + Processor entities
|
369
|
+
# This will overwrite entities detected from more than one data augmentation task
|
370
|
+
# TODO: Change TrainMetadata proto to accept multiple entities with the same text
|
371
|
+
entity_map = {
|
372
|
+
entity.text: entity.label
|
373
|
+
for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
|
374
|
+
for entity in entities_wrapper.entities
|
375
|
+
}
|
376
|
+
target_entites_dict.update(entity_map)
|
377
|
+
|
378
|
+
# Legacy processor entities
|
379
|
+
# TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
|
380
|
+
target_entites_dict.update(field_metadata.ner)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.2.1.
|
3
|
+
Version: 6.2.1.post2871
|
4
4
|
Home-page: https://docs.nuclia.dev/docs/management/nucliadb/intro
|
5
5
|
Author: NucliaDB Community
|
6
6
|
Author-email: nucliadb@nuclia.com
|
@@ -22,10 +22,10 @@ Classifier: Programming Language :: Python :: 3.12
|
|
22
22
|
Classifier: Programming Language :: Python :: 3 :: Only
|
23
23
|
Requires-Python: >=3.9, <4
|
24
24
|
Description-Content-Type: text/markdown
|
25
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.2.1.
|
26
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.
|
27
|
-
Requires-Dist: nucliadb-protos>=6.2.1.
|
28
|
-
Requires-Dist: nucliadb-models>=6.2.1.
|
25
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post2871
|
26
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post2871
|
27
|
+
Requires-Dist: nucliadb-protos>=6.2.1.post2871
|
28
|
+
Requires-Dist: nucliadb-models>=6.2.1.post2871
|
29
29
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
30
30
|
Requires-Dist: nucliadb-node-binding>=2.26.0
|
31
31
|
Requires-Dist: nuclia-models>=0.24.2
|
@@ -79,7 +79,7 @@ nucliadb/common/datamanagers/resources.py,sha256=5EJk7P-G4A_YiobiUexz_yuZUTuxS5z
|
|
79
79
|
nucliadb/common/datamanagers/rollover.py,sha256=c_DE3jtZusNL_9aOVjHOB9PV5OSVg7GJ5J-Ny0goHBE,7833
|
80
80
|
nucliadb/common/datamanagers/synonyms.py,sha256=zk3GEH38KF5vV_VcuL6DCg-2JwgXJfQl7Io6VPqv2cw,1566
|
81
81
|
nucliadb/common/datamanagers/utils.py,sha256=McHlXvE4P3x-bBY3pr0n8djbTDQvI1G5WusJrnRdhLA,1827
|
82
|
-
nucliadb/common/datamanagers/vectorsets.py,sha256=
|
82
|
+
nucliadb/common/datamanagers/vectorsets.py,sha256=ciYb5uD435Zo8ZbqgPUAszFW9Svp_-R2hY2FEhQ411Y,4304
|
83
83
|
nucliadb/common/external_index_providers/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
84
84
|
nucliadb/common/external_index_providers/base.py,sha256=yfPkCigT4unXFvAyzy1tXSy2UgWC481GcZAS9bdE4NI,8871
|
85
85
|
nucliadb/common/external_index_providers/exceptions.py,sha256=nDhhOIkb66hjCrBk4Spvl2vN1SuW5gbwrMCDmrdjHHE,1209
|
@@ -115,7 +115,7 @@ nucliadb/ingest/cache.py,sha256=w7jMMzamOmQ7gwXna6Dqm6isRNBVv6l5BTBlTxaYWjE,1005
|
|
115
115
|
nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE,2410
|
116
116
|
nucliadb/ingest/processing.py,sha256=gg1DqbMFwqdOsmCSGsZc2abRdYz86xOZJun9vrHOCzs,20618
|
117
117
|
nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
118
|
-
nucliadb/ingest/serialize.py,sha256=
|
118
|
+
nucliadb/ingest/serialize.py,sha256=03q9TBC9kbqbVq59SSL4ok1e3ThU0zeuYGdqY-B1V2M,15889
|
119
119
|
nucliadb/ingest/settings.py,sha256=0B-wQNa8FLqtNcQgRzh-fuIuGptM816XHcbH1NQKfmE,3050
|
120
120
|
nucliadb/ingest/utils.py,sha256=l1myURu3r8oA11dx3GpHw-gNTUc1AFX8xdPm9Lgl2rA,2275
|
121
121
|
nucliadb/ingest/consumer/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
@@ -128,7 +128,7 @@ nucliadb/ingest/consumer/service.py,sha256=EZM1sABW_7bj6j2UgKUHUuK-EGIEYnLdtPAn8
|
|
128
128
|
nucliadb/ingest/consumer/shard_creator.py,sha256=19wf-Bu_9hb_muCDVblamWuvLr09e5dMu9Id5I4-rGw,4324
|
129
129
|
nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
|
130
130
|
nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
131
|
-
nucliadb/ingest/fields/base.py,sha256=
|
131
|
+
nucliadb/ingest/fields/base.py,sha256=b6QpVPsCiDirDiYG3-yOCMaSNznJSHmQB0z6J_eDIyw,20657
|
132
132
|
nucliadb/ingest/fields/conversation.py,sha256=OcQOHvi72Pm0OyNGwxLo9gONo8f1NhwASq0_gS-E64A,7021
|
133
133
|
nucliadb/ingest/fields/exceptions.py,sha256=LBZ-lw11f42Pk-ck-NSN9mSJ2kOw-NeRwb-UE31ILTQ,1171
|
134
134
|
nucliadb/ingest/fields/file.py,sha256=1v4jLg3balUua2VmSV8hHkAwPFShTUCOzufZvIUQcQw,4740
|
@@ -136,13 +136,13 @@ nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54
|
|
136
136
|
nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJs,4172
|
137
137
|
nucliadb/ingest/fields/text.py,sha256=tFvSQJAe0W7ePpp2_WDfLiE2yglR1OTU0Zht9acvOFw,1594
|
138
138
|
nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
139
|
-
nucliadb/ingest/orm/brain.py,sha256=
|
140
|
-
nucliadb/ingest/orm/broker_message.py,sha256=
|
139
|
+
nucliadb/ingest/orm/brain.py,sha256=UND5EsNUdd7XdjScYqRqg4r_xCx3l-My8alGw5M9CWg,28398
|
140
|
+
nucliadb/ingest/orm/broker_message.py,sha256=ZEMueoGuuRKO4tHgzc0P0AM1Ls1TTYey_4UvRQf0BpY,6915
|
141
141
|
nucliadb/ingest/orm/entities.py,sha256=2PslT1FZ6yCvJtjR0UpKTSzxJrtS-C_gZx4ZTWHunTc,15759
|
142
142
|
nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
|
143
|
-
nucliadb/ingest/orm/knowledgebox.py,sha256=
|
143
|
+
nucliadb/ingest/orm/knowledgebox.py,sha256=jWRBGic3KE1NRJzvUMpsRRLL6GHu9t28WsTb2DKtNhk,24901
|
144
144
|
nucliadb/ingest/orm/metrics.py,sha256=OkwMSPKLZcKba0ZTwtTiIxwBgaLMX5ydhGieKvi2y7E,1096
|
145
|
-
nucliadb/ingest/orm/resource.py,sha256=
|
145
|
+
nucliadb/ingest/orm/resource.py,sha256=KhucZzQzUbTBUm8_9gaCqxH68Fy1Q2u804IfTcjAIIk,43970
|
146
146
|
nucliadb/ingest/orm/utils.py,sha256=vCe_9UxHu26JDFGLwQ0wH-XyzJIpQCTK-Ow9dtZR5Vg,2716
|
147
147
|
nucliadb/ingest/orm/processor/__init__.py,sha256=Aqd9wCNTvggkMkCY3WvoI8spdr94Jnqk-0iq9XpLs18,922
|
148
148
|
nucliadb/ingest/orm/processor/auditing.py,sha256=TeYhXGJRyQ7ROytbb2u8R0fIh_FYi3HgTu3S1ribY3U,4623
|
@@ -152,7 +152,7 @@ nucliadb/ingest/orm/processor/processor.py,sha256=2FxAetUvtHvg6l-24xYrmBdsyqc0RU
|
|
152
152
|
nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
|
153
153
|
nucliadb/ingest/service/__init__.py,sha256=MME_G_ERxzJR6JW_hfE2qcfXpmpH1kdG-S0a-M0qRm8,2043
|
154
154
|
nucliadb/ingest/service/exceptions.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
155
|
-
nucliadb/ingest/service/writer.py,sha256=
|
155
|
+
nucliadb/ingest/service/writer.py,sha256=aBLLpPUJLlIf-VjAczBCUrcb-zMxRZOFHXkA0QE1pgw,22952
|
156
156
|
nucliadb/middleware/__init__.py,sha256=A8NBlBuEkunCFMKpR9gnfNELsVn0Plc55BIQMbWDM8Q,2202
|
157
157
|
nucliadb/migrator/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
158
158
|
nucliadb/migrator/command.py,sha256=dKbJ1tAmP6X4lMVRSSlz351euaqs2wBPpOczLjATUes,2089
|
@@ -165,7 +165,7 @@ nucliadb/migrator/settings.py,sha256=jOUX0ZMunCXN8HpF9xXN0aunJYRhu4Vdr_ffjRIqwtw
|
|
165
165
|
nucliadb/migrator/utils.py,sha256=NgUreUvON8_nWEzTxELBMWlfV7E6-6qi-g0DMEbVEz4,2885
|
166
166
|
nucliadb/models/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
167
167
|
nucliadb/models/responses.py,sha256=qnuOoc7TrVSUnpikfTwHLKez47_DE4mSFzpxrwtqijA,1599
|
168
|
-
nucliadb/purge/__init__.py,sha256=
|
168
|
+
nucliadb/purge/__init__.py,sha256=ijcigiWz38ohXmVVwDU87aCki1BkmAIQRjDoNQ3LPRM,11647
|
169
169
|
nucliadb/purge/orphan_shards.py,sha256=fA5yqRRN-M50OIk8dkAi1_ShFVjwDYEYqzMA9dYP0eU,9227
|
170
170
|
nucliadb/reader/__init__.py,sha256=C5Efic7WlGm2U2C5WOyquMFbIj2Pojwe_8mwzVYnOzE,1304
|
171
171
|
nucliadb/reader/app.py,sha256=Se-BFTE6d1v1msLzQn4q5XIhjnSxa2ckDSHdvm7NRf8,3096
|
@@ -274,8 +274,9 @@ nucliadb/train/app.py,sha256=TiRttTvekLuZdIvi46E4HyuumDTkR4G4Luqq3fEdjes,2824
|
|
274
274
|
nucliadb/train/generator.py,sha256=0_zqWsLUHmJZl0lXhGorO5CWSkl42-k78dqb1slZ5h0,3904
|
275
275
|
nucliadb/train/lifecycle.py,sha256=aCNaRURu0ZOUJaWLTZuEjwTstnB9MuLtzxOMztQoGxc,1773
|
276
276
|
nucliadb/train/models.py,sha256=BmgmMjDsu_1Ih5JDAqo6whhume90q0ASJcDP9dkMQm8,1198
|
277
|
-
nucliadb/train/nodes.py,sha256=
|
277
|
+
nucliadb/train/nodes.py,sha256=_89ZIpBb0HnR2jejvuO6aPsgHVSGbasPWz0lkGmVnvU,5925
|
278
278
|
nucliadb/train/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
279
|
+
nucliadb/train/resource.py,sha256=3qQ_9Zdt5JAbtD-wpmt7OeDGRNKS-fQdKAuIQfznZm0,16219
|
279
280
|
nucliadb/train/run.py,sha256=evz6CKVfJOzkbHMoaYz2mTMlKjJnNOb1O8zBBWMpeBw,1400
|
280
281
|
nucliadb/train/servicer.py,sha256=scbmq8FriKsJGkOcoZB2Fg_IyIExn9Ux4W30mGDlkJQ,5728
|
281
282
|
nucliadb/train/settings.py,sha256=rrLtgdBmuthtIObLuZUaeuo4VBGU2PJRazquQbtPBeI,1383
|
@@ -338,9 +339,9 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
338
339
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
339
340
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
340
341
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
341
|
-
nucliadb-6.2.1.
|
342
|
-
nucliadb-6.2.1.
|
343
|
-
nucliadb-6.2.1.
|
344
|
-
nucliadb-6.2.1.
|
345
|
-
nucliadb-6.2.1.
|
346
|
-
nucliadb-6.2.1.
|
342
|
+
nucliadb-6.2.1.post2871.dist-info/METADATA,sha256=qE45BIhLiAzSWUFf-L8MUwB-4Wppi3k2SEmvhznEgNs,4689
|
343
|
+
nucliadb-6.2.1.post2871.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
344
|
+
nucliadb-6.2.1.post2871.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
345
|
+
nucliadb-6.2.1.post2871.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
346
|
+
nucliadb-6.2.1.post2871.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
347
|
+
nucliadb-6.2.1.post2871.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|