nucliadb 6.4.0.post4313__py3-none-any.whl → 6.4.0.post4317__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/train/generators/field_streaming.py +13 -5
- {nucliadb-6.4.0.post4313.dist-info → nucliadb-6.4.0.post4317.dist-info}/METADATA +6 -6
- {nucliadb-6.4.0.post4313.dist-info → nucliadb-6.4.0.post4317.dist-info}/RECORD +6 -6
- {nucliadb-6.4.0.post4313.dist-info → nucliadb-6.4.0.post4317.dist-info}/WHEEL +0 -0
- {nucliadb-6.4.0.post4313.dist-info → nucliadb-6.4.0.post4317.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.4.0.post4313.dist-info → nucliadb-6.4.0.post4317.dist-info}/top_level.txt +0 -0
@@ -72,8 +72,8 @@ async def generate_field_streaming_payloads(
|
|
72
72
|
for status in trainset.filter.status:
|
73
73
|
request.filter.labels.append(f"/n/s/{status}")
|
74
74
|
|
75
|
-
total = 0
|
76
75
|
resources = set()
|
76
|
+
fields = set()
|
77
77
|
|
78
78
|
async for document_item in get_nidx_searcher_client().Documents(request):
|
79
79
|
text_labels = []
|
@@ -81,7 +81,6 @@ async def generate_field_streaming_payloads(
|
|
81
81
|
text_labels.append(label)
|
82
82
|
|
83
83
|
field_id = f"{document_item.uuid}{document_item.field}"
|
84
|
-
total += 1
|
85
84
|
resources.add(document_item.uuid)
|
86
85
|
|
87
86
|
field_parts = document_item.field.split("/")
|
@@ -100,6 +99,15 @@ async def generate_field_streaming_payloads(
|
|
100
99
|
tl.field_type = field_type
|
101
100
|
tl.split = split
|
102
101
|
|
102
|
+
field_unique_key = f"{rid}/{field_type}/{field}/{split}"
|
103
|
+
if field_unique_key in fields:
|
104
|
+
# This field has already been yielded. This can happen as we are streaming directly from nidx
|
105
|
+
# and field deletions may not be reflected immediately in the index.
|
106
|
+
logger.warning(f"Duplicated field found {field_unique_key}. Skipping.", extra={"kbid": kbid})
|
107
|
+
continue
|
108
|
+
|
109
|
+
fields.add(field_unique_key)
|
110
|
+
|
103
111
|
if trainset.exclude_text:
|
104
112
|
tl.text.text = ""
|
105
113
|
else:
|
@@ -119,11 +127,11 @@ async def generate_field_streaming_payloads(
|
|
119
127
|
|
120
128
|
yield tl
|
121
129
|
|
122
|
-
if total % 1000 == 0:
|
130
|
+
if len(fields) % 1000 == 0:
|
123
131
|
logger.info(
|
124
132
|
"Field streaming in progress",
|
125
133
|
extra={
|
126
|
-
"fields": total,
|
134
|
+
"fields": len(fields),
|
127
135
|
"resources": len(resources),
|
128
136
|
"kbid": kbid,
|
129
137
|
"shard_replica_id": shard_replica_id,
|
@@ -133,7 +141,7 @@ async def generate_field_streaming_payloads(
|
|
133
141
|
logger.info(
|
134
142
|
"Field streaming finished",
|
135
143
|
extra={
|
136
|
-
"fields": total,
|
144
|
+
"fields": len(fields),
|
137
145
|
"resources": len(resources),
|
138
146
|
"kbid": kbid,
|
139
147
|
"shard_replica_id": shard_replica_id,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.4.0.post4313
|
3
|
+
Version: 6.4.0.post4317
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License: AGPL
|
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
21
21
|
Requires-Python: <4,>=3.9
|
22
22
|
Description-Content-Type: text/markdown
|
23
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4313
|
24
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4313
|
25
|
-
Requires-Dist: nucliadb-protos>=6.4.0.post4313
|
26
|
-
Requires-Dist: nucliadb-models>=6.4.0.post4313
|
27
|
-
Requires-Dist: nidx-protos>=6.4.0.post4313
|
23
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4317
|
24
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4317
|
25
|
+
Requires-Dist: nucliadb-protos>=6.4.0.post4317
|
26
|
+
Requires-Dist: nucliadb-models>=6.4.0.post4317
|
27
|
+
Requires-Dist: nidx-protos>=6.4.0.post4317
|
28
28
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
29
29
|
Requires-Dist: nuclia-models>=0.24.2
|
30
30
|
Requires-Dist: uvicorn[standard]
|
@@ -322,7 +322,7 @@ nucliadb/train/api/v1/shards.py,sha256=GJRnQe8P-7_VTIN1oxVmxlrDA08qVN7opEZdbF4Wx
|
|
322
322
|
nucliadb/train/api/v1/trainset.py,sha256=kpnpDgiMWr1FKHZJgwH7hue5kzilA8-i9X0YHlNeHuU,2113
|
323
323
|
nucliadb/train/generators/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
324
324
|
nucliadb/train/generators/field_classifier.py,sha256=xUA10o9CtBtilbP3uc-8Wn_zQ0oK3BrqYGqZgxh4ZLk,3428
|
325
|
-
nucliadb/train/generators/field_streaming.py,sha256=
|
325
|
+
nucliadb/train/generators/field_streaming.py,sha256=nje317SutX8QmHq-xwUphzUiozmzpCRfPXxhF_jFzdg,6441
|
326
326
|
nucliadb/train/generators/image_classifier.py,sha256=BDXgyd5TGZRnzDnVRvp-qsRCuoTbTYwui3JiDIjuiDc,1736
|
327
327
|
nucliadb/train/generators/paragraph_classifier.py,sha256=4sH3IQc7yJrlDs1C76SxFzL9N5mXWRZzJzoiF7y4dSQ,2703
|
328
328
|
nucliadb/train/generators/paragraph_streaming.py,sha256=1xsc_IqP-1M0TzYTqu5qCvWBNp_J3Kyvnx8HVbToXmQ,3532
|
@@ -368,8 +368,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
368
368
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
369
369
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
370
370
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
371
|
-
nucliadb-6.4.0.
|
372
|
-
nucliadb-6.4.0.
|
373
|
-
nucliadb-6.4.0.
|
374
|
-
nucliadb-6.4.0.
|
375
|
-
nucliadb-6.4.0.
|
371
|
+
nucliadb-6.4.0.post4317.dist-info/METADATA,sha256=d8tAT1pIjUuErJUdwLw2yt9bgSnVJn2U7KkhldSRAZU,4223
|
372
|
+
nucliadb-6.4.0.post4317.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
373
|
+
nucliadb-6.4.0.post4317.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
374
|
+
nucliadb-6.4.0.post4317.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
375
|
+
nucliadb-6.4.0.post4317.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|