nucliadb 6.4.0.post4313__py3-none-any.whl → 6.4.0.post4317__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -72,8 +72,8 @@ async def generate_field_streaming_payloads(
72
72
  for status in trainset.filter.status:
73
73
  request.filter.labels.append(f"/n/s/{status}")
74
74
 
75
- total = 0
76
75
  resources = set()
76
+ fields = set()
77
77
 
78
78
  async for document_item in get_nidx_searcher_client().Documents(request):
79
79
  text_labels = []
@@ -81,7 +81,6 @@ async def generate_field_streaming_payloads(
81
81
  text_labels.append(label)
82
82
 
83
83
  field_id = f"{document_item.uuid}{document_item.field}"
84
- total += 1
85
84
  resources.add(document_item.uuid)
86
85
 
87
86
  field_parts = document_item.field.split("/")
@@ -100,6 +99,15 @@ async def generate_field_streaming_payloads(
100
99
  tl.field_type = field_type
101
100
  tl.split = split
102
101
 
102
+ field_unique_key = f"{rid}/{field_type}/{field}/{split}"
103
+ if field_unique_key in fields:
104
+ # This field has already been yielded. This can happen as we are streaming directly from nidx
105
+ # and field deletions may not be reflected immediately in the index.
106
+ logger.warning(f"Duplicated field found {field_unique_key}. Skipping.", extra={"kbid": kbid})
107
+ continue
108
+
109
+ fields.add(field_unique_key)
110
+
103
111
  if trainset.exclude_text:
104
112
  tl.text.text = ""
105
113
  else:
@@ -119,11 +127,11 @@ async def generate_field_streaming_payloads(
119
127
 
120
128
  yield tl
121
129
 
122
- if total % 1000 == 0:
130
+ if len(fields) % 1000 == 0:
123
131
  logger.info(
124
132
  "Field streaming in progress",
125
133
  extra={
126
- "fields": total,
134
+ "fields": len(fields),
127
135
  "resources": len(resources),
128
136
  "kbid": kbid,
129
137
  "shard_replica_id": shard_replica_id,
@@ -133,7 +141,7 @@ async def generate_field_streaming_payloads(
133
141
  logger.info(
134
142
  "Field streaming finished",
135
143
  extra={
136
- "fields": total,
144
+ "fields": len(fields),
137
145
  "resources": len(resources),
138
146
  "kbid": kbid,
139
147
  "shard_replica_id": shard_replica_id,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.4.0.post4313
3
+ Version: 6.4.0.post4317
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4313
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4313
25
- Requires-Dist: nucliadb-protos>=6.4.0.post4313
26
- Requires-Dist: nucliadb-models>=6.4.0.post4313
27
- Requires-Dist: nidx-protos>=6.4.0.post4313
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4317
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4317
25
+ Requires-Dist: nucliadb-protos>=6.4.0.post4317
26
+ Requires-Dist: nucliadb-models>=6.4.0.post4317
27
+ Requires-Dist: nidx-protos>=6.4.0.post4317
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn[standard]
@@ -322,7 +322,7 @@ nucliadb/train/api/v1/shards.py,sha256=GJRnQe8P-7_VTIN1oxVmxlrDA08qVN7opEZdbF4Wx
322
322
  nucliadb/train/api/v1/trainset.py,sha256=kpnpDgiMWr1FKHZJgwH7hue5kzilA8-i9X0YHlNeHuU,2113
323
323
  nucliadb/train/generators/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
324
324
  nucliadb/train/generators/field_classifier.py,sha256=xUA10o9CtBtilbP3uc-8Wn_zQ0oK3BrqYGqZgxh4ZLk,3428
325
- nucliadb/train/generators/field_streaming.py,sha256=tI6vWhfLk-AVswh7rcjcO7Gg0YzS3OKMLJJ3VhDASG0,5980
325
+ nucliadb/train/generators/field_streaming.py,sha256=nje317SutX8QmHq-xwUphzUiozmzpCRfPXxhF_jFzdg,6441
326
326
  nucliadb/train/generators/image_classifier.py,sha256=BDXgyd5TGZRnzDnVRvp-qsRCuoTbTYwui3JiDIjuiDc,1736
327
327
  nucliadb/train/generators/paragraph_classifier.py,sha256=4sH3IQc7yJrlDs1C76SxFzL9N5mXWRZzJzoiF7y4dSQ,2703
328
328
  nucliadb/train/generators/paragraph_streaming.py,sha256=1xsc_IqP-1M0TzYTqu5qCvWBNp_J3Kyvnx8HVbToXmQ,3532
@@ -368,8 +368,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
368
368
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
369
369
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
370
370
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
371
- nucliadb-6.4.0.post4313.dist-info/METADATA,sha256=-Mp65qW_udL5EUZsp6CwEysmjbvuTY3u6Qbn0Bc3epI,4223
372
- nucliadb-6.4.0.post4313.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
373
- nucliadb-6.4.0.post4313.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
374
- nucliadb-6.4.0.post4313.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
375
- nucliadb-6.4.0.post4313.dist-info/RECORD,,
371
+ nucliadb-6.4.0.post4317.dist-info/METADATA,sha256=d8tAT1pIjUuErJUdwLw2yt9bgSnVJn2U7KkhldSRAZU,4223
372
+ nucliadb-6.4.0.post4317.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
373
+ nucliadb-6.4.0.post4317.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
374
+ nucliadb-6.4.0.post4317.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
375
+ nucliadb-6.4.0.post4317.dist-info/RECORD,,