nucliadb 6.3.5.post3914__py3-none-any.whl → 6.3.5.post3922__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -85,15 +85,6 @@ def user_field_metadata(message: resources_pb2.UserFieldMetadata) -> UserFieldMe
85
85
  including_default_value_fields=True,
86
86
  use_integers_for_enums=True,
87
87
  )
88
- value["selections"] = [
89
- MessageToDict(
90
- selections,
91
- preserving_proto_field_name=True,
92
- including_default_value_fields=True,
93
- use_integers_for_enums=True,
94
- )
95
- for selections in message.page_selections
96
- ]
97
88
  value["field"]["field_type"] = field_type_name(value["field"]["field_type"]).value
98
89
  return UserFieldMetadata(**value)
99
90
 
@@ -606,20 +606,6 @@ class ResourceBrain:
606
606
  )
607
607
 
608
608
  if basic_user_fieldmetadata is not None:
609
- for token in basic_user_fieldmetadata.token:
610
- if token.cancelled_by_user is False:
611
- labels["e"].add(f"{token.klass}/{token.token}")
612
- relation_node_entity = RelationNode(
613
- value=token.token,
614
- ntype=RelationNode.NodeType.ENTITY,
615
- subtype=token.klass,
616
- )
617
- rel = Relation(
618
- relation=Relation.ENTITY,
619
- source=relation_node_resource,
620
- to=relation_node_entity,
621
- )
622
- self.brain.field_relations[field_key].relations.append(IndexRelation(relation=rel))
623
609
  for paragraph_annotation in basic_user_fieldmetadata.paragraphs:
624
610
  for classification in paragraph_annotation.classifications:
625
611
  if not classification.cancelled_by_user:
@@ -18,27 +18,15 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- import json
22
- from typing import Any, AsyncGenerator
21
+ from typing import AsyncGenerator
23
22
 
24
23
  from nucliadb.common.cluster.base import AbstractIndexNode
25
- from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB
26
- from nucliadb.ingest.fields.base import Field
27
- from nucliadb.ingest.orm.resource import Resource
28
- from nucliadb.train import logger
29
- from nucliadb.train.generators.utils import batchify, get_resource_from_cache_or_db
24
+ from nucliadb.train.generators.utils import batchify
30
25
  from nucliadb_protos.dataset_pb2 import (
31
26
  ImageClassification,
32
27
  ImageClassificationBatch,
33
28
  TrainSet,
34
29
  )
35
- from nucliadb_protos.nodereader_pb2 import StreamRequest
36
- from nucliadb_protos.resources_pb2 import FieldType, PageStructure, VisualSelection
37
-
38
- VISUALLY_ANNOTABLE_FIELDS = {FieldType.FILE, FieldType.LINK}
39
-
40
- # PAWLS JSON format
41
- PawlsPayload = dict[str, Any]
42
30
 
43
31
 
44
32
  def image_classification_batch_generator(
@@ -58,125 +46,8 @@ async def generate_image_classification_payloads(
58
46
  node: AbstractIndexNode,
59
47
  shard_replica_id: str,
60
48
  ) -> AsyncGenerator[ImageClassification, None]:
61
- request = StreamRequest()
62
- request.shard_id.id = shard_replica_id
63
- async for item in node.stream_get_fields(request):
64
- rid = item.uuid
65
- resource = await get_resource_from_cache_or_db(kbid, rid)
66
- if resource is None:
67
- logger.error(f"Resource {rid} does not exist on DB")
68
- return
69
-
70
- _, field_type_key, field_key = item.field.split("/")
71
- field_type = FIELD_TYPE_STR_TO_PB[field_type_key]
72
-
73
- if field_type not in VISUALLY_ANNOTABLE_FIELDS:
74
- continue
75
-
76
- field = await resource.get_field(field_key, field_type, load=True)
77
-
78
- page_selections = await get_page_selections(resource, field)
79
- if len(page_selections) == 0:
80
- # Generating a payload without annotations makes no sense
81
- continue
82
-
83
- page_structure = await get_page_structure(field)
84
-
85
- for page, (page_uri, ps) in enumerate(page_structure):
86
- pawls_payload = {
87
- "width": ps.page.width,
88
- "height": ps.page.height,
89
- "tokens": [
90
- {
91
- "x": token.x,
92
- "y": token.y,
93
- "width": token.width,
94
- "height": token.height,
95
- "text": token.text,
96
- "line": token.line,
97
- }
98
- for token in ps.tokens
99
- ],
100
- "annotations": [
101
- {
102
- "page": page,
103
- "label": {
104
- "text": selection.label,
105
- },
106
- "bounds": {
107
- "top": selection.top,
108
- "left": selection.left,
109
- "right": selection.right,
110
- "bottom": selection.bottom,
111
- },
112
- "tokens": [
113
- {
114
- "pageIndex": page,
115
- "tokenIndex": token_id,
116
- }
117
- for token_id in selection.token_ids
118
- ],
119
- }
120
- for selection in page_selections[page]
121
- ],
122
- }
123
-
124
- ic = ImageClassification()
125
- ic.page_uri = page_uri
126
- ic.selections = json.dumps(pawls_payload)
127
-
128
- yield ic
129
-
130
-
131
- async def get_page_selections(resource: Resource, field: Field) -> dict[int, list[VisualSelection]]:
132
- page_selections: dict[int, list[VisualSelection]] = {}
133
- basic = await resource.get_basic()
134
- if basic is None or basic.fieldmetadata is None:
135
- return page_selections
136
-
137
- # We assume only one fieldmetadata per field as it's implemented in
138
- # resource ingestion
139
- for fieldmetadata in basic.fieldmetadata:
140
- if (
141
- fieldmetadata.field.field == field.id
142
- and fieldmetadata.field.field_type == FIELD_TYPE_STR_TO_PB[field.type]
143
- ):
144
- for selection in fieldmetadata.page_selections:
145
- page_selections[selection.page] = selection.visual # type: ignore
146
- break
147
-
148
- return page_selections
149
-
150
-
151
- async def get_page_structure(field: Field) -> list[tuple[str, PageStructure]]:
152
- page_structures: list[tuple[str, PageStructure]] = []
153
- field_type = FIELD_TYPE_STR_TO_PB[field.type]
154
- if field_type == FieldType.FILE:
155
- fed = await field.get_file_extracted_data() # type: ignore
156
- if fed is None:
157
- return page_structures
158
-
159
- fp = fed.file_pages_previews
160
- if len(fp.pages) != len(fp.structures):
161
- field_path = f"/kb/{field.kbid}/resource/{field.resource.uuid}/file/{field.id}"
162
- logger.warning(
163
- f"File extracted data has a different number of pages and structures! ({field_path})"
164
- )
165
- return page_structures
166
- page_structures.extend(
167
- [
168
- # we expect this two field to have the same length, if not,
169
- # something went wrong while processing
170
- (fp.pages[i].uri, fp.structures[i])
171
- for i in range(len(fp.pages))
172
- ]
173
- )
174
-
175
- elif field_type == FieldType.LINK:
176
- led = await field.get_link_extracted_data() # type: ignore
177
- if led is None:
178
- return page_structures
179
-
180
- page_structures.append((led.link_image.uri, led.pdf_structure))
181
-
182
- return page_structures
49
+ # NOTE: image classifications are no longer supported, as the page selection annotations were removed
50
+ # from the API.
51
+ if False:
52
+ yield
53
+ return
@@ -109,33 +109,6 @@ async def get_field_text(
109
109
  ] = {} # Dict of entity group , with entity and list of positions in field
110
110
  split_ners[MAIN] = {}
111
111
 
112
- basic_data = await orm_resource.get_basic()
113
- invalid_tokens_split: dict[str, list[tuple[str, str, int, int]]] = {}
114
- # Check user definition of entities
115
- if basic_data is not None:
116
- for userfieldmetadata in basic_data.fieldmetadata:
117
- if (
118
- userfieldmetadata.field.field == field
119
- and userfieldmetadata.field.field_type == field_type_int
120
- ):
121
- for token in userfieldmetadata.token:
122
- if token.klass in valid_entity_groups:
123
- if token.cancelled_by_user:
124
- if token.split in (None, ""):
125
- split = MAIN
126
- else:
127
- split = token.split
128
- invalid_tokens_split[split].append(
129
- (token.klass, token.token, token.start, token.end)
130
- )
131
- else:
132
- if token.split in (None, ""):
133
- split = MAIN
134
- else:
135
- split = token.split
136
- split_ners[split].setdefault(token.klass, {}).setdefault(token.token, [])
137
- split_ners[split][token.klass][token.token].append((token.start, token.end))
138
-
139
112
  field_metadata = await field_obj.get_field_metadata()
140
113
  # Check computed definition of entities
141
114
  if field_metadata is not None:
@@ -189,17 +162,6 @@ async def get_field_text(
189
162
  for position in positions.position:
190
163
  split_ners[split][entity_group][entity].append((position.start, position.end))
191
164
 
192
- for split, invalid_tokens in invalid_tokens_split.items():
193
- for token.klass, token.token, token.start, token.end in invalid_tokens:
194
- if token.klass in split_ners.get(split, {}):
195
- if token.token in split_ners.get(split, {}).get(token.klass, {}):
196
- if (token.start, token.end) in split_ners[split][token.klass][token.token]:
197
- split_ners[split][token.klass][token.token].remove((token.start, token.end))
198
- if len(split_ners[split][token.klass][token.token]) == 0:
199
- del split_ners[split][token.klass][token.token]
200
- if len(split_ners[split][token.klass]) == 0:
201
- del split_ners[split][token.klass]
202
-
203
165
  ordered_positions: dict[str, POSITION_DICT] = {}
204
166
  for split, ners in split_ners.items():
205
167
  split_positions: dict[tuple[int, int], tuple[str, str]] = {}
@@ -52,11 +52,8 @@ from nucliadb_protos.resources_pb2 import (
52
52
  FieldComputedMetadataWrapper,
53
53
  FieldType,
54
54
  Metadata,
55
- PageSelections,
56
55
  Paragraph,
57
- TokenSplit,
58
56
  UserFieldMetadata,
59
- VisualSelection,
60
57
  )
61
58
  from nucliadb_protos.resources_pb2 import ParagraphAnnotation as PBParagraphAnnotation
62
59
  from nucliadb_protos.resources_pb2 import (
@@ -102,16 +99,6 @@ def parse_basic_modify(bm: BrokerMessage, item: ComingResourcePayload, toprocess
102
99
  if item.fieldmetadata is not None:
103
100
  for fieldmetadata in item.fieldmetadata:
104
101
  userfieldmetadata = UserFieldMetadata()
105
- for token in fieldmetadata.token:
106
- userfieldmetadata.token.append(
107
- TokenSplit(
108
- token=token.token,
109
- klass=token.klass,
110
- start=token.start,
111
- end=token.end,
112
- cancelled_by_user=token.cancelled_by_user,
113
- )
114
- )
115
102
  for paragraph in fieldmetadata.paragraphs:
116
103
  validate_classifications(paragraph)
117
104
  paragraphpb = PBParagraphAnnotation(key=paragraph.key)
@@ -125,24 +112,6 @@ def parse_basic_modify(bm: BrokerMessage, item: ComingResourcePayload, toprocess
125
112
  )
126
113
  userfieldmetadata.paragraphs.append(paragraphpb)
127
114
 
128
- for page_selections in fieldmetadata.selections:
129
- page_selections_pb = PageSelections()
130
- page_selections_pb.page = page_selections.page
131
- page_selections_pb.visual.extend(
132
- [
133
- VisualSelection(
134
- label=visual_selection.label,
135
- top=visual_selection.top,
136
- left=visual_selection.left,
137
- right=visual_selection.right,
138
- bottom=visual_selection.bottom,
139
- token_ids=visual_selection.token_ids,
140
- )
141
- for visual_selection in page_selections.visual
142
- ]
143
- )
144
- userfieldmetadata.page_selections.append(page_selections_pb)
145
-
146
115
  for qa_annotation in fieldmetadata.question_answers:
147
116
  qa_annotation_pb = build_question_answer_annotation_pb(qa_annotation)
148
117
  userfieldmetadata.question_answers.append(qa_annotation_pb)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.3.5.post3914
3
+ Version: 6.3.5.post3922
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.3.5.post3914
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.post3914
25
- Requires-Dist: nucliadb-protos>=6.3.5.post3914
26
- Requires-Dist: nucliadb-models>=6.3.5.post3914
27
- Requires-Dist: nidx-protos>=6.3.5.post3914
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.3.5.post3922
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.5.post3922
25
+ Requires-Dist: nucliadb-protos>=6.3.5.post3922
26
+ Requires-Dist: nucliadb-models>=6.3.5.post3922
27
+ Requires-Dist: nidx-protos>=6.3.5.post3922
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn[standard]
@@ -105,7 +105,7 @@ nucliadb/common/maindb/local.py,sha256=uE9DIQX1yCNHNN8Tx4fPgSiuTtWpQhlfWkMJ8QZPa
105
105
  nucliadb/common/maindb/pg.py,sha256=FNq2clckJYj4Te-1svjQblqGoAF5OwJ5nwz2JtxD0d4,13645
106
106
  nucliadb/common/maindb/utils.py,sha256=zWLs82rWEVhpc1dYvdqTZiAcjZroB6Oo5MQaxMeFuKk,3301
107
107
  nucliadb/common/models_utils/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
108
- nucliadb/common/models_utils/from_proto.py,sha256=yYn5vg4SKgB57RKOmeqzfD8VfmjKSSr4UNhw-Vvb4vs,15891
108
+ nucliadb/common/models_utils/from_proto.py,sha256=zoQrqVsL5nfcvbuPPHoWHEt7UkTi-9uUd8f6ZqQnvU4,15614
109
109
  nucliadb/common/models_utils/to_proto.py,sha256=97JvOR_3odu50YvzLa2CERfEN3w_QPmAVcCJwJB5m5A,2438
110
110
  nucliadb/export_import/__init__.py,sha256=y-Is0Bxa8TMV6UiOW0deC_D3U465P65CQ5RjBjIWnow,932
111
111
  nucliadb/export_import/datamanager.py,sha256=xL8b0xvk45q6wx1l7J32JgPmpyjnF6fKiJi2F2B_UQY,6968
@@ -141,7 +141,7 @@ nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54
141
141
  nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJs,4172
142
142
  nucliadb/ingest/fields/text.py,sha256=tFvSQJAe0W7ePpp2_WDfLiE2yglR1OTU0Zht9acvOFw,1594
143
143
  nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
144
- nucliadb/ingest/orm/brain.py,sha256=A8H1J7Bo95sNzDgYr0_UNoemQhWOFEFz9UlYfs6ug-8,29407
144
+ nucliadb/ingest/orm/brain.py,sha256=S08SVgHukdBs4RGlFSu69xIT51YR6fcVolhJ_E3cR9w,28686
145
145
  nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjUJNftfoVw,7514
146
146
  nucliadb/ingest/orm/entities.py,sha256=a-aYuKBUQhxDKFtXOzTAkLlY_t2JiTfaptw2vt3AQDQ,14915
147
147
  nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
@@ -310,12 +310,12 @@ nucliadb/train/api/v1/trainset.py,sha256=kpnpDgiMWr1FKHZJgwH7hue5kzilA8-i9X0YHlN
310
310
  nucliadb/train/generators/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
311
311
  nucliadb/train/generators/field_classifier.py,sha256=yatj7U-LHRN5xTR6XsYz_3acIAUKTcpkNZcZaSY8MtE,3482
312
312
  nucliadb/train/generators/field_streaming.py,sha256=kjwg4VNiROVqVDN--mRd4ylLw55Sg2VYxKRDdbmpYSM,5398
313
- nucliadb/train/generators/image_classifier.py,sha256=yqdBKIHiumeDSTggbxLFiUPanjMtbWGEaWo4mg2OcqA,6704
313
+ nucliadb/train/generators/image_classifier.py,sha256=B4P88JfpjMcAZIPzlSOYaGseq5NgfssEr_Ecvlprr3g,1859
314
314
  nucliadb/train/generators/paragraph_classifier.py,sha256=0pOZYcT1cAmG7gjSD1HIUaMM5T3Ag-96iUTXRhiV8MI,2761
315
315
  nucliadb/train/generators/paragraph_streaming.py,sha256=dsM7a5hBd2iokvFuxnZhQeko4Jad6djyP2p3tevku8A,3586
316
316
  nucliadb/train/generators/question_answer_streaming.py,sha256=P7-de4W4yW2mgEQ82fF2OZVyx6QJHXezY52qDciDcmw,5680
317
317
  nucliadb/train/generators/sentence_classifier.py,sha256=DuvXfnWvLhklYR_qFGk2LqUyl2JE7CMVFwuHaPyC9Ys,5121
318
- nucliadb/train/generators/token_classifier.py,sha256=Vl14aaWoqrgYPijmvM62OjxDdANbpcbEZSZq2X2KhEo,11697
318
+ nucliadb/train/generators/token_classifier.py,sha256=0848GqoXh8ywU82cPUrkzOM53-lZ1MVCw--8yDABigY,9557
319
319
  nucliadb/train/generators/utils.py,sha256=1uSELmM4CpKy9jWp6j_u7_n_KR-udRNkes4UmPMOCcI,3907
320
320
  nucliadb/writer/__init__.py,sha256=S298mrZL3vr62OrBqi97mdLxgR5cReMlRJgnaQHZV7s,1304
321
321
  nucliadb/writer/app.py,sha256=ABBO8-u4pDAa61b3mCdD0TFhuHAYcxMkgpZSGgWARuE,2736
@@ -344,7 +344,7 @@ nucliadb/writer/api/v1/upload.py,sha256=hLMHXSaqEOE-vjKjhIupgdx8klJc3mVQp_oMwx5N
344
344
  nucliadb/writer/api/v1/vectorsets.py,sha256=F3iMViL5G95_Tns4aO2SOA0DwAzxK2_P8MXxtd_XLRE,6973
345
345
  nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
346
346
  nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
347
- nucliadb/writer/resource/basic.py,sha256=_zdAr110C7rtEzOKoBRMzPjAnQ0pAtRfGjB8qCzodvI,11767
347
+ nucliadb/writer/resource/basic.py,sha256=P2VXXXLKs43_Cd7Uvrcd-JTeuOJuUGu1Jpx8eujGi7Q,10451
348
348
  nucliadb/writer/resource/field.py,sha256=e5QGkR5ZDT1VUQgMXK7v6GGXJ2eek6jxGA0nPqjq_g4,20241
349
349
  nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
350
350
  nucliadb/writer/tus/__init__.py,sha256=huWpKnDnjsrKlBBJk30ta5vamlA-4x0TbPs_2Up8hyM,5443
@@ -356,8 +356,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
356
356
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
357
357
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
358
358
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
359
- nucliadb-6.3.5.post3914.dist-info/METADATA,sha256=_kbS1OuZddK3euMgqL96FHsWqVreZJPHIazpqsxSIzI,4301
360
- nucliadb-6.3.5.post3914.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
361
- nucliadb-6.3.5.post3914.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
362
- nucliadb-6.3.5.post3914.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
363
- nucliadb-6.3.5.post3914.dist-info/RECORD,,
359
+ nucliadb-6.3.5.post3922.dist-info/METADATA,sha256=blA0HPryUhZB_Z8QgNTv8DaGx3OSrJLDuagaaN_Ojdk,4301
360
+ nucliadb-6.3.5.post3922.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
361
+ nucliadb-6.3.5.post3922.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
362
+ nucliadb-6.3.5.post3922.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
363
+ nucliadb-6.3.5.post3922.dist-info/RECORD,,