nucliadb-models 6.9.3.post5346__py3-none-any.whl → 6.10.0.post5788__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb-models might be problematic. Click here for more details.

Files changed (35)
  1. nucliadb_models/agents/ingestion.py +4 -4
  2. nucliadb_models/augment.py +355 -0
  3. nucliadb_models/common.py +57 -57
  4. nucliadb_models/configuration.py +8 -8
  5. nucliadb_models/content_types.py +13 -11
  6. nucliadb_models/conversation.py +25 -26
  7. nucliadb_models/entities.py +17 -18
  8. nucliadb_models/external_index_providers.py +1 -2
  9. nucliadb_models/extracted.py +82 -83
  10. nucliadb_models/file.py +10 -11
  11. nucliadb_models/filters.py +79 -75
  12. nucliadb_models/graph/requests.py +40 -48
  13. nucliadb_models/graph/responses.py +13 -1
  14. nucliadb_models/hydration.py +50 -52
  15. nucliadb_models/internal/predict.py +7 -9
  16. nucliadb_models/internal/shards.py +2 -3
  17. nucliadb_models/labels.py +18 -11
  18. nucliadb_models/link.py +18 -19
  19. nucliadb_models/metadata.py +66 -54
  20. nucliadb_models/notifications.py +3 -3
  21. nucliadb_models/processing.py +1 -2
  22. nucliadb_models/resource.py +85 -102
  23. nucliadb_models/retrieval.py +147 -0
  24. nucliadb_models/search.py +331 -283
  25. nucliadb_models/security.py +2 -3
  26. nucliadb_models/text.py +7 -8
  27. nucliadb_models/trainset.py +1 -2
  28. nucliadb_models/utils.py +2 -3
  29. nucliadb_models/vectors.py +2 -5
  30. nucliadb_models/writer.py +56 -57
  31. {nucliadb_models-6.9.3.post5346.dist-info → nucliadb_models-6.10.0.post5788.dist-info}/METADATA +1 -1
  32. nucliadb_models-6.10.0.post5788.dist-info/RECORD +41 -0
  33. nucliadb_models-6.9.3.post5346.dist-info/RECORD +0 -39
  34. {nucliadb_models-6.9.3.post5346.dist-info → nucliadb_models-6.10.0.post5788.dist-info}/WHEEL +0 -0
  35. {nucliadb_models-6.9.3.post5346.dist-info → nucliadb_models-6.10.0.post5788.dist-info}/top_level.txt +0 -0
nucliadb_models/hydration.py CHANGED
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  #
15
- from typing import Annotated, Optional, Union
15
+ from typing import Annotated
16
16
 
17
17
  from pydantic import BaseModel, Field, StringConstraints
18
18
 
@@ -105,23 +105,23 @@ class GenericFieldHydration(BaseModel, extra="forbid"):
105
105
 
106
106
 
107
107
  class FieldHydration(BaseModel, extra="forbid"):
108
- text: Optional[TextFieldHydration] = Field(
108
+ text: TextFieldHydration | None = Field(
109
109
  default_factory=TextFieldHydration,
110
110
  description="Text fields hydration options",
111
111
  )
112
- file: Optional[FileFieldHydration] = Field(
112
+ file: FileFieldHydration | None = Field(
113
113
  default_factory=FileFieldHydration,
114
114
  description="File fields hydration options",
115
115
  )
116
- link: Optional[LinkFieldHydration] = Field(
116
+ link: LinkFieldHydration | None = Field(
117
117
  default_factory=LinkFieldHydration,
118
118
  description="Link fields hydration options",
119
119
  )
120
- conversation: Optional[ConversationFieldHydration] = Field(
120
+ conversation: ConversationFieldHydration | None = Field(
121
121
  default_factory=ConversationFieldHydration,
122
122
  description="Conversation fields hydration options",
123
123
  )
124
- generic: Optional[GenericFieldHydration] = Field(
124
+ generic: GenericFieldHydration | None = Field(
125
125
  default_factory=GenericFieldHydration,
126
126
  description="Generic fields hydration options",
127
127
  )
@@ -141,7 +141,7 @@ class NeighbourParagraphHydration(BaseModel, extra="forbid"):
141
141
 
142
142
 
143
143
  class RelatedParagraphHydration(BaseModel, extra="forbid"):
144
- neighbours: Optional[NeighbourParagraphHydration] = Field(
144
+ neighbours: NeighbourParagraphHydration | None = Field(
145
145
  default=None,
146
146
  description="Hydrate extra paragraphs that surround the original one",
147
147
  )
@@ -205,11 +205,11 @@ class ParagraphHydration(BaseModel, extra="forbid"):
205
205
  default=True,
206
206
  description="Hydrate paragraph text",
207
207
  )
208
- image: Optional[ImageParagraphHydration] = Field(
208
+ image: ImageParagraphHydration | None = Field(
209
209
  default=None,
210
210
  description="Hydrate options for paragraphs extracted from images (using OCR, inception...)",
211
211
  )
212
- table: Optional[TableParagraphHydration] = Field(
212
+ table: TableParagraphHydration | None = Field(
213
213
  default=None,
214
214
  description="Hydrate options for paragraphs extracted from tables",
215
215
  )
@@ -217,19 +217,19 @@ class ParagraphHydration(BaseModel, extra="forbid"):
217
217
  # TODO: at some point, we should add hydration options for paragraphs from
218
218
  # audio and video
219
219
 
220
- page: Optional[ParagraphPageHydration] = Field(
220
+ page: ParagraphPageHydration | None = Field(
221
221
  default=None,
222
222
  description="Hydrte options for paragraphs within a page. This applies to paragraphs in fields with pages",
223
223
  )
224
224
 
225
- related: Optional[RelatedParagraphHydration] = Field(
225
+ related: RelatedParagraphHydration | None = Field(
226
226
  default=None,
227
227
  description="Hydration options for related paragraphs. For example, neighbours or sibling paragraphs",
228
228
  )
229
229
 
230
230
 
231
231
  class Hydration(BaseModel, extra="forbid"):
232
- resource: Optional[ResourceHydration] = Field(
232
+ resource: ResourceHydration | None = Field(
233
233
  default_factory=ResourceHydration,
234
234
  description="Resource hydration options",
235
235
  )
@@ -247,8 +247,8 @@ ParagraphId = Annotated[
247
247
  str,
248
248
  StringConstraints(
249
249
  # resource-uuid/field-type/field-id/[split-id/]paragraph-id
250
- pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+/([^/]{1,128}/)?[0-9]+-[0-9]+$",
251
- min_length=32 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 3,
250
+ pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$",
251
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
252
252
  # max field id of 250 and 10 digit paragraphs. More than enough
253
253
  max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
254
254
  ),
@@ -270,22 +270,22 @@ class HydratedResource(BaseModel, extra="forbid"):
270
270
  id: str = Field(description="Unique resource id")
271
271
  slug: str = Field(description="Resource slug")
272
272
 
273
- title: Optional[str] = None
274
- summary: Optional[str] = None
273
+ title: str | None = None
274
+ summary: str | None = None
275
275
 
276
- origin: Optional[Origin] = None
276
+ origin: Origin | None = None
277
277
 
278
- security: Optional[ResourceSecurity] = None
278
+ security: ResourceSecurity | None = None
279
279
 
280
280
  # TODO: add resource labels to hydrated resources
281
281
 
282
282
 
283
283
  class FieldExtractedData(BaseModel, extra="forbid"):
284
- text: Optional[str] = None
284
+ text: str | None = None
285
285
 
286
286
 
287
287
  class SplitFieldExtractedData(BaseModel, extra="forbid"):
288
- texts: Optional[dict[str, str]] = None
288
+ texts: dict[str, str] | None = None
289
289
 
290
290
 
291
291
  class HydratedTextField(BaseModel, extra="forbid"):
@@ -293,8 +293,8 @@ class HydratedTextField(BaseModel, extra="forbid"):
293
293
  resource: str = Field("Field resource id")
294
294
  field_type: FieldTypeName = FieldTypeName.TEXT
295
295
 
296
- value: Optional[FieldText] = None
297
- extracted: Optional[FieldExtractedData] = None
296
+ value: FieldText | None = None
297
+ extracted: FieldExtractedData | None = None
298
298
 
299
299
 
300
300
  class HydratedFileField(BaseModel, extra="forbid"):
@@ -302,10 +302,10 @@ class HydratedFileField(BaseModel, extra="forbid"):
302
302
  resource: str = Field("Field resource id")
303
303
  field_type: FieldTypeName = FieldTypeName.FILE
304
304
 
305
- value: Optional[FieldFile] = None
306
- extracted: Optional[FieldExtractedData] = None
305
+ value: FieldFile | None = None
306
+ extracted: FieldExtractedData | None = None
307
307
 
308
- previews: Optional[dict[str, Image]] = Field(
308
+ previews: dict[str, Image] | None = Field(
309
309
  default=None,
310
310
  title="Previews of specific parts of the field",
311
311
  description=(
@@ -323,8 +323,8 @@ class HydratedLinkField(BaseModel, extra="forbid"):
323
323
  resource: str = Field("Field resource id")
324
324
  field_type: FieldTypeName = FieldTypeName.LINK
325
325
 
326
- value: Optional[FieldLink] = None
327
- extracted: Optional[FieldExtractedData] = None
326
+ value: FieldLink | None = None
327
+ extracted: FieldExtractedData | None = None
328
328
 
329
329
 
330
330
  class HydratedConversationField(BaseModel, extra="forbid"):
@@ -332,8 +332,8 @@ class HydratedConversationField(BaseModel, extra="forbid"):
332
332
  resource: str = Field("Field resource id")
333
333
  field_type: FieldTypeName = FieldTypeName.CONVERSATION
334
334
 
335
- value: Optional[FieldConversation] = None
336
- extracted: Optional[FieldExtractedData] = None
335
+ value: FieldConversation | None = None
336
+ extracted: FieldExtractedData | None = None
337
337
 
338
338
 
339
339
  class HydratedGenericField(BaseModel, extra="forbid"):
@@ -341,24 +341,24 @@ class HydratedGenericField(BaseModel, extra="forbid"):
341
341
  resource: str = Field("Field resource id")
342
342
  field_type: FieldTypeName = FieldTypeName.TEXT
343
343
 
344
- value: Optional[str] = None
345
- extracted: Optional[FieldExtractedData] = None
344
+ value: str | None = None
345
+ extracted: FieldExtractedData | None = None
346
346
 
347
347
 
348
348
  class RelatedNeighbourParagraphRefs(BaseModel, extra="forbid"):
349
- before: Optional[list[str]] = None
350
- after: Optional[list[str]] = None
349
+ before: list[str] | None = None
350
+ after: list[str] | None = None
351
351
 
352
352
 
353
353
  class RelatedParagraphRefs(BaseModel, extra="forbid"):
354
- neighbours: Optional[RelatedNeighbourParagraphRefs] = None
355
- parents: Optional[list[str]] = None
356
- siblings: Optional[list[str]] = None
357
- replacements: Optional[list[str]] = None
354
+ neighbours: RelatedNeighbourParagraphRefs | None = None
355
+ parents: list[str] | None = None
356
+ siblings: list[str] | None = None
357
+ replacements: list[str] | None = None
358
358
 
359
359
 
360
360
  class HydratedParagraphImage(BaseModel, extra="forbid"):
361
- source_image: Optional[Image] = Field(
361
+ source_image: Image | None = Field(
362
362
  default=None,
363
363
  description=(
364
364
  "Source image for this paragraph. This only applies to paragraphs "
@@ -369,7 +369,7 @@ class HydratedParagraphImage(BaseModel, extra="forbid"):
369
369
 
370
370
 
371
371
  class HydratedParagraphTable(BaseModel, extra="forbid"):
372
- page_preview_ref: Optional[str] = Field(
372
+ page_preview_ref: str | None = Field(
373
373
  default=None,
374
374
  description=(
375
375
  "Referento to the page preview for this paragraph. The actual "
@@ -381,7 +381,7 @@ class HydratedParagraphTable(BaseModel, extra="forbid"):
381
381
 
382
382
 
383
383
  class HydratedParagraphPage(BaseModel, extra="forbid"):
384
- page_preview_ref: Optional[str] = Field(
384
+ page_preview_ref: str | None = Field(
385
385
  default=None,
386
386
  description=(
387
387
  "Reference to the page preview for this paragraph. The actual "
@@ -398,28 +398,26 @@ class HydratedParagraph(BaseModel, extra="forbid"):
398
398
  field: str = Field(description="Paragraph field id")
399
399
  resource: str = Field(description="Paragraph resource id")
400
400
 
401
- text: Optional[str] = None
401
+ text: str | None = None
402
402
 
403
403
  # TODO: add labels to hydrated paragraphs
404
404
  # labels: Optional[list[str]] = None
405
405
 
406
- related: Optional[RelatedParagraphRefs] = None
406
+ related: RelatedParagraphRefs | None = None
407
407
 
408
- image: Optional[HydratedParagraphImage] = None
409
- table: Optional[HydratedParagraphTable] = None
410
- page: Optional[HydratedParagraphPage] = None
408
+ image: HydratedParagraphImage | None = None
409
+ table: HydratedParagraphTable | None = None
410
+ page: HydratedParagraphPage | None = None
411
411
 
412
412
 
413
413
  class Hydrated(BaseModel, extra="forbid"):
414
414
  resources: dict[str, HydratedResource]
415
415
  fields: dict[
416
416
  str,
417
- Union[
418
- HydratedTextField,
419
- HydratedFileField,
420
- HydratedLinkField,
421
- HydratedConversationField,
422
- HydratedGenericField,
423
- ],
417
+ HydratedTextField
418
+ | HydratedFileField
419
+ | HydratedLinkField
420
+ | HydratedConversationField
421
+ | HydratedGenericField,
424
422
  ]
425
423
  paragraphs: dict[str, HydratedParagraph]
nucliadb_models/internal/predict.py CHANGED
@@ -19,13 +19,11 @@ Models for Predict API v1.
19
19
  ATENTION! Keep these models in sync with models on Predict API
20
20
  """
21
21
 
22
- from typing import List, Optional
23
-
24
22
  from pydantic import BaseModel, Field
25
23
 
26
24
 
27
25
  class SentenceSearch(BaseModel):
28
- vectors: dict[str, List[float]] = Field(
26
+ vectors: dict[str, list[float]] = Field(
29
27
  default_factory=dict,
30
28
  description="Sentence vectors for each semantic model",
31
29
  min_length=1,
@@ -45,14 +43,14 @@ class Ner(BaseModel):
45
43
 
46
44
 
47
45
  class TokenSearch(BaseModel):
48
- tokens: List[Ner] = []
46
+ tokens: list[Ner] = []
49
47
  time: float
50
48
  input_tokens: int = 0
51
49
 
52
50
 
53
51
  class QueryInfo(BaseModel):
54
- language: Optional[str]
55
- stop_words: List[str] = Field(default_factory=list)
52
+ language: str | None
53
+ stop_words: list[str] = Field(default_factory=list)
56
54
  semantic_thresholds: dict[str, float] = Field(
57
55
  default_factory=dict,
58
56
  description="Semantic threshold for each semantic model",
@@ -60,10 +58,10 @@ class QueryInfo(BaseModel):
60
58
  )
61
59
  visual_llm: bool
62
60
  max_context: int
63
- entities: Optional[TokenSearch]
64
- sentence: Optional[SentenceSearch]
61
+ entities: TokenSearch | None
62
+ sentence: SentenceSearch | None
65
63
  query: str
66
- rephrased_query: Optional[str] = None
64
+ rephrased_query: str | None = None
67
65
 
68
66
 
69
67
  class RerankModel(BaseModel):
nucliadb_models/internal/shards.py CHANGED
@@ -13,7 +13,6 @@
13
13
  # limitations under the License.
14
14
  #
15
15
  from enum import Enum
16
- from typing import List, Optional
17
16
 
18
17
  from pydantic import BaseModel
19
18
 
@@ -58,9 +57,9 @@ class ShardReplica(BaseModel):
58
57
 
59
58
  class ShardObject(BaseModel):
60
59
  shard: str
61
- nidx_shard_id: Optional[str]
60
+ nidx_shard_id: str | None
62
61
 
63
62
 
64
63
  class KnowledgeboxShards(BaseModel):
65
64
  kbid: str
66
- shards: List[ShardObject]
65
+ shards: list[ShardObject]
nucliadb_models/labels.py CHANGED
@@ -14,9 +14,8 @@
14
14
  #
15
15
 
16
16
  from enum import Enum
17
- from typing import Dict, List, Optional
18
17
 
19
- from pydantic import BaseModel, model_validator
18
+ from pydantic import BaseModel, Field, model_validator
20
19
  from typing_extensions import Self
21
20
 
22
21
  BASE_LABELS: dict[str, set[str]] = {
@@ -96,18 +95,26 @@ class LabelSetKind(str, Enum):
96
95
 
97
96
 
98
97
  class Label(BaseModel):
99
- title: str
100
- related: Optional[str] = None
101
- text: Optional[str] = None
102
- uri: Optional[str] = None
98
+ title: str = Field(
99
+ description="Title of the label. This is the display name for the label shown in the UI and also used for searching."
100
+ )
101
+ related: str | None = None
102
+ text: str | None = None
103
+ uri: str | None = None
103
104
 
104
105
 
105
106
  class LabelSet(BaseModel):
106
- title: Optional[str] = None
107
- color: Optional[str] = "blue"
107
+ title: str | None = Field(
108
+ default=None,
109
+ description="Title of the labelset. It is a prettier display name for the labelset shown in the UI but it is not intended to be used for searching.",
110
+ )
111
+ color: str | None = "blue"
108
112
  multiple: bool = True
109
- kind: List[LabelSetKind] = []
110
- labels: List[Label] = []
113
+ kind: list[LabelSetKind] = []
114
+ labels: list[Label] = Field(
115
+ default_factory=list,
116
+ description="List of labels in the labelset. The titles of the labels must be unique within the labelset.",
117
+ )
111
118
 
112
119
  @model_validator(mode="after")
113
120
  def check_unique_labels(self) -> Self:
@@ -123,4 +130,4 @@ class LabelSet(BaseModel):
123
130
 
124
131
  class KnowledgeBoxLabels(BaseModel):
125
132
  uuid: str
126
- labelsets: Dict[str, LabelSet] = {}
133
+ labelsets: dict[str, LabelSet] = {}
nucliadb_models/link.py CHANGED
@@ -13,7 +13,6 @@
13
13
  # limitations under the License.
14
14
  #
15
15
  from datetime import datetime
16
- from typing import Dict, Optional
17
16
 
18
17
  from pydantic import BaseModel, Field
19
18
 
@@ -25,19 +24,19 @@ from pydantic import BaseModel, Field
25
24
 
26
25
 
27
26
  class FieldLink(BaseModel):
28
- added: Optional[datetime] = None
29
- headers: Optional[Dict[str, str]] = None
30
- cookies: Optional[Dict[str, str]] = None
31
- uri: Optional[str] = None
32
- language: Optional[str] = None
33
- localstorage: Optional[Dict[str, str]] = None
34
- css_selector: Optional[str] = None
35
- xpath: Optional[str] = None
36
- extract_strategy: Optional[str] = Field(
27
+ added: datetime | None = None
28
+ headers: dict[str, str] | None = None
29
+ cookies: dict[str, str] | None = None
30
+ uri: str | None = None
31
+ language: str | None = None
32
+ localstorage: dict[str, str] | None = None
33
+ css_selector: str | None = None
34
+ xpath: str | None = None
35
+ extract_strategy: str | None = Field(
37
36
  default=None,
38
37
  description="Id of the Nuclia extract strategy used at processing time. If not set, the default strategy was used. Extract strategies are defined at the learning configuration api.",
39
38
  )
40
- split_strategy: Optional[str] = Field(
39
+ split_strategy: str | None = Field(
41
40
  default=None,
42
41
  description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
43
42
  )
@@ -47,18 +46,18 @@ class FieldLink(BaseModel):
47
46
 
48
47
 
49
48
  class LinkField(BaseModel):
50
- headers: Optional[Dict[str, str]] = {}
51
- cookies: Optional[Dict[str, str]] = {}
49
+ headers: dict[str, str] | None = {}
50
+ cookies: dict[str, str] | None = {}
52
51
  uri: str
53
- language: Optional[str] = None
54
- localstorage: Optional[Dict[str, str]] = {}
55
- css_selector: Optional[str] = None
56
- xpath: Optional[str] = None
57
- extract_strategy: Optional[str] = Field(
52
+ language: str | None = None
53
+ localstorage: dict[str, str] | None = {}
54
+ css_selector: str | None = None
55
+ xpath: str | None = None
56
+ extract_strategy: str | None = Field(
58
57
  default=None,
59
58
  description="Id of the Nuclia extract strategy to use at processing time. If not set, the default strategy will be used. Extract strategies are defined at the learning configuration api.",
60
59
  )
61
- split_strategy: Optional[str] = Field(
60
+ split_strategy: str | None = Field(
62
61
  default=None,
63
62
  description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
64
63
  )