nucliadb-models 6.8.1.post4983__py3-none-any.whl → 6.10.0.post5694__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb-models might be problematic. Click here for more details.

Files changed (34) hide show
  1. nucliadb_models/agents/ingestion.py +4 -4
  2. nucliadb_models/augment.py +359 -0
  3. nucliadb_models/common.py +66 -57
  4. nucliadb_models/configuration.py +9 -9
  5. nucliadb_models/content_types.py +13 -11
  6. nucliadb_models/conversation.py +30 -29
  7. nucliadb_models/entities.py +17 -18
  8. nucliadb_models/external_index_providers.py +5 -20
  9. nucliadb_models/extracted.py +82 -83
  10. nucliadb_models/file.py +10 -11
  11. nucliadb_models/filters.py +78 -74
  12. nucliadb_models/graph/requests.py +38 -47
  13. nucliadb_models/hydration.py +423 -0
  14. nucliadb_models/internal/predict.py +7 -9
  15. nucliadb_models/internal/shards.py +2 -3
  16. nucliadb_models/labels.py +18 -11
  17. nucliadb_models/link.py +18 -19
  18. nucliadb_models/metadata.py +80 -53
  19. nucliadb_models/notifications.py +3 -3
  20. nucliadb_models/processing.py +1 -2
  21. nucliadb_models/resource.py +85 -102
  22. nucliadb_models/retrieval.py +147 -0
  23. nucliadb_models/search.py +360 -306
  24. nucliadb_models/security.py +2 -3
  25. nucliadb_models/text.py +7 -8
  26. nucliadb_models/trainset.py +1 -2
  27. nucliadb_models/utils.py +2 -3
  28. nucliadb_models/vectors.py +2 -5
  29. nucliadb_models/writer.py +56 -57
  30. {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/METADATA +2 -3
  31. nucliadb_models-6.10.0.post5694.dist-info/RECORD +41 -0
  32. nucliadb_models-6.8.1.post4983.dist-info/RECORD +0 -38
  33. {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/WHEEL +0 -0
  34. {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,423 @@
1
+ # Copyright 2025 Bosutech XXI S.L.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ from typing import Annotated
16
+
17
+ from pydantic import BaseModel, Field, StringConstraints
18
+
19
+ from nucliadb_models.common import FieldTypeName
20
+ from nucliadb_models.metadata import Origin
21
+ from nucliadb_models.resource import FieldConversation, FieldFile, FieldLink, FieldText
22
+ from nucliadb_models.search import Image
23
+ from nucliadb_models.security import ResourceSecurity
24
+
25
+
26
+ class ResourceHydration(BaseModel, extra="forbid"):
27
+ title: bool = Field(
28
+ default=True,
29
+ description="Hydrate resource titles",
30
+ )
31
+ summary: bool = Field(
32
+ default=False,
33
+ description="Hydrate resource summaries",
34
+ )
35
+
36
+ origin: bool = Field(
37
+ default=False,
38
+ description="Hydrate resource origin",
39
+ )
40
+
41
+ security: bool = Field(
42
+ default=False,
43
+ description="Hydrate resource security metadata",
44
+ )
45
+
46
+
47
+ class TextFieldHydration(BaseModel, extra="forbid"):
48
+ value: bool = Field(
49
+ default=False,
50
+ description="Hydrate text field values. Field values are similar payloads to the ones used to create them",
51
+ )
52
+ extracted_text: bool = Field(
53
+ default=False,
54
+ description="Hydrate extracted text for text fields",
55
+ )
56
+ # TODO: what else should be interesting to add?
57
+
58
+
59
+ class FileFieldHydration(BaseModel, extra="forbid"):
60
+ value: bool = Field(
61
+ default=False,
62
+ description="Hydrate file field values. Field values are similar payloads to the ones used to create them",
63
+ )
64
+ extracted_text: bool = Field(
65
+ default=False,
66
+ description="Hydrate extracted text for file fields",
67
+ )
68
+ # TODO: what else should be interesting to add?
69
+
70
+
71
+ class LinkFieldHydration(BaseModel, extra="forbid"):
72
+ value: bool = Field(
73
+ default=False,
74
+ description="Hydrate link field values. Field values are similar payloads to the ones used to create them",
75
+ )
76
+ extracted_text: bool = Field(
77
+ default=False,
78
+ description="Hydrate extracted text for link fields",
79
+ )
80
+ # TODO: what else should be interesting to add?
81
+
82
+
83
+ class ConversationFieldHydration(BaseModel, extra="forbid"):
84
+ value: bool = Field(
85
+ default=False,
86
+ description="Hydrate conversation field values. Field values are similar payloads to the ones used to create them",
87
+ )
88
+
89
+ # TODO: add fields to hydrate conversation fields. Think about how to handle
90
+ # splits and fulfill the conversational RAG strategies
91
+
92
+ # TODO: what else should be interesting to add?
93
+
94
+
95
+ class GenericFieldHydration(BaseModel, extra="forbid"):
96
+ value: bool = Field(
97
+ default=False,
98
+ description="Hydrate generic field values. Field values are similar payloads to the ones used to create them",
99
+ )
100
+ extracted_text: bool = Field(
101
+ default=False,
102
+ description="Hydrate extracted text for generic fields",
103
+ )
104
+ # TODO: what else should be interesting to add?
105
+
106
+
107
+ class FieldHydration(BaseModel, extra="forbid"):
108
+ text: TextFieldHydration | None = Field(
109
+ default_factory=TextFieldHydration,
110
+ description="Text fields hydration options",
111
+ )
112
+ file: FileFieldHydration | None = Field(
113
+ default_factory=FileFieldHydration,
114
+ description="File fields hydration options",
115
+ )
116
+ link: LinkFieldHydration | None = Field(
117
+ default_factory=LinkFieldHydration,
118
+ description="Link fields hydration options",
119
+ )
120
+ conversation: ConversationFieldHydration | None = Field(
121
+ default_factory=ConversationFieldHydration,
122
+ description="Conversation fields hydration options",
123
+ )
124
+ generic: GenericFieldHydration | None = Field(
125
+ default_factory=GenericFieldHydration,
126
+ description="Generic fields hydration options",
127
+ )
128
+
129
+
130
+ class NeighbourParagraphHydration(BaseModel, extra="forbid"):
131
+ before: int = Field(
132
+ default=2,
133
+ ge=0,
134
+ description="Number of previous paragraphs to hydrate",
135
+ )
136
+ after: int = Field(
137
+ default=2,
138
+ ge=0,
139
+ description="Number of following paragraphs to hydrate",
140
+ )
141
+
142
+
143
+ class RelatedParagraphHydration(BaseModel, extra="forbid"):
144
+ neighbours: NeighbourParagraphHydration | None = Field(
145
+ default=None,
146
+ description="Hydrate extra paragraphs that surround the original one",
147
+ )
148
+
149
+ # TODO: FEATURE: implement related paragraphs by page
150
+ # page: bool = Field(
151
+ # default=False,
152
+ # description="Hydrate all paragraphs in the same page. This only applies to fields with pages",
153
+ # )
154
+
155
+ # TODO: description
156
+ # XXX: should we let users control the amount of elements?
157
+ parents: bool = False
158
+ # TODO: description
159
+ # XXX: should we let users control the amount of elements?
160
+ siblings: bool = False
161
+ # TODO: description
162
+ # XXX: should we let users control the amount of elements?
163
+ replacements: bool = False
164
+
165
+
166
+ class ImageParagraphHydration(BaseModel, extra="forbid"):
167
+ # The source image is also known as reference or reference_file in the
168
+ # paragraph context. The reference/reference_file is the filename of the
169
+ # source image from which the paragraph has been extracted
170
+ source_image: bool = Field(
171
+ default=False,
172
+ description=(
173
+ "When a paragraph has been extracted from an image (using OCR, inception...), "
174
+ "hydrate the image that represents it"
175
+ ),
176
+ )
177
+
178
+
179
+ class TableParagraphHydration(BaseModel, extra="forbid"):
180
+ # TODO: implement. ARAG uses the label "/k/table" to check whether a
181
+ # paragraph is a table or not. We can also use info on maindb
182
+ table_page_preview: bool = Field(
183
+ default=False,
184
+ description="Hydrate the page preview for the table. This will only hydrate fields with pages",
185
+ )
186
+
187
+
188
+ class ParagraphPageHydration(BaseModel, extra="forbid"):
189
+ # For some field types (file and link) learning generates previews. A
190
+ # preview is a PDF file representing the content. For a docx for example, is
191
+ # the PDF equivalent. Depending on the field type, the preview can
192
+ # represent, for example, a page in a document or a portion of a webpage.
193
+ page_with_visual: bool = Field(
194
+ default=False,
195
+ description=(
196
+ "When a paragraph has been extracted from a page containing visual "
197
+ "content (images, tables...), hydrate the preview of the paragraph's "
198
+ "page as an image. Not all field types have previews nor visual content"
199
+ ),
200
+ )
201
+
202
+
203
+ class ParagraphHydration(BaseModel, extra="forbid"):
204
+ text: bool = Field(
205
+ default=True,
206
+ description="Hydrate paragraph text",
207
+ )
208
+ image: ImageParagraphHydration | None = Field(
209
+ default=None,
210
+ description="Hydrate options for paragraphs extracted from images (using OCR, inception...)",
211
+ )
212
+ table: TableParagraphHydration | None = Field(
213
+ default=None,
214
+ description="Hydrate options for paragraphs extracted from tables",
215
+ )
216
+
217
+ # TODO: at some point, we should add hydration options for paragraphs from
218
+ # audio and video
219
+
220
+ page: ParagraphPageHydration | None = Field(
221
+ default=None,
222
+ description="Hydrate options for paragraphs within a page. This applies to paragraphs in fields with pages",
223
+ )
224
+
225
+ related: RelatedParagraphHydration | None = Field(
226
+ default=None,
227
+ description="Hydration options for related paragraphs. For example, neighbours or sibling paragraphs",
228
+ )
229
+
230
+
231
+ class Hydration(BaseModel, extra="forbid"):
232
+ resource: ResourceHydration | None = Field(
233
+ default_factory=ResourceHydration,
234
+ description="Resource hydration options",
235
+ )
236
+ field: FieldHydration = Field(
237
+ default_factory=FieldHydration,
238
+ description="Field hydration options",
239
+ )
240
+ paragraph: ParagraphHydration = Field(
241
+ default_factory=ParagraphHydration,
242
+ description="Paragraph hydration options",
243
+ )
244
+
245
+
246
+ ParagraphId = Annotated[
247
+ str,
248
+ StringConstraints(
249
+ # resource-uuid/field-type/field-id/[split-id/]paragraph-id
250
+ pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$",
251
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
252
+ # max field id of 250 and 10 digit paragraphs. More than enough
253
+ max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
254
+ ),
255
+ ]
256
+
257
+
258
+ class HydrateRequest(BaseModel, extra="forbid"):
259
+ data: list[ParagraphId] = Field(
260
+ description="List of paragraph ids we want to hydrate",
261
+ max_length=50,
262
+ )
263
+ hydration: Hydration = Field(description="Description of how hydration must be performed")
264
+
265
+
266
+ ### Response models
267
+
268
+
269
+ class HydratedResource(BaseModel, extra="forbid"):
270
+ id: str = Field(description="Unique resource id")
271
+ slug: str = Field(description="Resource slug")
272
+
273
+ title: str | None = None
274
+ summary: str | None = None
275
+
276
+ origin: Origin | None = None
277
+
278
+ security: ResourceSecurity | None = None
279
+
280
+ # TODO: add resource labels to hydrated resources
281
+
282
+
283
+ class FieldExtractedData(BaseModel, extra="forbid"):
284
+ text: str | None = None
285
+
286
+
287
+ class SplitFieldExtractedData(BaseModel, extra="forbid"):
288
+ texts: dict[str, str] | None = None
289
+
290
+
291
+ class HydratedTextField(BaseModel, extra="forbid"):
292
+ id: str = Field(description="Unique field id")  # required: the text is a description, not a default value
293
+ resource: str = Field(description="Field resource id")  # required: the text is a description, not a default value
294
+ field_type: FieldTypeName = FieldTypeName.TEXT
295
+
296
+ value: FieldText | None = None
297
+ extracted: FieldExtractedData | None = None
298
+
299
+
300
+ class HydratedFileField(BaseModel, extra="forbid"):
301
+ id: str = Field(description="Unique field id")  # required: the text is a description, not a default value
302
+ resource: str = Field(description="Field resource id")  # required: the text is a description, not a default value
303
+ field_type: FieldTypeName = FieldTypeName.FILE
304
+
305
+ value: FieldFile | None = None
306
+ extracted: FieldExtractedData | None = None
307
+
308
+ previews: dict[str, Image] | None = Field(
309
+ default=None,
310
+ title="Previews of specific parts of the field",
311
+ description=(
312
+ "Previews for specific pages of this field. Previews are different "
313
+ "depending on the file type. For example, for a PDF file, a preview "
314
+ "will be an image of a single page. "
315
+ "In this field, previews will be populated according to the hydration "
316
+ "options requested."
317
+ ),
318
+ )
319
+
320
+
321
+ class HydratedLinkField(BaseModel, extra="forbid"):
322
+ id: str = Field(description="Unique field id")  # required: the text is a description, not a default value
323
+ resource: str = Field(description="Field resource id")  # required: the text is a description, not a default value
324
+ field_type: FieldTypeName = FieldTypeName.LINK
325
+
326
+ value: FieldLink | None = None
327
+ extracted: FieldExtractedData | None = None
328
+
329
+
330
+ class HydratedConversationField(BaseModel, extra="forbid"):
331
+ id: str = Field(description="Unique field id")  # required: the text is a description, not a default value
332
+ resource: str = Field(description="Field resource id")  # required: the text is a description, not a default value
333
+ field_type: FieldTypeName = FieldTypeName.CONVERSATION
334
+
335
+ value: FieldConversation | None = None
336
+ extracted: FieldExtractedData | None = None  # NOTE(review): SplitFieldExtractedData is defined in this file but unused; confirm which type conversations should carry
337
+
338
+
339
+ class HydratedGenericField(BaseModel, extra="forbid"):
340
+ id: str = Field(description="Unique field id")  # required: the text is a description, not a default value
341
+ resource: str = Field(description="Field resource id")  # required: the text is a description, not a default value
342
+ field_type: FieldTypeName = FieldTypeName.TEXT  # NOTE(review): siblings each use their own type; TEXT looks copied from HydratedTextField, confirm whether a generic member was intended
343
+
344
+ value: str | None = None
345
+ extracted: FieldExtractedData | None = None
346
+
347
+
348
+ class RelatedNeighbourParagraphRefs(BaseModel, extra="forbid"):
349
+ before: list[str] | None = None
350
+ after: list[str] | None = None
351
+
352
+
353
+ class RelatedParagraphRefs(BaseModel, extra="forbid"):
354
+ neighbours: RelatedNeighbourParagraphRefs | None = None
355
+ parents: list[str] | None = None
356
+ siblings: list[str] | None = None
357
+ replacements: list[str] | None = None
358
+
359
+
360
+ class HydratedParagraphImage(BaseModel, extra="forbid"):
361
+ source_image: Image | None = Field(
362
+ default=None,
363
+ description=(
364
+ "Source image for this paragraph. This only applies to paragraphs "
365
+ "extracted from an image using OCR or inception, and if this "
366
+ "hydration option has been enabled in the request"
367
+ ),
368
+ )
369
+
370
+
371
+ class HydratedParagraphTable(BaseModel, extra="forbid"):
372
+ page_preview_ref: str | None = Field(
373
+ default=None,
374
+ description=(
375
+ "Reference to the page preview for this paragraph. The actual "
376
+ "preview will be found in the previews of its field. This only "
377
+ "applies to paragraphs generated from a table and if the "
378
+ "corresponding hydration option has been enabled in the request"
379
+ ),
380
+ )
381
+
382
+
383
+ class HydratedParagraphPage(BaseModel, extra="forbid"):
384
+ page_preview_ref: str | None = Field(
385
+ default=None,
386
+ description=(
387
+ "Reference to the page preview for this paragraph. The actual "
388
+ "preview will be found in the previews of its field. This only "
389
+ "applies to paragraphs extracted from a page containing visual "
390
+ "content and if the corresponding hydration option has been enabled "
391
+ "in the request"
392
+ ),
393
+ )
394
+
395
+
396
+ class HydratedParagraph(BaseModel, extra="forbid"):
397
+ id: str = Field(description="Unique paragraph id")
398
+ field: str = Field(description="Paragraph field id")
399
+ resource: str = Field(description="Paragraph resource id")
400
+
401
+ text: str | None = None
402
+
403
+ # TODO: add labels to hydrated paragraphs
404
+ # labels: Optional[list[str]] = None
405
+
406
+ related: RelatedParagraphRefs | None = None
407
+
408
+ image: HydratedParagraphImage | None = None
409
+ table: HydratedParagraphTable | None = None
410
+ page: HydratedParagraphPage | None = None
411
+
412
+
413
+ class Hydrated(BaseModel, extra="forbid"):
414
+ resources: dict[str, HydratedResource]
415
+ fields: dict[
416
+ str,
417
+ HydratedTextField
418
+ | HydratedFileField
419
+ | HydratedLinkField
420
+ | HydratedConversationField
421
+ | HydratedGenericField,
422
+ ]
423
+ paragraphs: dict[str, HydratedParagraph]
@@ -19,13 +19,11 @@ Models for Predict API v1.
19
19
  ATTENTION! Keep these models in sync with models on Predict API
20
20
  """
21
21
 
22
- from typing import List, Optional
23
-
24
22
  from pydantic import BaseModel, Field
25
23
 
26
24
 
27
25
  class SentenceSearch(BaseModel):
28
- vectors: dict[str, List[float]] = Field(
26
+ vectors: dict[str, list[float]] = Field(
29
27
  default_factory=dict,
30
28
  description="Sentence vectors for each semantic model",
31
29
  min_length=1,
@@ -45,14 +43,14 @@ class Ner(BaseModel):
45
43
 
46
44
 
47
45
  class TokenSearch(BaseModel):
48
- tokens: List[Ner] = []
46
+ tokens: list[Ner] = []
49
47
  time: float
50
48
  input_tokens: int = 0
51
49
 
52
50
 
53
51
  class QueryInfo(BaseModel):
54
- language: Optional[str]
55
- stop_words: List[str] = Field(default_factory=list)
52
+ language: str | None
53
+ stop_words: list[str] = Field(default_factory=list)
56
54
  semantic_thresholds: dict[str, float] = Field(
57
55
  default_factory=dict,
58
56
  description="Semantic threshold for each semantic model",
@@ -60,10 +58,10 @@ class QueryInfo(BaseModel):
60
58
  )
61
59
  visual_llm: bool
62
60
  max_context: int
63
- entities: Optional[TokenSearch]
64
- sentence: Optional[SentenceSearch]
61
+ entities: TokenSearch | None
62
+ sentence: SentenceSearch | None
65
63
  query: str
66
- rephrased_query: Optional[str] = None
64
+ rephrased_query: str | None = None
67
65
 
68
66
 
69
67
  class RerankModel(BaseModel):
@@ -13,7 +13,6 @@
13
13
  # limitations under the License.
14
14
  #
15
15
  from enum import Enum
16
- from typing import List, Optional
17
16
 
18
17
  from pydantic import BaseModel
19
18
 
@@ -58,9 +57,9 @@ class ShardReplica(BaseModel):
58
57
 
59
58
  class ShardObject(BaseModel):
60
59
  shard: str
61
- nidx_shard_id: Optional[str]
60
+ nidx_shard_id: str | None
62
61
 
63
62
 
64
63
  class KnowledgeboxShards(BaseModel):
65
64
  kbid: str
66
- shards: List[ShardObject]
65
+ shards: list[ShardObject]
nucliadb_models/labels.py CHANGED
@@ -14,9 +14,8 @@
14
14
  #
15
15
 
16
16
  from enum import Enum
17
- from typing import Dict, List, Optional
18
17
 
19
- from pydantic import BaseModel, model_validator
18
+ from pydantic import BaseModel, Field, model_validator
20
19
  from typing_extensions import Self
21
20
 
22
21
  BASE_LABELS: dict[str, set[str]] = {
@@ -96,18 +95,26 @@ class LabelSetKind(str, Enum):
96
95
 
97
96
 
98
97
  class Label(BaseModel):
99
- title: str
100
- related: Optional[str] = None
101
- text: Optional[str] = None
102
- uri: Optional[str] = None
98
+ title: str = Field(
99
+ description="Title of the label. This is the display name for the label shown in the UI and also used for searching."
100
+ )
101
+ related: str | None = None
102
+ text: str | None = None
103
+ uri: str | None = None
103
104
 
104
105
 
105
106
  class LabelSet(BaseModel):
106
- title: Optional[str] = None
107
- color: Optional[str] = "blue"
107
+ title: str | None = Field(
108
+ default=None,
109
+ description="Title of the labelset. It is a prettier display name for the labelset shown in the UI but it is not intended to be used for searching.",
110
+ )
111
+ color: str | None = "blue"
108
112
  multiple: bool = True
109
- kind: List[LabelSetKind] = []
110
- labels: List[Label] = []
113
+ kind: list[LabelSetKind] = []
114
+ labels: list[Label] = Field(
115
+ default_factory=list,
116
+ description="List of labels in the labelset. The titles of the labels must be unique within the labelset.",
117
+ )
111
118
 
112
119
  @model_validator(mode="after")
113
120
  def check_unique_labels(self) -> Self:
@@ -123,4 +130,4 @@ class LabelSet(BaseModel):
123
130
 
124
131
  class KnowledgeBoxLabels(BaseModel):
125
132
  uuid: str
126
- labelsets: Dict[str, LabelSet] = {}
133
+ labelsets: dict[str, LabelSet] = {}
nucliadb_models/link.py CHANGED
@@ -13,7 +13,6 @@
13
13
  # limitations under the License.
14
14
  #
15
15
  from datetime import datetime
16
- from typing import Dict, Optional
17
16
 
18
17
  from pydantic import BaseModel, Field
19
18
 
@@ -25,19 +24,19 @@ from pydantic import BaseModel, Field
25
24
 
26
25
 
27
26
  class FieldLink(BaseModel):
28
- added: Optional[datetime] = None
29
- headers: Optional[Dict[str, str]] = None
30
- cookies: Optional[Dict[str, str]] = None
31
- uri: Optional[str] = None
32
- language: Optional[str] = None
33
- localstorage: Optional[Dict[str, str]] = None
34
- css_selector: Optional[str] = None
35
- xpath: Optional[str] = None
36
- extract_strategy: Optional[str] = Field(
27
+ added: datetime | None = None
28
+ headers: dict[str, str] | None = None
29
+ cookies: dict[str, str] | None = None
30
+ uri: str | None = None
31
+ language: str | None = None
32
+ localstorage: dict[str, str] | None = None
33
+ css_selector: str | None = None
34
+ xpath: str | None = None
35
+ extract_strategy: str | None = Field(
37
36
  default=None,
38
37
  description="Id of the Nuclia extract strategy used at processing time. If not set, the default strategy was used. Extract strategies are defined at the learning configuration api.",
39
38
  )
40
- split_strategy: Optional[str] = Field(
39
+ split_strategy: str | None = Field(
41
40
  default=None,
42
41
  description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
43
42
  )
@@ -47,18 +46,18 @@ class FieldLink(BaseModel):
47
46
 
48
47
 
49
48
  class LinkField(BaseModel):
50
- headers: Optional[Dict[str, str]] = {}
51
- cookies: Optional[Dict[str, str]] = {}
49
+ headers: dict[str, str] | None = {}
50
+ cookies: dict[str, str] | None = {}
52
51
  uri: str
53
- language: Optional[str] = None
54
- localstorage: Optional[Dict[str, str]] = {}
55
- css_selector: Optional[str] = None
56
- xpath: Optional[str] = None
57
- extract_strategy: Optional[str] = Field(
52
+ language: str | None = None
53
+ localstorage: dict[str, str] | None = {}
54
+ css_selector: str | None = None
55
+ xpath: str | None = None
56
+ extract_strategy: str | None = Field(
58
57
  default=None,
59
58
  description="Id of the Nuclia extract strategy to use at processing time. If not set, the default strategy will be used. Extract strategies are defined at the learning configuration api.",
60
59
  )
61
- split_strategy: Optional[str] = Field(
60
+ split_strategy: str | None = Field(
62
61
  default=None,
63
62
  description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
64
63
  )