nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-client might be problematic.

Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
nv_ingest_client/primitives/tasks/embed.py
@@ -7,82 +7,15 @@
 import logging
 from typing import Any
 from typing import Dict
-from typing import Literal
 from typing import Optional
-from typing import Type
 
-from pydantic import BaseModel
-from pydantic import ConfigDict
-from pydantic import model_validator
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
 
 from .task_base import Task
 
 logger = logging.getLogger(__name__)
 
 
-class EmbedTaskSchema(BaseModel):
-    """
-    Schema for embed task configuration.
-
-    This schema contains configuration details for an embedding task,
-    including the endpoint URL, model name, API key, and error filtering flag.
-
-    Attributes
-    ----------
-    endpoint_url : Optional[str]
-        URL of the embedding endpoint. Default is None.
-    model_name : Optional[str]
-        Name of the embedding model. Default is None.
-    api_key : Optional[str]
-        API key for authentication with the embedding service. Default is None.
-    filter_errors : bool
-        Flag to indicate whether errors should be filtered. Default is False.
-    """
-
-    endpoint_url: Optional[str] = None
-    model_name: Optional[str] = None
-    api_key: Optional[str] = None
-    filter_errors: bool = False
-
-    text_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
-    image_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
-    structured_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
-    audio_elements_modality: Optional[Literal["text"]] = None
-
-    @model_validator(mode="before")
-    def handle_deprecated_fields(cls: Type["EmbedTaskSchema"], values: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Handle deprecated fields before model validation.
-
-        This validator checks for the presence of deprecated keys ('text' and 'tables')
-        in the input dictionary and removes them. Warnings are issued if these keys are found.
-
-        Parameters
-        ----------
-        values : Dict[str, Any]
-            Input dictionary of model values.
-
-        Returns
-        -------
-        Dict[str, Any]
-            The updated dictionary with deprecated fields removed.
-        """
-        if "text" in values:
-            logger.warning(
-                "'text' parameter is deprecated and will be ignored. Future versions will remove this argument."
-            )
-            values.pop("text")
-        if "tables" in values:
-            logger.warning(
-                "'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
-            )
-            values.pop("tables")
-        return values
-
-    model_config = ConfigDict(extra="forbid")
-    model_config["protected_namespaces"] = ()
-
-
 class EmbedTask(Task):
     """
     Object for document embedding tasks.
@@ -103,6 +36,8 @@ class EmbedTask(Task):
         image_elements_modality: Optional[str] = None,
         structured_elements_modality: Optional[str] = None,
         audio_elements_modality: Optional[str] = None,
+        custom_content_field: Optional[str] = None,
+        result_target_field: Optional[str] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -133,14 +68,30 @@ class EmbedTask(Task):
                 "'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
            )
 
-        self._endpoint_url: Optional[str] = endpoint_url
-        self._model_name: Optional[str] = model_name
-        self._api_key: Optional[str] = api_key
-        self._filter_errors: bool = filter_errors
-        self._text_elements_modality: Optional[bool] = text_elements_modality
-        self._image_elements_modality: Optional[bool] = image_elements_modality
-        self._structured_elements_modality: Optional[bool] = structured_elements_modality
-        self._audio_elements_modality: Optional[bool] = audio_elements_modality
+        # Use the API schema for validation
+        validated_data = IngestTaskEmbedSchema(
+            endpoint_url=endpoint_url,
+            model_name=model_name,
+            api_key=api_key,
+            filter_errors=filter_errors,
+            text_elements_modality=text_elements_modality,
+            image_elements_modality=image_elements_modality,
+            structured_elements_modality=structured_elements_modality,
+            audio_elements_modality=audio_elements_modality,
+            custom_content_field=custom_content_field,
+            result_target_field=result_target_field,
+        )
+
+        self._endpoint_url = validated_data.endpoint_url
+        self._model_name = validated_data.model_name
+        self._api_key = validated_data.api_key
+        self._filter_errors = validated_data.filter_errors
+        self._text_elements_modality = validated_data.text_elements_modality
+        self._image_elements_modality = validated_data.image_elements_modality
+        self._structured_elements_modality = validated_data.structured_elements_modality
+        self._audio_elements_modality = validated_data.audio_elements_modality
+        self._custom_content_field = validated_data.custom_content_field
+        self._result_target_field = validated_data.result_target_field
 
     def __str__(self) -> str:
         """
@@ -169,6 +120,10 @@ class EmbedTask(Task):
            info += f" structured_elements_modality: {self._structured_elements_modality}\n"
         if self._audio_elements_modality:
            info += f" audio_elements_modality: {self._audio_elements_modality}\n"
+        if self._custom_content_field:
+            info += f" custom_content_field: {self._custom_content_field}\n"
+        if self._result_target_field:
+            info += f" result_target_field: {self.result_target_field}\n"
         return info
 
     def to_dict(self) -> Dict[str, Any]:
@@ -204,4 +159,10 @@ class EmbedTask(Task):
         if self._audio_elements_modality:
            task_properties["audio_elements_modality"] = self._audio_elements_modality
 
+        if self._custom_content_field:
+            task_properties["custom_content_field"] = self._custom_content_field
+
+        if self._result_target_field:
+            task_properties["result_target_field"] = self.result_target_field
+
         return {"type": "embed", "task_properties": task_properties}
nv_ingest_client/primitives/tasks/extract.py
@@ -12,12 +12,8 @@ from typing import Any
 from typing import Dict
 from typing import Literal
 from typing import Optional
-from typing import get_args
 
-from pydantic import BaseModel
-from pydantic import ConfigDict
-from pydantic import field_validator
-from pydantic import model_validator
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
 
 from .task_base import Task
 
@@ -64,142 +60,9 @@ _Type_Extract_Method_PDF = Literal[
     "unstructured_io",
 ]
 
-_Type_Extract_Method_DOCX = Literal["python_docx", "haystack", "unstructured_local", "unstructured_service"]
+_Type_Extract_Images_Method = Literal["group", "yolox"]
 
-_Type_Extract_Method_PPTX = Literal["python_pptx", "haystack", "unstructured_local", "unstructured_service"]
-
-_Type_Extract_Method_Image = Literal["image"]
-
-_Type_Extract_Method_Audio = Literal["audio"]
-
-_Type_Extract_Method_Text = Literal["txt"]
-
-_Type_Extract_Method_Html = Literal["markitdown"]
-
-_Type_Extract_Method_Map = {
-    "bmp": get_args(_Type_Extract_Method_Image),
-    "docx": get_args(_Type_Extract_Method_DOCX),
-    "html": get_args(_Type_Extract_Method_Html),
-    "jpeg": get_args(_Type_Extract_Method_Image),
-    "jpg": get_args(_Type_Extract_Method_Image),
-    "pdf": get_args(_Type_Extract_Method_PDF),
-    "png": get_args(_Type_Extract_Method_Image),
-    "pptx": get_args(_Type_Extract_Method_PPTX),
-    "text": get_args(_Type_Extract_Method_Text),
-    "tiff": get_args(_Type_Extract_Method_Image),
-    "txt": get_args(_Type_Extract_Method_Text),
-    "mp3": get_args(_Type_Extract_Method_Audio),
-    "wav": get_args(_Type_Extract_Method_Audio),
-}
-
-_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium", "nemoretriever_parse"]
-
-_Type_Extract_Tables_Method_DOCX = Literal["python_docx",]
-
-_Type_Extract_Tables_Method_PPTX = Literal["python_pptx",]
-
-_Type_Extract_Tables_Method_Map = {
-    "pdf": get_args(_Type_Extract_Tables_Method_PDF),
-    "docx": get_args(_Type_Extract_Tables_Method_DOCX),
-    "pptx": get_args(_Type_Extract_Tables_Method_PPTX),
-}
-
-_Type_Extract_Images_Method = Literal["simple", "group"]
-
-
-class ExtractTaskSchema(BaseModel):
-    document_type: str
-    extract_method: str = None  # Initially allow None to set a smart default
-    extract_text: bool = True
-    extract_images: bool = True
-    extract_images_method: str = "group"
-    extract_images_params: Optional[Dict[str, Any]] = None
-    extract_tables: bool = True
-    extract_tables_method: str = "yolox"
-    extract_charts: Optional[bool] = None  # Initially allow None to set a smart default
-    extract_infographics: bool = False
-    extract_page_as_image: bool = False
-    extract_audio_params: Optional[Dict[str, Any]] = None
-    text_depth: str = "document"
-    paddle_output_format: Optional[str] = None
-    table_output_format: str = "pseudo_markdown"
-
-    @model_validator(mode="after")
-    @classmethod
-    def set_default_extract_method(cls, values):
-        document_type = values.document_type.lower()  # Ensure case-insensitive comparison
-        extract_method = values.extract_method
-        paddle_output_format = values.paddle_output_format
-
-        if document_type not in _DEFAULT_EXTRACTOR_MAP:
-            raise ValueError(
-                f"Unsupported document type: {document_type}."
-                f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
-            )
-
-        if extract_method is None:
-            values.extract_method = _DEFAULT_EXTRACTOR_MAP[document_type]
-
-        if paddle_output_format is not None:
-            logger.warning(
-                "`paddle_output_format` is deprecated and will be removed in a future release. "
-                "Please use `table_output_format` instead."
-            )
-            values.table_output_format = paddle_output_format
-
-        return values
-
-    @field_validator("extract_charts")
-    def set_default_extract_charts(cls, v, values):
-        # `extract_charts` is initially set to None for backward compatibility.
-        # {extract_tables: true, extract_charts: None} or {extract_tables: true, extract_charts: true} enables both
-        # table and chart extraction.
-        # {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
-        extract_charts = v
-        if extract_charts is None:
-            extract_charts = values.data.get("extract_tables")
-
-        return extract_charts
-
-    @field_validator("extract_method")
-    def extract_method_must_be_valid(cls, v, values, **kwargs):
-        document_type = values.data.get("document_type", "").lower()  # Ensure case-insensitive comparison
-
-        # Skip validation for text-like types, since they do not have 'extract' stages.
-        if document_type in ["txt", "text", "json", "md", "sh"]:
-            return
-
-        valid_methods = set(_Type_Extract_Method_Map[document_type])
-        if v not in valid_methods:
-            raise ValueError(f"extract_method must be one of {valid_methods}")
-
-        return v
-
-    @field_validator("document_type")
-    def document_type_must_be_supported(cls, v):
-        if v.lower() not in _DEFAULT_EXTRACTOR_MAP:
-            raise ValueError(
-                f"Unsupported document type '{v}'. Supported types are: {', '.join(_DEFAULT_EXTRACTOR_MAP.keys())}"
-            )
-        return v.lower()
-
-    @field_validator("extract_tables_method")
-    def extract_tables_method_must_be_valid(cls, v, values, **kwargs):
-        document_type = values.data.get("document_type", "").lower()  # Ensure case-insensitive comparison
-        valid_methods = set(_Type_Extract_Tables_Method_Map[document_type])
-        if v not in valid_methods:
-            raise ValueError(f"extract_method must be one of {valid_methods}")
-        return v
-
-    @field_validator("extract_images_method")
-    def extract_images_method_must_be_valid(cls, v):
-        if v.lower() not in get_args(_Type_Extract_Images_Method):
-            raise ValueError(
-                f"Unsupported document type '{v}'. Supported types are: {', '.join(_Type_Extract_Images_Method)}"
-            )
-        return v.lower()
-
-    model_config = ConfigDict(extra="forbid")
+_Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
 
 
 class ExtractTask(Task):
@@ -210,7 +73,7 @@ class ExtractTask(Task):
     def __init__(
         self,
         document_type,
-        extract_method: _Type_Extract_Method_PDF = "pdfium",
+        extract_method: _Type_Extract_Method_PDF = None,
         extract_text: bool = False,
         extract_images: bool = False,
         extract_tables: bool = False,
@@ -223,26 +86,69 @@ class ExtractTask(Task):
         extract_page_as_image: bool = False,
         text_depth: str = "document",
         paddle_output_format: str = "pseudo_markdown",
-        table_output_format: str = "pseudo_markdown",
+        table_output_format: str = "markdown",
     ) -> None:
         """
         Setup Extract Task Config
         """
         super().__init__()
 
-        self._document_type = document_type
+        # Set default extract_method if None
+        if extract_method is None:
+            # Handle both string and enum inputs
+            if hasattr(document_type, "value"):
+                document_type_str = document_type.value
+            else:
+                document_type_str = document_type
+            document_type_lower = document_type_str.lower()
+            if document_type_lower not in _DEFAULT_EXTRACTOR_MAP:
+                raise ValueError(
+                    f"Unsupported document type: {document_type}."
+                    f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
+                )
+            extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
+
+        # Set default extract_charts if None
+        if extract_charts is None:
+            extract_charts = extract_tables
+
+        # Build params dict for API schema validation
+        extract_params = {
+            "extract_text": extract_text,
+            "extract_images": extract_images,
+            "extract_images_method": extract_images_method,
+            "extract_tables": extract_tables,
+            "extract_tables_method": extract_tables_method,
+            "extract_charts": extract_charts,
+            "extract_infographics": extract_infographics,
+            "extract_page_as_image": extract_page_as_image,
+            "text_depth": text_depth,
+            "table_output_format": table_output_format,
+        }
+
+        # Add optional parameters if provided
+        if extract_images_params:
+            extract_params["extract_images_params"] = extract_images_params
+        if extract_audio_params:
+            extract_params["extract_audio_params"] = extract_audio_params
+
+        # Use the API schema for validation
+        validated_data = IngestTaskExtractSchema(
+            document_type=document_type,
+            method=extract_method,
+            params=extract_params,
+        )
+
+        # Store validated data
+        self._document_type = validated_data.document_type
+        self._extract_method = validated_data.method
         self._extract_audio_params = extract_audio_params
         self._extract_images = extract_images
-        self._extract_method = extract_method
         self._extract_tables = extract_tables
         self._extract_images_method = extract_images_method
         self._extract_images_params = extract_images_params
         self._extract_tables_method = extract_tables_method
-        # `extract_charts` is initially set to None for backward compatibility.
-        # {extract_tables: true, extract_charts: None} or {extract_tables: true, extract-charts: true} enables both
-        # table and chart extraction.
-        # {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
-        self._extract_charts = extract_charts if extract_charts is not None else extract_tables
+        self._extract_charts = extract_charts
         self._extract_infographics = extract_infographics
         self._extract_page_as_image = extract_page_as_image
         self._extract_text = extract_text
@@ -256,34 +162,27 @@ class ExtractTask(Task):
         """
         info = ""
         info += "Extract Task:\n"
-        info += f" document type: {self._document_type}\n"
-        info += f" extract method: {self._extract_method}\n"
-        info += f" extract text: {self._extract_text}\n"
-        info += f" extract images: {self._extract_images}\n"
-        info += f" extract tables: {self._extract_tables}\n"
-        info += f" extract charts: {self._extract_charts}\n"
-        info += f" extract infographics: {self._extract_infographics}\n"
-        info += f" extract page as image: {self._extract_page_as_image}\n"
-        info += f" extract images method: {self._extract_images_method}\n"
-        info += f" extract tables method: {self._extract_tables_method}\n"
-        info += f" text depth: {self._text_depth}\n"
+        info += f" document_type: {self._document_type.value}\n"
+        info += f" extract_method: {self._extract_method}\n"
+        info += f" extract_text: {self._extract_text}\n"
+        info += f" extract_images: {self._extract_images}\n"
+        info += f" extract_tables: {self._extract_tables}\n"
+        info += f" extract_charts: {self._extract_charts}\n"
+        info += f" extract_infographics: {self._extract_infographics}\n"
+        info += f" extract_page_as_image: {self._extract_page_as_image}\n"
+        info += f" text_depth: {self._text_depth}\n"
         info += f" table_output_format: {self._table_output_format}\n"
-
-        if self._extract_images_params:
-            info += f" extract images params: {self._extract_images_params}\n"
-        if self._extract_audio_params:
-            info += f" extract audio params: {self._extract_audio_params}\n"
         return info
 
     def to_dict(self) -> Dict:
         """
-        Convert to a dict for submission to redis (fixme)
+        Convert to a dict for submission to redis
         """
         extract_params = {
             "extract_text": self._extract_text,
             "extract_images": self._extract_images,
-            "extract_tables": self._extract_tables,
             "extract_images_method": self._extract_images_method,
+            "extract_tables": self._extract_tables,
             "extract_tables_method": self._extract_tables_method,
             "extract_charts": self._extract_charts,
             "extract_infographics": self._extract_infographics,
@@ -306,7 +205,7 @@ class ExtractTask(Task):
 
         task_properties = {
             "method": self._extract_method,
-            "document_type": self._document_type,
+            "document_type": self._document_type.value,
             "params": extract_params,
         }
 
@@ -339,4 +238,4 @@
 
     @property
     def document_type(self):
-        return self._document_type
+        return self._document_type.value
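
Summary of the ExtractTask changes above: the client-side ExtractTaskSchema and per-type method maps are removed, validation is delegated to IngestTaskExtractSchema, extract_method now defaults to the per-type entry in _DEFAULT_EXTRACTOR_MAP when left as None, extract_charts follows extract_tables when unset, and the table_output_format default moves from "pseudo_markdown" to "markdown". A hedged sketch of the new construction path, under those assumptions:

    from nv_ingest_client.primitives.tasks.extract import ExtractTask

    task = ExtractTask(
        document_type="pdf",   # string or enum; normalized internally
        extract_text=True,
        extract_tables=True,   # extract_charts inherits this value when left as None
    )
    payload = task.to_dict()
    # payload["task_properties"]["method"] holds the per-type default (e.g. "pdfium" for PDF),
    # and params["table_output_format"] is now "markdown" unless overridden.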
nv_ingest_client/primitives/tasks/filter.py
@@ -11,31 +11,13 @@ from typing import Dict
 from typing import Literal
 from typing import Union
 
-from pydantic import BaseModel, field_validator
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
 
 from .task_base import Task
 
 logger = logging.getLogger(__name__)
 
 
-class FilterTaskSchema(BaseModel):
-    content_type: str = "image"
-    min_size: int = 128
-    max_aspect_ratio: Union[float, int] = 5.0
-    min_aspect_ratio: Union[float, int] = 0.2
-    filter: bool = False
-
-    @field_validator("content_type")
-    def content_type_must_be_valid(cls, v):
-        valid_criteria = ["image"]
-        if v not in valid_criteria:
-            raise ValueError(f"content_type must be one of {valid_criteria}")
-        return v
-
-    class Config:
-        extra = "forbid"
-
-
 class FilterTask(Task):
     """
     Object for document filter task
@@ -49,17 +31,29 @@ class FilterTask(Task):
         min_size: int = 128,
         max_aspect_ratio: Union[int, float] = 5.0,
         min_aspect_ratio: Union[int, float] = 0.2,
-        filter: bool = False,
+        filter: bool = True,
     ) -> None:
         """
-        Setup Split Task Config
+        Setup Filter Task Config
         """
         super().__init__()
-        self._content_type = content_type
-        self._min_size = min_size
-        self._max_aspect_ratio = max_aspect_ratio
-        self._min_aspect_ratio = min_aspect_ratio
-        self._filter = filter
+
+        # Use the API schema for validation
+        validated_data = IngestTaskFilterSchema(
+            content_type=content_type,
+            params={
+                "min_size": min_size,
+                "max_aspect_ratio": max_aspect_ratio,
+                "min_aspect_ratio": min_aspect_ratio,
+                "filter": filter,
+            },
+        )
+
+        self._content_type = validated_data.content_type
+        self._min_size = validated_data.params.min_size
+        self._max_aspect_ratio = validated_data.params.max_aspect_ratio
+        self._min_aspect_ratio = validated_data.params.min_aspect_ratio
+        self._filter = validated_data.params.filter
 
     def __str__(self) -> str:
         """
@@ -67,7 +61,7 @@ class FilterTask(Task):
         """
         info = ""
         info += "Filter Task:\n"
-        info += f" content_type: {self._content_type}\n"
+        info += f" content_type: {self._content_type.value}\n"
         info += f" min_size: {self._min_size}\n"
         info += f" max_aspect_ratio: {self._max_aspect_ratio}\n"
         info += f" min_aspect_ratio: {self._min_aspect_ratio}\n"
@@ -86,7 +80,7 @@ class FilterTask(Task):
         }
 
         task_properties = {
-            "content_type": self._content_type,
+            "content_type": self._content_type.value,
             "params": filter_params,
         }
 
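
FilterTask keeps the same keyword arguments but now routes them through IngestTaskFilterSchema, stores content_type as an enum (hence the .value accesses above), and flips the default of filter from False to True. A short sketch under those assumptions; pass filter=False explicitly to keep the old behavior:

    from nv_ingest_client.primitives.tasks.filter import FilterTask

    task = FilterTask(content_type="image", min_size=128)  # filter now defaults to True
    print(task.to_dict())  # task_properties carries content_type plus the size/aspect-ratio params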
nv_ingest_client/primitives/tasks/infographic_extraction.py
@@ -9,44 +9,47 @@
 import logging
 from typing import Dict
 
-from pydantic import BaseModel
-
-from .task_base import Task
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
+from nv_ingest_client.primitives.tasks.task_base import Task
 
 logger = logging.getLogger(__name__)
 
 
-class InfographicExtractionSchema(BaseModel):
-    class Config:
-        extra = "forbid"
-
-
 class InfographicExtractionTask(Task):
     """
     Object for infographic extraction task
     """
 
-    def __init__(self) -> None:
+    def __init__(self, params: dict = None) -> None:
         """
-        Setup Dedup Task Config
+        Setup Infographic Extraction Task Config
         """
         super().__init__()
 
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+
+        # Use the API schema for validation
+        validated_data = IngestTaskInfographicExtraction(params=params)
+
+        self._params = validated_data.params
+
     def __str__(self) -> str:
         """
         Returns a string with the object's config and run time state
         """
         info = ""
-        info += "infographic extraction task\n"
+        info += "Infographic Extraction Task:\n"
+        info += f" params: {self._params}\n"
         return info
 
     def to_dict(self) -> Dict:
         """
         Convert to a dict for submission to redis
         """
-
         task_properties = {
-            "params": {},
+            "params": self._params,
         }
 
         return {"type": "infographic_data_extract", "task_properties": task_properties}
nv_ingest_client/primitives/tasks/split.py
@@ -8,25 +8,14 @@
 
 import logging
 from typing import Dict
-from typing import Optional
 
-from pydantic import BaseModel
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
 
 from .task_base import Task
 
 logger = logging.getLogger(__name__)
 
 
-class SplitTaskSchema(BaseModel):
-    tokenizer: Optional[str] = None
-    chunk_size: int = 1024
-    chunk_overlap: int = 150
-    params: dict = {}
-
-    class Config:
-        extra = "forbid"
-
-
 class SplitTask(Task):
     """
     Object for document splitting task
@@ -37,16 +26,26 @@ class SplitTask(Task):
         tokenizer: str = None,
         chunk_size: int = 1024,
         chunk_overlap: int = 150,
-        params: dict = {},
-    ) -> None:
+        params: dict = None,
+    ):
         """
         Setup Split Task Config
         """
         super().__init__()
-        self._tokenizer = tokenizer
-        self._chunk_size = chunk_size
-        self._chunk_overlap = chunk_overlap
-        self._params = params
+
+        # Handle None params by converting to empty dict for backward compatibility
+        if params is None:
+            params = {}
+
+        # Use the API schema for validation
+        validated_data = IngestTaskSplitSchema(
+            tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, params=params
+        )
+
+        self._tokenizer = validated_data.tokenizer
+        self._chunk_size = validated_data.chunk_size
+        self._chunk_overlap = validated_data.chunk_overlap
+        self._params = validated_data.params
 
     def __str__(self) -> str:
         """