nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +561 -207
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +466 -59
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +58 -6
- nv_ingest_client/primitives/jobs/job_spec.py +32 -10
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +37 -76
- nv_ingest_client/primitives/tasks/extract.py +68 -169
- nv_ingest_client/primitives/tasks/filter.py +22 -28
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +10 -11
- nv_ingest_client/primitives/tasks/udf.py +349 -0
- nv_ingest_client/util/dataset.py +8 -2
- nv_ingest_client/util/document_analysis.py +314 -0
- nv_ingest_client/util/image_disk_utils.py +300 -0
- nv_ingest_client/util/transport.py +12 -6
- nv_ingest_client/util/util.py +66 -0
- nv_ingest_client/util/vdb/milvus.py +220 -75
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
- nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
|
@@ -7,82 +7,15 @@
|
|
|
7
7
|
import logging
|
|
8
8
|
from typing import Any
|
|
9
9
|
from typing import Dict
|
|
10
|
-
from typing import Literal
|
|
11
10
|
from typing import Optional
|
|
12
|
-
from typing import Type
|
|
13
11
|
|
|
14
|
-
from
|
|
15
|
-
from pydantic import ConfigDict
|
|
16
|
-
from pydantic import model_validator
|
|
12
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
|
|
17
13
|
|
|
18
14
|
from .task_base import Task
|
|
19
15
|
|
|
20
16
|
logger = logging.getLogger(__name__)
|
|
21
17
|
|
|
22
18
|
|
|
23
|
-
class EmbedTaskSchema(BaseModel):
|
|
24
|
-
"""
|
|
25
|
-
Schema for embed task configuration.
|
|
26
|
-
|
|
27
|
-
This schema contains configuration details for an embedding task,
|
|
28
|
-
including the endpoint URL, model name, API key, and error filtering flag.
|
|
29
|
-
|
|
30
|
-
Attributes
|
|
31
|
-
----------
|
|
32
|
-
endpoint_url : Optional[str]
|
|
33
|
-
URL of the embedding endpoint. Default is None.
|
|
34
|
-
model_name : Optional[str]
|
|
35
|
-
Name of the embedding model. Default is None.
|
|
36
|
-
api_key : Optional[str]
|
|
37
|
-
API key for authentication with the embedding service. Default is None.
|
|
38
|
-
filter_errors : bool
|
|
39
|
-
Flag to indicate whether errors should be filtered. Default is False.
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
endpoint_url: Optional[str] = None
|
|
43
|
-
model_name: Optional[str] = None
|
|
44
|
-
api_key: Optional[str] = None
|
|
45
|
-
filter_errors: bool = False
|
|
46
|
-
|
|
47
|
-
text_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
|
|
48
|
-
image_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
|
|
49
|
-
structured_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
|
|
50
|
-
audio_elements_modality: Optional[Literal["text"]] = None
|
|
51
|
-
|
|
52
|
-
@model_validator(mode="before")
|
|
53
|
-
def handle_deprecated_fields(cls: Type["EmbedTaskSchema"], values: Dict[str, Any]) -> Dict[str, Any]:
|
|
54
|
-
"""
|
|
55
|
-
Handle deprecated fields before model validation.
|
|
56
|
-
|
|
57
|
-
This validator checks for the presence of deprecated keys ('text' and 'tables')
|
|
58
|
-
in the input dictionary and removes them. Warnings are issued if these keys are found.
|
|
59
|
-
|
|
60
|
-
Parameters
|
|
61
|
-
----------
|
|
62
|
-
values : Dict[str, Any]
|
|
63
|
-
Input dictionary of model values.
|
|
64
|
-
|
|
65
|
-
Returns
|
|
66
|
-
-------
|
|
67
|
-
Dict[str, Any]
|
|
68
|
-
The updated dictionary with deprecated fields removed.
|
|
69
|
-
"""
|
|
70
|
-
if "text" in values:
|
|
71
|
-
logger.warning(
|
|
72
|
-
"'text' parameter is deprecated and will be ignored. Future versions will remove this argument."
|
|
73
|
-
)
|
|
74
|
-
values.pop("text")
|
|
75
|
-
if "tables" in values:
|
|
76
|
-
logger.warning(
|
|
77
|
-
"'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
|
|
78
|
-
)
|
|
79
|
-
values.pop("tables")
|
|
80
|
-
return values
|
|
81
|
-
|
|
82
|
-
model_config = ConfigDict(extra="forbid")
|
|
83
|
-
model_config["protected_namespaces"] = ()
|
|
84
|
-
|
|
85
|
-
|
|
86
19
|
class EmbedTask(Task):
|
|
87
20
|
"""
|
|
88
21
|
Object for document embedding tasks.
|
|
@@ -103,6 +36,8 @@ class EmbedTask(Task):
|
|
|
103
36
|
image_elements_modality: Optional[str] = None,
|
|
104
37
|
structured_elements_modality: Optional[str] = None,
|
|
105
38
|
audio_elements_modality: Optional[str] = None,
|
|
39
|
+
custom_content_field: Optional[str] = None,
|
|
40
|
+
result_target_field: Optional[str] = None,
|
|
106
41
|
) -> None:
|
|
107
42
|
"""
|
|
108
43
|
Initialize the EmbedTask configuration.
|
|
@@ -133,14 +68,30 @@ class EmbedTask(Task):
|
|
|
133
68
|
"'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
|
|
134
69
|
)
|
|
135
70
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
71
|
+
# Use the API schema for validation
|
|
72
|
+
validated_data = IngestTaskEmbedSchema(
|
|
73
|
+
endpoint_url=endpoint_url,
|
|
74
|
+
model_name=model_name,
|
|
75
|
+
api_key=api_key,
|
|
76
|
+
filter_errors=filter_errors,
|
|
77
|
+
text_elements_modality=text_elements_modality,
|
|
78
|
+
image_elements_modality=image_elements_modality,
|
|
79
|
+
structured_elements_modality=structured_elements_modality,
|
|
80
|
+
audio_elements_modality=audio_elements_modality,
|
|
81
|
+
custom_content_field=custom_content_field,
|
|
82
|
+
result_target_field=result_target_field,
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
self._endpoint_url = validated_data.endpoint_url
|
|
86
|
+
self._model_name = validated_data.model_name
|
|
87
|
+
self._api_key = validated_data.api_key
|
|
88
|
+
self._filter_errors = validated_data.filter_errors
|
|
89
|
+
self._text_elements_modality = validated_data.text_elements_modality
|
|
90
|
+
self._image_elements_modality = validated_data.image_elements_modality
|
|
91
|
+
self._structured_elements_modality = validated_data.structured_elements_modality
|
|
92
|
+
self._audio_elements_modality = validated_data.audio_elements_modality
|
|
93
|
+
self._custom_content_field = validated_data.custom_content_field
|
|
94
|
+
self._result_target_field = validated_data.result_target_field
|
|
144
95
|
|
|
145
96
|
def __str__(self) -> str:
|
|
146
97
|
"""
|
|
@@ -169,6 +120,10 @@ class EmbedTask(Task):
|
|
|
169
120
|
info += f" structured_elements_modality: {self._structured_elements_modality}\n"
|
|
170
121
|
if self._audio_elements_modality:
|
|
171
122
|
info += f" audio_elements_modality: {self._audio_elements_modality}\n"
|
|
123
|
+
if self._custom_content_field:
|
|
124
|
+
info += f" custom_content_field: {self._custom_content_field}\n"
|
|
125
|
+
if self._result_target_field:
|
|
126
|
+
info += f" result_target_field: {self.result_target_field}\n"
|
|
172
127
|
return info
|
|
173
128
|
|
|
174
129
|
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -204,4 +159,10 @@ class EmbedTask(Task):
|
|
|
204
159
|
if self._audio_elements_modality:
|
|
205
160
|
task_properties["audio_elements_modality"] = self._audio_elements_modality
|
|
206
161
|
|
|
162
|
+
if self._custom_content_field:
|
|
163
|
+
task_properties["custom_content_field"] = self._custom_content_field
|
|
164
|
+
|
|
165
|
+
if self._result_target_field:
|
|
166
|
+
task_properties["result_target_field"] = self.result_target_field
|
|
167
|
+
|
|
207
168
|
return {"type": "embed", "task_properties": task_properties}
|
|
@@ -12,12 +12,8 @@ from typing import Any
|
|
|
12
12
|
from typing import Dict
|
|
13
13
|
from typing import Literal
|
|
14
14
|
from typing import Optional
|
|
15
|
-
from typing import get_args
|
|
16
15
|
|
|
17
|
-
from
|
|
18
|
-
from pydantic import ConfigDict
|
|
19
|
-
from pydantic import field_validator
|
|
20
|
-
from pydantic import model_validator
|
|
16
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
|
|
21
17
|
|
|
22
18
|
from .task_base import Task
|
|
23
19
|
|
|
@@ -64,142 +60,9 @@ _Type_Extract_Method_PDF = Literal[
|
|
|
64
60
|
"unstructured_io",
|
|
65
61
|
]
|
|
66
62
|
|
|
67
|
-
|
|
63
|
+
_Type_Extract_Images_Method = Literal["group", "yolox"]
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
_Type_Extract_Method_Image = Literal["image"]
|
|
72
|
-
|
|
73
|
-
_Type_Extract_Method_Audio = Literal["audio"]
|
|
74
|
-
|
|
75
|
-
_Type_Extract_Method_Text = Literal["txt"]
|
|
76
|
-
|
|
77
|
-
_Type_Extract_Method_Html = Literal["markitdown"]
|
|
78
|
-
|
|
79
|
-
_Type_Extract_Method_Map = {
|
|
80
|
-
"bmp": get_args(_Type_Extract_Method_Image),
|
|
81
|
-
"docx": get_args(_Type_Extract_Method_DOCX),
|
|
82
|
-
"html": get_args(_Type_Extract_Method_Html),
|
|
83
|
-
"jpeg": get_args(_Type_Extract_Method_Image),
|
|
84
|
-
"jpg": get_args(_Type_Extract_Method_Image),
|
|
85
|
-
"pdf": get_args(_Type_Extract_Method_PDF),
|
|
86
|
-
"png": get_args(_Type_Extract_Method_Image),
|
|
87
|
-
"pptx": get_args(_Type_Extract_Method_PPTX),
|
|
88
|
-
"text": get_args(_Type_Extract_Method_Text),
|
|
89
|
-
"tiff": get_args(_Type_Extract_Method_Image),
|
|
90
|
-
"txt": get_args(_Type_Extract_Method_Text),
|
|
91
|
-
"mp3": get_args(_Type_Extract_Method_Audio),
|
|
92
|
-
"wav": get_args(_Type_Extract_Method_Audio),
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium", "nemoretriever_parse"]
|
|
96
|
-
|
|
97
|
-
_Type_Extract_Tables_Method_DOCX = Literal["python_docx",]
|
|
98
|
-
|
|
99
|
-
_Type_Extract_Tables_Method_PPTX = Literal["python_pptx",]
|
|
100
|
-
|
|
101
|
-
_Type_Extract_Tables_Method_Map = {
|
|
102
|
-
"pdf": get_args(_Type_Extract_Tables_Method_PDF),
|
|
103
|
-
"docx": get_args(_Type_Extract_Tables_Method_DOCX),
|
|
104
|
-
"pptx": get_args(_Type_Extract_Tables_Method_PPTX),
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
_Type_Extract_Images_Method = Literal["simple", "group"]
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
class ExtractTaskSchema(BaseModel):
|
|
111
|
-
document_type: str
|
|
112
|
-
extract_method: str = None # Initially allow None to set a smart default
|
|
113
|
-
extract_text: bool = True
|
|
114
|
-
extract_images: bool = True
|
|
115
|
-
extract_images_method: str = "group"
|
|
116
|
-
extract_images_params: Optional[Dict[str, Any]] = None
|
|
117
|
-
extract_tables: bool = True
|
|
118
|
-
extract_tables_method: str = "yolox"
|
|
119
|
-
extract_charts: Optional[bool] = None # Initially allow None to set a smart default
|
|
120
|
-
extract_infographics: bool = False
|
|
121
|
-
extract_page_as_image: bool = False
|
|
122
|
-
extract_audio_params: Optional[Dict[str, Any]] = None
|
|
123
|
-
text_depth: str = "document"
|
|
124
|
-
paddle_output_format: Optional[str] = None
|
|
125
|
-
table_output_format: str = "pseudo_markdown"
|
|
126
|
-
|
|
127
|
-
@model_validator(mode="after")
|
|
128
|
-
@classmethod
|
|
129
|
-
def set_default_extract_method(cls, values):
|
|
130
|
-
document_type = values.document_type.lower() # Ensure case-insensitive comparison
|
|
131
|
-
extract_method = values.extract_method
|
|
132
|
-
paddle_output_format = values.paddle_output_format
|
|
133
|
-
|
|
134
|
-
if document_type not in _DEFAULT_EXTRACTOR_MAP:
|
|
135
|
-
raise ValueError(
|
|
136
|
-
f"Unsupported document type: {document_type}."
|
|
137
|
-
f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
if extract_method is None:
|
|
141
|
-
values.extract_method = _DEFAULT_EXTRACTOR_MAP[document_type]
|
|
142
|
-
|
|
143
|
-
if paddle_output_format is not None:
|
|
144
|
-
logger.warning(
|
|
145
|
-
"`paddle_output_format` is deprecated and will be removed in a future release. "
|
|
146
|
-
"Please use `table_output_format` instead."
|
|
147
|
-
)
|
|
148
|
-
values.table_output_format = paddle_output_format
|
|
149
|
-
|
|
150
|
-
return values
|
|
151
|
-
|
|
152
|
-
@field_validator("extract_charts")
|
|
153
|
-
def set_default_extract_charts(cls, v, values):
|
|
154
|
-
# `extract_charts` is initially set to None for backward compatibility.
|
|
155
|
-
# {extract_tables: true, extract_charts: None} or {extract_tables: true, extract_charts: true} enables both
|
|
156
|
-
# table and chart extraction.
|
|
157
|
-
# {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
|
|
158
|
-
extract_charts = v
|
|
159
|
-
if extract_charts is None:
|
|
160
|
-
extract_charts = values.data.get("extract_tables")
|
|
161
|
-
|
|
162
|
-
return extract_charts
|
|
163
|
-
|
|
164
|
-
@field_validator("extract_method")
|
|
165
|
-
def extract_method_must_be_valid(cls, v, values, **kwargs):
|
|
166
|
-
document_type = values.data.get("document_type", "").lower() # Ensure case-insensitive comparison
|
|
167
|
-
|
|
168
|
-
# Skip validation for text-like types, since they do not have 'extract' stages.
|
|
169
|
-
if document_type in ["txt", "text", "json", "md", "sh"]:
|
|
170
|
-
return
|
|
171
|
-
|
|
172
|
-
valid_methods = set(_Type_Extract_Method_Map[document_type])
|
|
173
|
-
if v not in valid_methods:
|
|
174
|
-
raise ValueError(f"extract_method must be one of {valid_methods}")
|
|
175
|
-
|
|
176
|
-
return v
|
|
177
|
-
|
|
178
|
-
@field_validator("document_type")
|
|
179
|
-
def document_type_must_be_supported(cls, v):
|
|
180
|
-
if v.lower() not in _DEFAULT_EXTRACTOR_MAP:
|
|
181
|
-
raise ValueError(
|
|
182
|
-
f"Unsupported document type '{v}'. Supported types are: {', '.join(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
|
183
|
-
)
|
|
184
|
-
return v.lower()
|
|
185
|
-
|
|
186
|
-
@field_validator("extract_tables_method")
|
|
187
|
-
def extract_tables_method_must_be_valid(cls, v, values, **kwargs):
|
|
188
|
-
document_type = values.data.get("document_type", "").lower() # Ensure case-insensitive comparison
|
|
189
|
-
valid_methods = set(_Type_Extract_Tables_Method_Map[document_type])
|
|
190
|
-
if v not in valid_methods:
|
|
191
|
-
raise ValueError(f"extract_method must be one of {valid_methods}")
|
|
192
|
-
return v
|
|
193
|
-
|
|
194
|
-
@field_validator("extract_images_method")
|
|
195
|
-
def extract_images_method_must_be_valid(cls, v):
|
|
196
|
-
if v.lower() not in get_args(_Type_Extract_Images_Method):
|
|
197
|
-
raise ValueError(
|
|
198
|
-
f"Unsupported document type '{v}'. Supported types are: {', '.join(_Type_Extract_Images_Method)}"
|
|
199
|
-
)
|
|
200
|
-
return v.lower()
|
|
201
|
-
|
|
202
|
-
model_config = ConfigDict(extra="forbid")
|
|
65
|
+
_Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
|
|
203
66
|
|
|
204
67
|
|
|
205
68
|
class ExtractTask(Task):
|
|
@@ -210,7 +73,7 @@ class ExtractTask(Task):
|
|
|
210
73
|
def __init__(
|
|
211
74
|
self,
|
|
212
75
|
document_type,
|
|
213
|
-
extract_method: _Type_Extract_Method_PDF =
|
|
76
|
+
extract_method: _Type_Extract_Method_PDF = None,
|
|
214
77
|
extract_text: bool = False,
|
|
215
78
|
extract_images: bool = False,
|
|
216
79
|
extract_tables: bool = False,
|
|
@@ -223,26 +86,69 @@ class ExtractTask(Task):
|
|
|
223
86
|
extract_page_as_image: bool = False,
|
|
224
87
|
text_depth: str = "document",
|
|
225
88
|
paddle_output_format: str = "pseudo_markdown",
|
|
226
|
-
table_output_format: str = "
|
|
89
|
+
table_output_format: str = "markdown",
|
|
227
90
|
) -> None:
|
|
228
91
|
"""
|
|
229
92
|
Setup Extract Task Config
|
|
230
93
|
"""
|
|
231
94
|
super().__init__()
|
|
232
95
|
|
|
233
|
-
|
|
96
|
+
# Set default extract_method if None
|
|
97
|
+
if extract_method is None:
|
|
98
|
+
# Handle both string and enum inputs
|
|
99
|
+
if hasattr(document_type, "value"):
|
|
100
|
+
document_type_str = document_type.value
|
|
101
|
+
else:
|
|
102
|
+
document_type_str = document_type
|
|
103
|
+
document_type_lower = document_type_str.lower()
|
|
104
|
+
if document_type_lower not in _DEFAULT_EXTRACTOR_MAP:
|
|
105
|
+
raise ValueError(
|
|
106
|
+
f"Unsupported document type: {document_type}."
|
|
107
|
+
f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
|
108
|
+
)
|
|
109
|
+
extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
|
|
110
|
+
|
|
111
|
+
# Set default extract_charts if None
|
|
112
|
+
if extract_charts is None:
|
|
113
|
+
extract_charts = extract_tables
|
|
114
|
+
|
|
115
|
+
# Build params dict for API schema validation
|
|
116
|
+
extract_params = {
|
|
117
|
+
"extract_text": extract_text,
|
|
118
|
+
"extract_images": extract_images,
|
|
119
|
+
"extract_images_method": extract_images_method,
|
|
120
|
+
"extract_tables": extract_tables,
|
|
121
|
+
"extract_tables_method": extract_tables_method,
|
|
122
|
+
"extract_charts": extract_charts,
|
|
123
|
+
"extract_infographics": extract_infographics,
|
|
124
|
+
"extract_page_as_image": extract_page_as_image,
|
|
125
|
+
"text_depth": text_depth,
|
|
126
|
+
"table_output_format": table_output_format,
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
# Add optional parameters if provided
|
|
130
|
+
if extract_images_params:
|
|
131
|
+
extract_params["extract_images_params"] = extract_images_params
|
|
132
|
+
if extract_audio_params:
|
|
133
|
+
extract_params["extract_audio_params"] = extract_audio_params
|
|
134
|
+
|
|
135
|
+
# Use the API schema for validation
|
|
136
|
+
validated_data = IngestTaskExtractSchema(
|
|
137
|
+
document_type=document_type,
|
|
138
|
+
method=extract_method,
|
|
139
|
+
params=extract_params,
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
# Store validated data
|
|
143
|
+
self._document_type = validated_data.document_type
|
|
144
|
+
self._extract_method = validated_data.method
|
|
234
145
|
self._extract_audio_params = extract_audio_params
|
|
235
146
|
self._extract_images = extract_images
|
|
236
|
-
self._extract_method = extract_method
|
|
237
147
|
self._extract_tables = extract_tables
|
|
238
148
|
self._extract_images_method = extract_images_method
|
|
239
149
|
self._extract_images_params = extract_images_params
|
|
240
150
|
self._extract_tables_method = extract_tables_method
|
|
241
|
-
|
|
242
|
-
# {extract_tables: true, extract_charts: None} or {extract_tables: true, extract-charts: true} enables both
|
|
243
|
-
# table and chart extraction.
|
|
244
|
-
# {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
|
|
245
|
-
self._extract_charts = extract_charts if extract_charts is not None else extract_tables
|
|
151
|
+
self._extract_charts = extract_charts
|
|
246
152
|
self._extract_infographics = extract_infographics
|
|
247
153
|
self._extract_page_as_image = extract_page_as_image
|
|
248
154
|
self._extract_text = extract_text
|
|
@@ -256,34 +162,27 @@ class ExtractTask(Task):
|
|
|
256
162
|
"""
|
|
257
163
|
info = ""
|
|
258
164
|
info += "Extract Task:\n"
|
|
259
|
-
info += f"
|
|
260
|
-
info += f"
|
|
261
|
-
info += f"
|
|
262
|
-
info += f"
|
|
263
|
-
info += f"
|
|
264
|
-
info += f"
|
|
265
|
-
info += f"
|
|
266
|
-
info += f"
|
|
267
|
-
info += f"
|
|
268
|
-
info += f" extract tables method: {self._extract_tables_method}\n"
|
|
269
|
-
info += f" text depth: {self._text_depth}\n"
|
|
165
|
+
info += f" document_type: {self._document_type.value}\n"
|
|
166
|
+
info += f" extract_method: {self._extract_method}\n"
|
|
167
|
+
info += f" extract_text: {self._extract_text}\n"
|
|
168
|
+
info += f" extract_images: {self._extract_images}\n"
|
|
169
|
+
info += f" extract_tables: {self._extract_tables}\n"
|
|
170
|
+
info += f" extract_charts: {self._extract_charts}\n"
|
|
171
|
+
info += f" extract_infographics: {self._extract_infographics}\n"
|
|
172
|
+
info += f" extract_page_as_image: {self._extract_page_as_image}\n"
|
|
173
|
+
info += f" text_depth: {self._text_depth}\n"
|
|
270
174
|
info += f" table_output_format: {self._table_output_format}\n"
|
|
271
|
-
|
|
272
|
-
if self._extract_images_params:
|
|
273
|
-
info += f" extract images params: {self._extract_images_params}\n"
|
|
274
|
-
if self._extract_audio_params:
|
|
275
|
-
info += f" extract audio params: {self._extract_audio_params}\n"
|
|
276
175
|
return info
|
|
277
176
|
|
|
278
177
|
def to_dict(self) -> Dict:
|
|
279
178
|
"""
|
|
280
|
-
Convert to a dict for submission to redis
|
|
179
|
+
Convert to a dict for submission to redis
|
|
281
180
|
"""
|
|
282
181
|
extract_params = {
|
|
283
182
|
"extract_text": self._extract_text,
|
|
284
183
|
"extract_images": self._extract_images,
|
|
285
|
-
"extract_tables": self._extract_tables,
|
|
286
184
|
"extract_images_method": self._extract_images_method,
|
|
185
|
+
"extract_tables": self._extract_tables,
|
|
287
186
|
"extract_tables_method": self._extract_tables_method,
|
|
288
187
|
"extract_charts": self._extract_charts,
|
|
289
188
|
"extract_infographics": self._extract_infographics,
|
|
@@ -306,7 +205,7 @@ class ExtractTask(Task):
|
|
|
306
205
|
|
|
307
206
|
task_properties = {
|
|
308
207
|
"method": self._extract_method,
|
|
309
|
-
"document_type": self._document_type,
|
|
208
|
+
"document_type": self._document_type.value,
|
|
310
209
|
"params": extract_params,
|
|
311
210
|
}
|
|
312
211
|
|
|
@@ -339,4 +238,4 @@ class ExtractTask(Task):
|
|
|
339
238
|
|
|
340
239
|
@property
|
|
341
240
|
def document_type(self):
|
|
342
|
-
return self._document_type
|
|
241
|
+
return self._document_type.value
|
|
@@ -11,31 +11,13 @@ from typing import Dict
|
|
|
11
11
|
from typing import Literal
|
|
12
12
|
from typing import Union
|
|
13
13
|
|
|
14
|
-
from
|
|
14
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
|
|
15
15
|
|
|
16
16
|
from .task_base import Task
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
class FilterTaskSchema(BaseModel):
|
|
22
|
-
content_type: str = "image"
|
|
23
|
-
min_size: int = 128
|
|
24
|
-
max_aspect_ratio: Union[float, int] = 5.0
|
|
25
|
-
min_aspect_ratio: Union[float, int] = 0.2
|
|
26
|
-
filter: bool = False
|
|
27
|
-
|
|
28
|
-
@field_validator("content_type")
|
|
29
|
-
def content_type_must_be_valid(cls, v):
|
|
30
|
-
valid_criteria = ["image"]
|
|
31
|
-
if v not in valid_criteria:
|
|
32
|
-
raise ValueError(f"content_type must be one of {valid_criteria}")
|
|
33
|
-
return v
|
|
34
|
-
|
|
35
|
-
class Config:
|
|
36
|
-
extra = "forbid"
|
|
37
|
-
|
|
38
|
-
|
|
39
21
|
class FilterTask(Task):
|
|
40
22
|
"""
|
|
41
23
|
Object for document filter task
|
|
@@ -49,17 +31,29 @@ class FilterTask(Task):
|
|
|
49
31
|
min_size: int = 128,
|
|
50
32
|
max_aspect_ratio: Union[int, float] = 5.0,
|
|
51
33
|
min_aspect_ratio: Union[int, float] = 0.2,
|
|
52
|
-
filter: bool =
|
|
34
|
+
filter: bool = True,
|
|
53
35
|
) -> None:
|
|
54
36
|
"""
|
|
55
|
-
Setup
|
|
37
|
+
Setup Filter Task Config
|
|
56
38
|
"""
|
|
57
39
|
super().__init__()
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
40
|
+
|
|
41
|
+
# Use the API schema for validation
|
|
42
|
+
validated_data = IngestTaskFilterSchema(
|
|
43
|
+
content_type=content_type,
|
|
44
|
+
params={
|
|
45
|
+
"min_size": min_size,
|
|
46
|
+
"max_aspect_ratio": max_aspect_ratio,
|
|
47
|
+
"min_aspect_ratio": min_aspect_ratio,
|
|
48
|
+
"filter": filter,
|
|
49
|
+
},
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
self._content_type = validated_data.content_type
|
|
53
|
+
self._min_size = validated_data.params.min_size
|
|
54
|
+
self._max_aspect_ratio = validated_data.params.max_aspect_ratio
|
|
55
|
+
self._min_aspect_ratio = validated_data.params.min_aspect_ratio
|
|
56
|
+
self._filter = validated_data.params.filter
|
|
63
57
|
|
|
64
58
|
def __str__(self) -> str:
|
|
65
59
|
"""
|
|
@@ -67,7 +61,7 @@ class FilterTask(Task):
|
|
|
67
61
|
"""
|
|
68
62
|
info = ""
|
|
69
63
|
info += "Filter Task:\n"
|
|
70
|
-
info += f" content_type: {self._content_type}\n"
|
|
64
|
+
info += f" content_type: {self._content_type.value}\n"
|
|
71
65
|
info += f" min_size: {self._min_size}\n"
|
|
72
66
|
info += f" max_aspect_ratio: {self._max_aspect_ratio}\n"
|
|
73
67
|
info += f" min_aspect_ratio: {self._min_aspect_ratio}\n"
|
|
@@ -86,7 +80,7 @@ class FilterTask(Task):
|
|
|
86
80
|
}
|
|
87
81
|
|
|
88
82
|
task_properties = {
|
|
89
|
-
"content_type": self._content_type,
|
|
83
|
+
"content_type": self._content_type.value,
|
|
90
84
|
"params": filter_params,
|
|
91
85
|
}
|
|
92
86
|
|
|
@@ -9,44 +9,47 @@
|
|
|
9
9
|
import logging
|
|
10
10
|
from typing import Dict
|
|
11
11
|
|
|
12
|
-
from
|
|
13
|
-
|
|
14
|
-
from .task_base import Task
|
|
12
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
|
|
13
|
+
from nv_ingest_client.primitives.tasks.task_base import Task
|
|
15
14
|
|
|
16
15
|
logger = logging.getLogger(__name__)
|
|
17
16
|
|
|
18
17
|
|
|
19
|
-
class InfographicExtractionSchema(BaseModel):
|
|
20
|
-
class Config:
|
|
21
|
-
extra = "forbid"
|
|
22
|
-
|
|
23
|
-
|
|
24
18
|
class InfographicExtractionTask(Task):
|
|
25
19
|
"""
|
|
26
20
|
Object for infographic extraction task
|
|
27
21
|
"""
|
|
28
22
|
|
|
29
|
-
def __init__(self) -> None:
|
|
23
|
+
def __init__(self, params: dict = None) -> None:
|
|
30
24
|
"""
|
|
31
|
-
Setup
|
|
25
|
+
Setup Infographic Extraction Task Config
|
|
32
26
|
"""
|
|
33
27
|
super().__init__()
|
|
34
28
|
|
|
29
|
+
# Handle None params by converting to empty dict for backward compatibility
|
|
30
|
+
if params is None:
|
|
31
|
+
params = {}
|
|
32
|
+
|
|
33
|
+
# Use the API schema for validation
|
|
34
|
+
validated_data = IngestTaskInfographicExtraction(params=params)
|
|
35
|
+
|
|
36
|
+
self._params = validated_data.params
|
|
37
|
+
|
|
35
38
|
def __str__(self) -> str:
|
|
36
39
|
"""
|
|
37
40
|
Returns a string with the object's config and run time state
|
|
38
41
|
"""
|
|
39
42
|
info = ""
|
|
40
|
-
info += "
|
|
43
|
+
info += "Infographic Extraction Task:\n"
|
|
44
|
+
info += f" params: {self._params}\n"
|
|
41
45
|
return info
|
|
42
46
|
|
|
43
47
|
def to_dict(self) -> Dict:
|
|
44
48
|
"""
|
|
45
49
|
Convert to a dict for submission to redis
|
|
46
50
|
"""
|
|
47
|
-
|
|
48
51
|
task_properties = {
|
|
49
|
-
"params":
|
|
52
|
+
"params": self._params,
|
|
50
53
|
}
|
|
51
54
|
|
|
52
55
|
return {"type": "infographic_data_extract", "task_properties": task_properties}
|
|
@@ -8,25 +8,14 @@
|
|
|
8
8
|
|
|
9
9
|
import logging
|
|
10
10
|
from typing import Dict
|
|
11
|
-
from typing import Optional
|
|
12
11
|
|
|
13
|
-
from
|
|
12
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
|
|
14
13
|
|
|
15
14
|
from .task_base import Task
|
|
16
15
|
|
|
17
16
|
logger = logging.getLogger(__name__)
|
|
18
17
|
|
|
19
18
|
|
|
20
|
-
class SplitTaskSchema(BaseModel):
|
|
21
|
-
tokenizer: Optional[str] = None
|
|
22
|
-
chunk_size: int = 1024
|
|
23
|
-
chunk_overlap: int = 150
|
|
24
|
-
params: dict = {}
|
|
25
|
-
|
|
26
|
-
class Config:
|
|
27
|
-
extra = "forbid"
|
|
28
|
-
|
|
29
|
-
|
|
30
19
|
class SplitTask(Task):
|
|
31
20
|
"""
|
|
32
21
|
Object for document splitting task
|
|
@@ -37,16 +26,26 @@ class SplitTask(Task):
|
|
|
37
26
|
tokenizer: str = None,
|
|
38
27
|
chunk_size: int = 1024,
|
|
39
28
|
chunk_overlap: int = 150,
|
|
40
|
-
params: dict =
|
|
41
|
-
)
|
|
29
|
+
params: dict = None,
|
|
30
|
+
):
|
|
42
31
|
"""
|
|
43
32
|
Setup Split Task Config
|
|
44
33
|
"""
|
|
45
34
|
super().__init__()
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
35
|
+
|
|
36
|
+
# Handle None params by converting to empty dict for backward compatibility
|
|
37
|
+
if params is None:
|
|
38
|
+
params = {}
|
|
39
|
+
|
|
40
|
+
# Use the API schema for validation
|
|
41
|
+
validated_data = IngestTaskSplitSchema(
|
|
42
|
+
tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, params=params
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
self._tokenizer = validated_data.tokenizer
|
|
46
|
+
self._chunk_size = validated_data.chunk_size
|
|
47
|
+
self._chunk_overlap = validated_data.chunk_overlap
|
|
48
|
+
self._params = validated_data.params
|
|
50
49
|
|
|
51
50
|
def __str__(self) -> str:
|
|
52
51
|
"""
|