nv-ingest-client 2025.8.13.dev20250813__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/client/interface.py +209 -26
- nv_ingest_client/nv_ingest_cli.py +16 -0
- nv_ingest_client/primitives/jobs/job_spec.py +29 -9
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +21 -76
- nv_ingest_client/primitives/tasks/extract.py +67 -168
- nv_ingest_client/primitives/tasks/filter.py +21 -27
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +2 -0
- nv_ingest_client/primitives/tasks/udf.py +352 -0
- nv_ingest_client/util/vdb/milvus.py +1 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/RECORD +25 -27
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.8.13.dev20250813.dist-info → nv_ingest_client-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
|
@@ -12,12 +12,8 @@ from typing import Any
|
|
|
12
12
|
from typing import Dict
|
|
13
13
|
from typing import Literal
|
|
14
14
|
from typing import Optional
|
|
15
|
-
from typing import get_args
|
|
16
15
|
|
|
17
|
-
from pydantic import BaseModel
|
|
18
|
-
from pydantic import ConfigDict
|
|
19
|
-
from pydantic import field_validator
|
|
20
|
-
from pydantic import model_validator
|
|
16
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
|
|
21
17
|
|
|
22
18
|
from .task_base import Task
|
|
23
19
|
|
|
@@ -64,142 +60,9 @@ _Type_Extract_Method_PDF = Literal[
|
|
|
64
60
|
"unstructured_io",
|
|
65
61
|
]
|
|
66
62
|
|
|
67
|
-
|
|
63
|
+
_Type_Extract_Images_Method = Literal["group", "yolox"]
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
_Type_Extract_Method_Image = Literal["image"]
|
|
72
|
-
|
|
73
|
-
_Type_Extract_Method_Audio = Literal["audio"]
|
|
74
|
-
|
|
75
|
-
_Type_Extract_Method_Text = Literal["txt"]
|
|
76
|
-
|
|
77
|
-
_Type_Extract_Method_Html = Literal["markitdown"]
|
|
78
|
-
|
|
79
|
-
_Type_Extract_Method_Map = {
|
|
80
|
-
"bmp": get_args(_Type_Extract_Method_Image),
|
|
81
|
-
"docx": get_args(_Type_Extract_Method_DOCX),
|
|
82
|
-
"html": get_args(_Type_Extract_Method_Html),
|
|
83
|
-
"jpeg": get_args(_Type_Extract_Method_Image),
|
|
84
|
-
"jpg": get_args(_Type_Extract_Method_Image),
|
|
85
|
-
"pdf": get_args(_Type_Extract_Method_PDF),
|
|
86
|
-
"png": get_args(_Type_Extract_Method_Image),
|
|
87
|
-
"pptx": get_args(_Type_Extract_Method_PPTX),
|
|
88
|
-
"text": get_args(_Type_Extract_Method_Text),
|
|
89
|
-
"tiff": get_args(_Type_Extract_Method_Image),
|
|
90
|
-
"txt": get_args(_Type_Extract_Method_Text),
|
|
91
|
-
"mp3": get_args(_Type_Extract_Method_Audio),
|
|
92
|
-
"wav": get_args(_Type_Extract_Method_Audio),
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium", "nemoretriever_parse"]
|
|
96
|
-
|
|
97
|
-
_Type_Extract_Tables_Method_DOCX = Literal["python_docx",]
|
|
98
|
-
|
|
99
|
-
_Type_Extract_Tables_Method_PPTX = Literal["python_pptx",]
|
|
100
|
-
|
|
101
|
-
_Type_Extract_Tables_Method_Map = {
|
|
102
|
-
"pdf": get_args(_Type_Extract_Tables_Method_PDF),
|
|
103
|
-
"docx": get_args(_Type_Extract_Tables_Method_DOCX),
|
|
104
|
-
"pptx": get_args(_Type_Extract_Tables_Method_PPTX),
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
_Type_Extract_Images_Method = Literal["simple", "group"]
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
class ExtractTaskSchema(BaseModel):
|
|
111
|
-
document_type: str
|
|
112
|
-
extract_method: str = None # Initially allow None to set a smart default
|
|
113
|
-
extract_text: bool = True
|
|
114
|
-
extract_images: bool = True
|
|
115
|
-
extract_images_method: str = "group"
|
|
116
|
-
extract_images_params: Optional[Dict[str, Any]] = None
|
|
117
|
-
extract_tables: bool = True
|
|
118
|
-
extract_tables_method: str = "yolox"
|
|
119
|
-
extract_charts: Optional[bool] = None # Initially allow None to set a smart default
|
|
120
|
-
extract_infographics: bool = False
|
|
121
|
-
extract_page_as_image: bool = False
|
|
122
|
-
extract_audio_params: Optional[Dict[str, Any]] = None
|
|
123
|
-
text_depth: str = "document"
|
|
124
|
-
paddle_output_format: Optional[str] = None
|
|
125
|
-
table_output_format: str = "pseudo_markdown"
|
|
126
|
-
|
|
127
|
-
@model_validator(mode="after")
|
|
128
|
-
@classmethod
|
|
129
|
-
def set_default_extract_method(cls, values):
|
|
130
|
-
document_type = values.document_type.lower() # Ensure case-insensitive comparison
|
|
131
|
-
extract_method = values.extract_method
|
|
132
|
-
paddle_output_format = values.paddle_output_format
|
|
133
|
-
|
|
134
|
-
if document_type not in _DEFAULT_EXTRACTOR_MAP:
|
|
135
|
-
raise ValueError(
|
|
136
|
-
f"Unsupported document type: {document_type}."
|
|
137
|
-
f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
if extract_method is None:
|
|
141
|
-
values.extract_method = _DEFAULT_EXTRACTOR_MAP[document_type]
|
|
142
|
-
|
|
143
|
-
if paddle_output_format is not None:
|
|
144
|
-
logger.warning(
|
|
145
|
-
"`paddle_output_format` is deprecated and will be removed in a future release. "
|
|
146
|
-
"Please use `table_output_format` instead."
|
|
147
|
-
)
|
|
148
|
-
values.table_output_format = paddle_output_format
|
|
149
|
-
|
|
150
|
-
return values
|
|
151
|
-
|
|
152
|
-
@field_validator("extract_charts")
|
|
153
|
-
def set_default_extract_charts(cls, v, values):
|
|
154
|
-
# `extract_charts` is initially set to None for backward compatibility.
|
|
155
|
-
# {extract_tables: true, extract_charts: None} or {extract_tables: true, extract_charts: true} enables both
|
|
156
|
-
# table and chart extraction.
|
|
157
|
-
# {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
|
|
158
|
-
extract_charts = v
|
|
159
|
-
if extract_charts is None:
|
|
160
|
-
extract_charts = values.data.get("extract_tables")
|
|
161
|
-
|
|
162
|
-
return extract_charts
|
|
163
|
-
|
|
164
|
-
@field_validator("extract_method")
|
|
165
|
-
def extract_method_must_be_valid(cls, v, values, **kwargs):
|
|
166
|
-
document_type = values.data.get("document_type", "").lower() # Ensure case-insensitive comparison
|
|
167
|
-
|
|
168
|
-
# Skip validation for text-like types, since they do not have 'extract' stages.
|
|
169
|
-
if document_type in ["txt", "text", "json", "md", "sh"]:
|
|
170
|
-
return
|
|
171
|
-
|
|
172
|
-
valid_methods = set(_Type_Extract_Method_Map[document_type])
|
|
173
|
-
if v not in valid_methods:
|
|
174
|
-
raise ValueError(f"extract_method must be one of {valid_methods}")
|
|
175
|
-
|
|
176
|
-
return v
|
|
177
|
-
|
|
178
|
-
@field_validator("document_type")
|
|
179
|
-
def document_type_must_be_supported(cls, v):
|
|
180
|
-
if v.lower() not in _DEFAULT_EXTRACTOR_MAP:
|
|
181
|
-
raise ValueError(
|
|
182
|
-
f"Unsupported document type '{v}'. Supported types are: {', '.join(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
|
183
|
-
)
|
|
184
|
-
return v.lower()
|
|
185
|
-
|
|
186
|
-
@field_validator("extract_tables_method")
|
|
187
|
-
def extract_tables_method_must_be_valid(cls, v, values, **kwargs):
|
|
188
|
-
document_type = values.data.get("document_type", "").lower() # Ensure case-insensitive comparison
|
|
189
|
-
valid_methods = set(_Type_Extract_Tables_Method_Map[document_type])
|
|
190
|
-
if v not in valid_methods:
|
|
191
|
-
raise ValueError(f"extract_method must be one of {valid_methods}")
|
|
192
|
-
return v
|
|
193
|
-
|
|
194
|
-
@field_validator("extract_images_method")
|
|
195
|
-
def extract_images_method_must_be_valid(cls, v):
|
|
196
|
-
if v.lower() not in get_args(_Type_Extract_Images_Method):
|
|
197
|
-
raise ValueError(
|
|
198
|
-
f"Unsupported document type '{v}'. Supported types are: {', '.join(_Type_Extract_Images_Method)}"
|
|
199
|
-
)
|
|
200
|
-
return v.lower()
|
|
201
|
-
|
|
202
|
-
model_config = ConfigDict(extra="forbid")
|
|
65
|
+
_Type_Extract_Tables_Method_PDF = Literal["yolox", "paddle"]
|
|
203
66
|
|
|
204
67
|
|
|
205
68
|
class ExtractTask(Task):
|
|
@@ -210,7 +73,7 @@ class ExtractTask(Task):
|
|
|
210
73
|
def __init__(
|
|
211
74
|
self,
|
|
212
75
|
document_type,
|
|
213
|
-
extract_method: _Type_Extract_Method_PDF =
|
|
76
|
+
extract_method: _Type_Extract_Method_PDF = None,
|
|
214
77
|
extract_text: bool = False,
|
|
215
78
|
extract_images: bool = False,
|
|
216
79
|
extract_tables: bool = False,
|
|
@@ -230,19 +93,62 @@ class ExtractTask(Task):
|
|
|
230
93
|
"""
|
|
231
94
|
super().__init__()
|
|
232
95
|
|
|
233
|
-
|
|
96
|
+
# Set default extract_method if None
|
|
97
|
+
if extract_method is None:
|
|
98
|
+
# Handle both string and enum inputs
|
|
99
|
+
if hasattr(document_type, "value"):
|
|
100
|
+
document_type_str = document_type.value
|
|
101
|
+
else:
|
|
102
|
+
document_type_str = document_type
|
|
103
|
+
document_type_lower = document_type_str.lower()
|
|
104
|
+
if document_type_lower not in _DEFAULT_EXTRACTOR_MAP:
|
|
105
|
+
raise ValueError(
|
|
106
|
+
f"Unsupported document type: {document_type}."
|
|
107
|
+
f" Supported types are: {list(_DEFAULT_EXTRACTOR_MAP.keys())}"
|
|
108
|
+
)
|
|
109
|
+
extract_method = _DEFAULT_EXTRACTOR_MAP[document_type_lower]
|
|
110
|
+
|
|
111
|
+
# Set default extract_charts if None
|
|
112
|
+
if extract_charts is None:
|
|
113
|
+
extract_charts = extract_tables
|
|
114
|
+
|
|
115
|
+
# Build params dict for API schema validation
|
|
116
|
+
extract_params = {
|
|
117
|
+
"extract_text": extract_text,
|
|
118
|
+
"extract_images": extract_images,
|
|
119
|
+
"extract_images_method": extract_images_method,
|
|
120
|
+
"extract_tables": extract_tables,
|
|
121
|
+
"extract_tables_method": extract_tables_method,
|
|
122
|
+
"extract_charts": extract_charts,
|
|
123
|
+
"extract_infographics": extract_infographics,
|
|
124
|
+
"extract_page_as_image": extract_page_as_image,
|
|
125
|
+
"text_depth": text_depth,
|
|
126
|
+
"table_output_format": table_output_format,
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
# Add optional parameters if provided
|
|
130
|
+
if extract_images_params:
|
|
131
|
+
extract_params["extract_images_params"] = extract_images_params
|
|
132
|
+
if extract_audio_params:
|
|
133
|
+
extract_params["extract_audio_params"] = extract_audio_params
|
|
134
|
+
|
|
135
|
+
# Use the API schema for validation
|
|
136
|
+
validated_data = IngestTaskExtractSchema(
|
|
137
|
+
document_type=document_type,
|
|
138
|
+
method=extract_method,
|
|
139
|
+
params=extract_params,
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
# Store validated data
|
|
143
|
+
self._document_type = validated_data.document_type
|
|
144
|
+
self._extract_method = validated_data.method
|
|
234
145
|
self._extract_audio_params = extract_audio_params
|
|
235
146
|
self._extract_images = extract_images
|
|
236
|
-
self._extract_method = extract_method
|
|
237
147
|
self._extract_tables = extract_tables
|
|
238
148
|
self._extract_images_method = extract_images_method
|
|
239
149
|
self._extract_images_params = extract_images_params
|
|
240
150
|
self._extract_tables_method = extract_tables_method
|
|
241
|
-
|
|
242
|
-
# {extract_tables: true, extract_charts: None} or {extract_tables: true, extract-charts: true} enables both
|
|
243
|
-
# table and chart extraction.
|
|
244
|
-
# {extract_tables: true, extract_charts: false} enables only the table extraction and disables chart extraction.
|
|
245
|
-
self._extract_charts = extract_charts if extract_charts is not None else extract_tables
|
|
151
|
+
self._extract_charts = extract_charts
|
|
246
152
|
self._extract_infographics = extract_infographics
|
|
247
153
|
self._extract_page_as_image = extract_page_as_image
|
|
248
154
|
self._extract_text = extract_text
|
|
@@ -256,34 +162,27 @@ class ExtractTask(Task):
|
|
|
256
162
|
"""
|
|
257
163
|
info = ""
|
|
258
164
|
info += "Extract Task:\n"
|
|
259
|
-
info += f"
|
|
260
|
-
info += f"
|
|
261
|
-
info += f"
|
|
262
|
-
info += f"
|
|
263
|
-
info += f"
|
|
264
|
-
info += f"
|
|
265
|
-
info += f"
|
|
266
|
-
info += f"
|
|
267
|
-
info += f"
|
|
268
|
-
info += f" extract tables method: {self._extract_tables_method}\n"
|
|
269
|
-
info += f" text depth: {self._text_depth}\n"
|
|
165
|
+
info += f" document_type: {self._document_type.value}\n"
|
|
166
|
+
info += f" extract_method: {self._extract_method}\n"
|
|
167
|
+
info += f" extract_text: {self._extract_text}\n"
|
|
168
|
+
info += f" extract_images: {self._extract_images}\n"
|
|
169
|
+
info += f" extract_tables: {self._extract_tables}\n"
|
|
170
|
+
info += f" extract_charts: {self._extract_charts}\n"
|
|
171
|
+
info += f" extract_infographics: {self._extract_infographics}\n"
|
|
172
|
+
info += f" extract_page_as_image: {self._extract_page_as_image}\n"
|
|
173
|
+
info += f" text_depth: {self._text_depth}\n"
|
|
270
174
|
info += f" table_output_format: {self._table_output_format}\n"
|
|
271
|
-
|
|
272
|
-
if self._extract_images_params:
|
|
273
|
-
info += f" extract images params: {self._extract_images_params}\n"
|
|
274
|
-
if self._extract_audio_params:
|
|
275
|
-
info += f" extract audio params: {self._extract_audio_params}\n"
|
|
276
175
|
return info
|
|
277
176
|
|
|
278
177
|
def to_dict(self) -> Dict:
|
|
279
178
|
"""
|
|
280
|
-
Convert to a dict for submission to redis
|
|
179
|
+
Convert to a dict for submission to redis
|
|
281
180
|
"""
|
|
282
181
|
extract_params = {
|
|
283
182
|
"extract_text": self._extract_text,
|
|
284
183
|
"extract_images": self._extract_images,
|
|
285
|
-
"extract_tables": self._extract_tables,
|
|
286
184
|
"extract_images_method": self._extract_images_method,
|
|
185
|
+
"extract_tables": self._extract_tables,
|
|
287
186
|
"extract_tables_method": self._extract_tables_method,
|
|
288
187
|
"extract_charts": self._extract_charts,
|
|
289
188
|
"extract_infographics": self._extract_infographics,
|
|
@@ -306,7 +205,7 @@ class ExtractTask(Task):
|
|
|
306
205
|
|
|
307
206
|
task_properties = {
|
|
308
207
|
"method": self._extract_method,
|
|
309
|
-
"document_type": self._document_type,
|
|
208
|
+
"document_type": self._document_type.value,
|
|
310
209
|
"params": extract_params,
|
|
311
210
|
}
|
|
312
211
|
|
|
@@ -339,4 +238,4 @@ class ExtractTask(Task):
|
|
|
339
238
|
|
|
340
239
|
@property
|
|
341
240
|
def document_type(self):
|
|
342
|
-
return self._document_type
|
|
241
|
+
return self._document_type.value
|
|
@@ -11,31 +11,13 @@ from typing import Dict
|
|
|
11
11
|
from typing import Literal
|
|
12
12
|
from typing import Union
|
|
13
13
|
|
|
14
|
-
from pydantic import BaseModel, field_validator
|
|
14
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
|
|
15
15
|
|
|
16
16
|
from .task_base import Task
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
class FilterTaskSchema(BaseModel):
|
|
22
|
-
content_type: str = "image"
|
|
23
|
-
min_size: int = 128
|
|
24
|
-
max_aspect_ratio: Union[float, int] = 5.0
|
|
25
|
-
min_aspect_ratio: Union[float, int] = 0.2
|
|
26
|
-
filter: bool = False
|
|
27
|
-
|
|
28
|
-
@field_validator("content_type")
|
|
29
|
-
def content_type_must_be_valid(cls, v):
|
|
30
|
-
valid_criteria = ["image"]
|
|
31
|
-
if v not in valid_criteria:
|
|
32
|
-
raise ValueError(f"content_type must be one of {valid_criteria}")
|
|
33
|
-
return v
|
|
34
|
-
|
|
35
|
-
class Config:
|
|
36
|
-
extra = "forbid"
|
|
37
|
-
|
|
38
|
-
|
|
39
21
|
class FilterTask(Task):
|
|
40
22
|
"""
|
|
41
23
|
Object for document filter task
|
|
@@ -52,14 +34,26 @@ class FilterTask(Task):
|
|
|
52
34
|
filter: bool = False,
|
|
53
35
|
) -> None:
|
|
54
36
|
"""
|
|
55
|
-
Setup
|
|
37
|
+
Setup Filter Task Config
|
|
56
38
|
"""
|
|
57
39
|
super().__init__()
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
40
|
+
|
|
41
|
+
# Use the API schema for validation
|
|
42
|
+
validated_data = IngestTaskFilterSchema(
|
|
43
|
+
content_type=content_type,
|
|
44
|
+
params={
|
|
45
|
+
"min_size": min_size,
|
|
46
|
+
"max_aspect_ratio": max_aspect_ratio,
|
|
47
|
+
"min_aspect_ratio": min_aspect_ratio,
|
|
48
|
+
"filter": filter,
|
|
49
|
+
},
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
self._content_type = validated_data.content_type
|
|
53
|
+
self._min_size = validated_data.params.min_size
|
|
54
|
+
self._max_aspect_ratio = validated_data.params.max_aspect_ratio
|
|
55
|
+
self._min_aspect_ratio = validated_data.params.min_aspect_ratio
|
|
56
|
+
self._filter = validated_data.params.filter
|
|
63
57
|
|
|
64
58
|
def __str__(self) -> str:
|
|
65
59
|
"""
|
|
@@ -67,7 +61,7 @@ class FilterTask(Task):
|
|
|
67
61
|
"""
|
|
68
62
|
info = ""
|
|
69
63
|
info += "Filter Task:\n"
|
|
70
|
-
info += f" content_type: {self._content_type}\n"
|
|
64
|
+
info += f" content_type: {self._content_type.value}\n"
|
|
71
65
|
info += f" min_size: {self._min_size}\n"
|
|
72
66
|
info += f" max_aspect_ratio: {self._max_aspect_ratio}\n"
|
|
73
67
|
info += f" min_aspect_ratio: {self._min_aspect_ratio}\n"
|
|
@@ -86,7 +80,7 @@ class FilterTask(Task):
|
|
|
86
80
|
}
|
|
87
81
|
|
|
88
82
|
task_properties = {
|
|
89
|
-
"content_type": self._content_type,
|
|
83
|
+
"content_type": self._content_type.value,
|
|
90
84
|
"params": filter_params,
|
|
91
85
|
}
|
|
92
86
|
|
|
@@ -9,44 +9,47 @@
|
|
|
9
9
|
import logging
|
|
10
10
|
from typing import Dict
|
|
11
11
|
|
|
12
|
-
from pydantic import BaseModel
|
|
13
|
-
|
|
14
|
-
from .task_base import Task
|
|
12
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
|
|
13
|
+
from nv_ingest_client.primitives.tasks.task_base import Task
|
|
15
14
|
|
|
16
15
|
logger = logging.getLogger(__name__)
|
|
17
16
|
|
|
18
17
|
|
|
19
|
-
class InfographicExtractionSchema(BaseModel):
|
|
20
|
-
class Config:
|
|
21
|
-
extra = "forbid"
|
|
22
|
-
|
|
23
|
-
|
|
24
18
|
class InfographicExtractionTask(Task):
|
|
25
19
|
"""
|
|
26
20
|
Object for infographic extraction task
|
|
27
21
|
"""
|
|
28
22
|
|
|
29
|
-
def __init__(self) -> None:
|
|
23
|
+
def __init__(self, params: dict = None) -> None:
|
|
30
24
|
"""
|
|
31
|
-
Setup
|
|
25
|
+
Setup Infographic Extraction Task Config
|
|
32
26
|
"""
|
|
33
27
|
super().__init__()
|
|
34
28
|
|
|
29
|
+
# Handle None params by converting to empty dict for backward compatibility
|
|
30
|
+
if params is None:
|
|
31
|
+
params = {}
|
|
32
|
+
|
|
33
|
+
# Use the API schema for validation
|
|
34
|
+
validated_data = IngestTaskInfographicExtraction(params=params)
|
|
35
|
+
|
|
36
|
+
self._params = validated_data.params
|
|
37
|
+
|
|
35
38
|
def __str__(self) -> str:
|
|
36
39
|
"""
|
|
37
40
|
Returns a string with the object's config and run time state
|
|
38
41
|
"""
|
|
39
42
|
info = ""
|
|
40
|
-
info += "
|
|
43
|
+
info += "Infographic Extraction Task:\n"
|
|
44
|
+
info += f" params: {self._params}\n"
|
|
41
45
|
return info
|
|
42
46
|
|
|
43
47
|
def to_dict(self) -> Dict:
|
|
44
48
|
"""
|
|
45
49
|
Convert to a dict for submission to redis
|
|
46
50
|
"""
|
|
47
|
-
|
|
48
51
|
task_properties = {
|
|
49
|
-
"params":
|
|
52
|
+
"params": self._params,
|
|
50
53
|
}
|
|
51
54
|
|
|
52
55
|
return {"type": "infographic_data_extract", "task_properties": task_properties}
|
|
@@ -8,25 +8,14 @@
|
|
|
8
8
|
|
|
9
9
|
import logging
|
|
10
10
|
from typing import Dict
|
|
11
|
-
from typing import Optional
|
|
12
11
|
|
|
13
|
-
from pydantic import BaseModel
|
|
12
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
|
|
14
13
|
|
|
15
14
|
from .task_base import Task
|
|
16
15
|
|
|
17
16
|
logger = logging.getLogger(__name__)
|
|
18
17
|
|
|
19
18
|
|
|
20
|
-
class SplitTaskSchema(BaseModel):
|
|
21
|
-
tokenizer: Optional[str] = None
|
|
22
|
-
chunk_size: int = 1024
|
|
23
|
-
chunk_overlap: int = 150
|
|
24
|
-
params: dict = {}
|
|
25
|
-
|
|
26
|
-
class Config:
|
|
27
|
-
extra = "forbid"
|
|
28
|
-
|
|
29
|
-
|
|
30
19
|
class SplitTask(Task):
|
|
31
20
|
"""
|
|
32
21
|
Object for document splitting task
|
|
@@ -37,16 +26,26 @@ class SplitTask(Task):
|
|
|
37
26
|
tokenizer: str = None,
|
|
38
27
|
chunk_size: int = 1024,
|
|
39
28
|
chunk_overlap: int = 150,
|
|
40
|
-
params: dict =
|
|
41
|
-
)
|
|
29
|
+
params: dict = None,
|
|
30
|
+
):
|
|
42
31
|
"""
|
|
43
32
|
Setup Split Task Config
|
|
44
33
|
"""
|
|
45
34
|
super().__init__()
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
35
|
+
|
|
36
|
+
# Handle None params by converting to empty dict for backward compatibility
|
|
37
|
+
if params is None:
|
|
38
|
+
params = {}
|
|
39
|
+
|
|
40
|
+
# Use the API schema for validation
|
|
41
|
+
validated_data = IngestTaskSplitSchema(
|
|
42
|
+
tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap, params=params
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
self._tokenizer = validated_data.tokenizer
|
|
46
|
+
self._chunk_size = validated_data.chunk_size
|
|
47
|
+
self._chunk_overlap = validated_data.chunk_overlap
|
|
48
|
+
self._params = validated_data.params
|
|
50
49
|
|
|
51
50
|
def __str__(self) -> str:
|
|
52
51
|
"""
|
|
@@ -10,8 +10,8 @@ import logging
|
|
|
10
10
|
from typing import Dict
|
|
11
11
|
from typing import Literal
|
|
12
12
|
|
|
13
|
-
from pydantic import BaseModel
|
|
14
|
-
from pydantic import model_validator
|
|
13
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
|
|
14
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
|
|
15
15
|
|
|
16
16
|
from .task_base import Task
|
|
17
17
|
|
|
@@ -20,28 +20,6 @@ logger = logging.getLogger(__name__)
|
|
|
20
20
|
_DEFAULT_STORE_METHOD = "minio"
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
class StoreEmbedTaskSchema(BaseModel):
|
|
24
|
-
|
|
25
|
-
class Config:
|
|
26
|
-
extra = "allow"
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class StoreTaskSchema(BaseModel):
|
|
30
|
-
store_method: str = None
|
|
31
|
-
|
|
32
|
-
@model_validator(mode="before")
|
|
33
|
-
@classmethod
|
|
34
|
-
def set_default_store_method(cls, values):
|
|
35
|
-
store_method = values.get("store_method")
|
|
36
|
-
|
|
37
|
-
if store_method is None:
|
|
38
|
-
values["store_method"] = _DEFAULT_STORE_METHOD
|
|
39
|
-
return values
|
|
40
|
-
|
|
41
|
-
class Config:
|
|
42
|
-
extra = "allow"
|
|
43
|
-
|
|
44
|
-
|
|
45
23
|
class StoreTask(Task):
|
|
46
24
|
"""
|
|
47
25
|
Object for image storage task.
|
|
@@ -64,10 +42,22 @@ class StoreTask(Task):
|
|
|
64
42
|
"""
|
|
65
43
|
super().__init__()
|
|
66
44
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
45
|
+
# Handle None params by converting to empty dict for backward compatibility
|
|
46
|
+
if params is None:
|
|
47
|
+
params = {}
|
|
48
|
+
|
|
49
|
+
# Merge extra_params into params for API schema compatibility
|
|
50
|
+
merged_params = {**params, **extra_params}
|
|
51
|
+
|
|
52
|
+
# Use the API schema for validation
|
|
53
|
+
validated_data = IngestTaskStoreSchema(
|
|
54
|
+
structured=structured, images=images, method=store_method or _DEFAULT_STORE_METHOD, params=merged_params
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
self._structured = validated_data.structured
|
|
58
|
+
self._images = validated_data.images
|
|
59
|
+
self._store_method = validated_data.method
|
|
60
|
+
self._params = validated_data.params
|
|
71
61
|
self._extra_params = extra_params
|
|
72
62
|
|
|
73
63
|
def __str__(self) -> str:
|
|
@@ -116,7 +106,17 @@ class StoreEmbedTask(Task):
|
|
|
116
106
|
"""
|
|
117
107
|
super().__init__()
|
|
118
108
|
|
|
119
|
-
|
|
109
|
+
# Handle None params by converting to empty dict for backward compatibility
|
|
110
|
+
if params is None:
|
|
111
|
+
params = {}
|
|
112
|
+
|
|
113
|
+
# Merge extra_params into params for API schema compatibility
|
|
114
|
+
merged_params = {**params, **extra_params}
|
|
115
|
+
|
|
116
|
+
# Use the API schema for validation
|
|
117
|
+
validated_data = IngestTaskStoreEmbedSchema(params=merged_params)
|
|
118
|
+
|
|
119
|
+
self._params = validated_data.params
|
|
120
120
|
self._extra_params = extra_params
|
|
121
121
|
|
|
122
122
|
def __str__(self) -> str:
|
|
@@ -27,6 +27,7 @@ class TaskType(Enum):
|
|
|
27
27
|
STORE_EMBEDDING = auto()
|
|
28
28
|
TABLE_DATA_EXTRACT = auto()
|
|
29
29
|
TRANSFORM = auto()
|
|
30
|
+
UDF = auto()
|
|
30
31
|
VDB_UPLOAD = auto()
|
|
31
32
|
|
|
32
33
|
|
|
@@ -71,75 +72,3 @@ class Task:
|
|
|
71
72
|
tasks that are then submitted to the redis client
|
|
72
73
|
"""
|
|
73
74
|
return {}
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
# class ExtractUnstructuredTask(ExtractTask):
|
|
77
|
-
# """
|
|
78
|
-
# Object for document unstructured extraction task
|
|
79
|
-
# extract_method = ["unstructured_local", "unstructured_service"]
|
|
80
|
-
# """
|
|
81
|
-
#
|
|
82
|
-
# def __init__(
|
|
83
|
-
# self,
|
|
84
|
-
# extract_method: ExtractTask._Type_Extract_Method,
|
|
85
|
-
# document_type: ExtractTask._TypeDocumentType,
|
|
86
|
-
# api_key: str,
|
|
87
|
-
# uri: str,
|
|
88
|
-
# ) -> None:
|
|
89
|
-
# """
|
|
90
|
-
# Setup Extract Task Config
|
|
91
|
-
# """
|
|
92
|
-
# super().__init__(extract_method, document_type)
|
|
93
|
-
# self._api_key = api_key
|
|
94
|
-
# self._uri = uri
|
|
95
|
-
#
|
|
96
|
-
# def __str__(self) -> str:
|
|
97
|
-
# """
|
|
98
|
-
# Returns a string with the object's config and run time state
|
|
99
|
-
# """
|
|
100
|
-
# info = ""
|
|
101
|
-
# info += super().__str__()
|
|
102
|
-
# info += f"unstructured uri: {self._uri}\n"
|
|
103
|
-
# return info
|
|
104
|
-
#
|
|
105
|
-
# def to_dict(self) -> Dict:
|
|
106
|
-
# """
|
|
107
|
-
# Convert to a dict for submission to redis (fixme)
|
|
108
|
-
# """
|
|
109
|
-
# unstructured_properties = {
|
|
110
|
-
# "api_key": self._api_key,
|
|
111
|
-
# "unstructured_url": self._uri,
|
|
112
|
-
# }
|
|
113
|
-
# task_desc = super().to_dict()
|
|
114
|
-
# task_desc["task_properties"]["params"].update(unstructured_properties)
|
|
115
|
-
# return task_desc
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
# class ExtractLlamaParseTask(ExtractTask):
|
|
119
|
-
# """
|
|
120
|
-
# Object for document llama extraction task
|
|
121
|
-
# extract_method = ["llama_parse"]
|
|
122
|
-
# """
|
|
123
|
-
#
|
|
124
|
-
# def __init__(
|
|
125
|
-
# self,
|
|
126
|
-
# extract_method: ExtractTask._Type_Extract_Method,
|
|
127
|
-
# document_type: ExtractTask._TypeDocumentType,
|
|
128
|
-
# api_key: str,
|
|
129
|
-
# ) -> None:
|
|
130
|
-
# """
|
|
131
|
-
# Setup Extract Task Config
|
|
132
|
-
# """
|
|
133
|
-
# super().__init__(extract_method, document_type)
|
|
134
|
-
# self._api_key = api_key
|
|
135
|
-
#
|
|
136
|
-
# def to_dict(self) -> Dict:
|
|
137
|
-
# """
|
|
138
|
-
# Convert to a dict for submission to redis (fixme)
|
|
139
|
-
# """
|
|
140
|
-
# llama_parse_properties = {
|
|
141
|
-
# "api_key": self._api_key,
|
|
142
|
-
# }
|
|
143
|
-
# task_desc = super().to_dict()
|
|
144
|
-
# task_desc["task_properties"]["params"].update(llama_parse_properties)
|
|
145
|
-
# return task_desc
|
|
@@ -19,6 +19,7 @@ from .store import StoreTask
|
|
|
19
19
|
from .task_base import Task
|
|
20
20
|
from .task_base import TaskType
|
|
21
21
|
from .task_base import is_valid_task_type
|
|
22
|
+
from .udf import UDFTask
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
class TaskUnimplemented(Task):
|
|
@@ -42,6 +43,7 @@ _TASK_MAP: Dict[TaskType, Callable] = {
|
|
|
42
43
|
TaskType.STORE_EMBEDDING: StoreEmbedTask,
|
|
43
44
|
TaskType.STORE: StoreTask,
|
|
44
45
|
TaskType.TRANSFORM: TaskUnimplemented,
|
|
46
|
+
TaskType.UDF: UDFTask,
|
|
45
47
|
}
|
|
46
48
|
|
|
47
49
|
|