nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-client might be problematic. Click here for more details.
- nv_ingest_client/cli/util/click.py +182 -30
- nv_ingest_client/cli/util/processing.py +0 -393
- nv_ingest_client/client/client.py +561 -207
- nv_ingest_client/client/ingest_job_handler.py +412 -0
- nv_ingest_client/client/interface.py +466 -59
- nv_ingest_client/client/util/processing.py +11 -1
- nv_ingest_client/nv_ingest_cli.py +58 -6
- nv_ingest_client/primitives/jobs/job_spec.py +32 -10
- nv_ingest_client/primitives/tasks/__init__.py +6 -4
- nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
- nv_ingest_client/primitives/tasks/caption.py +10 -16
- nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
- nv_ingest_client/primitives/tasks/dedup.py +12 -21
- nv_ingest_client/primitives/tasks/embed.py +37 -76
- nv_ingest_client/primitives/tasks/extract.py +68 -169
- nv_ingest_client/primitives/tasks/filter.py +22 -28
- nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
- nv_ingest_client/primitives/tasks/split.py +17 -18
- nv_ingest_client/primitives/tasks/store.py +29 -29
- nv_ingest_client/primitives/tasks/task_base.py +1 -72
- nv_ingest_client/primitives/tasks/task_factory.py +10 -11
- nv_ingest_client/primitives/tasks/udf.py +349 -0
- nv_ingest_client/util/dataset.py +8 -2
- nv_ingest_client/util/document_analysis.py +314 -0
- nv_ingest_client/util/image_disk_utils.py +300 -0
- nv_ingest_client/util/transport.py +12 -6
- nv_ingest_client/util/util.py +66 -0
- nv_ingest_client/util/vdb/milvus.py +220 -75
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
- nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
- nv_ingest_client/cli/util/tasks.py +0 -3
- nv_ingest_client/primitives/exceptions.py +0 -0
- nv_ingest_client/primitives/tasks/transform.py +0 -0
- nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
|
@@ -12,23 +12,30 @@ from pprint import pprint
|
|
|
12
12
|
from typing import Union, List, Any, Dict
|
|
13
13
|
|
|
14
14
|
import click
|
|
15
|
+
|
|
16
|
+
from nv_ingest_api.internal.enums.common import PipelinePhase
|
|
17
|
+
from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
|
|
15
18
|
from nv_ingest_client.util.processing import check_schema
|
|
16
19
|
from nv_ingest_client.primitives.tasks import CaptionTask
|
|
17
20
|
from nv_ingest_client.primitives.tasks import DedupTask
|
|
18
21
|
from nv_ingest_client.primitives.tasks import EmbedTask
|
|
19
22
|
from nv_ingest_client.primitives.tasks import ExtractTask
|
|
20
23
|
from nv_ingest_client.primitives.tasks import FilterTask
|
|
24
|
+
from nv_ingest_client.primitives.tasks import InfographicExtractionTask
|
|
21
25
|
from nv_ingest_client.primitives.tasks import SplitTask
|
|
22
26
|
from nv_ingest_client.primitives.tasks import StoreEmbedTask
|
|
23
27
|
from nv_ingest_client.primitives.tasks import StoreTask
|
|
24
|
-
from nv_ingest_client.primitives.tasks
|
|
25
|
-
from
|
|
26
|
-
from
|
|
27
|
-
from
|
|
28
|
-
from
|
|
29
|
-
from
|
|
30
|
-
from
|
|
31
|
-
from
|
|
28
|
+
from nv_ingest_client.primitives.tasks import UDFTask
|
|
29
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
|
|
30
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
|
|
31
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
|
|
32
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
|
|
33
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
|
|
34
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
|
|
35
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
|
|
36
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
|
|
37
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
|
|
38
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
|
|
32
39
|
from nv_ingest_client.util.util import generate_matching_files
|
|
33
40
|
|
|
34
41
|
logger = logging.getLogger(__name__)
|
|
@@ -78,12 +85,6 @@ class ClientType(str, Enum):
|
|
|
78
85
|
KAFKA = "KAFKA"
|
|
79
86
|
|
|
80
87
|
|
|
81
|
-
# Example TaskId validation set
|
|
82
|
-
VALID_TASK_IDS = {"task1", "task2", "task3"}
|
|
83
|
-
|
|
84
|
-
_MODULE_UNDER_TEST = "nv_ingest_client.cli.util.click"
|
|
85
|
-
|
|
86
|
-
|
|
87
88
|
def debug_print_click_options(ctx: click.Context) -> None:
|
|
88
89
|
"""
|
|
89
90
|
Retrieves all options from the Click context and pretty prints them.
|
|
@@ -149,9 +150,11 @@ TaskType = Union[
|
|
|
149
150
|
EmbedTask,
|
|
150
151
|
ExtractTask,
|
|
151
152
|
FilterTask,
|
|
153
|
+
InfographicExtractionTask,
|
|
152
154
|
SplitTask,
|
|
153
155
|
StoreEmbedTask,
|
|
154
156
|
StoreTask,
|
|
157
|
+
UDFTask,
|
|
155
158
|
]
|
|
156
159
|
|
|
157
160
|
|
|
@@ -178,7 +181,32 @@ def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
|
|
|
178
181
|
the error details (e.g., expected property format), and show the input that was provided.
|
|
179
182
|
"""
|
|
180
183
|
try:
|
|
181
|
-
|
|
184
|
+
options = json.loads(options_str)
|
|
185
|
+
|
|
186
|
+
# Convert string boolean values to actual booleans for extract tasks
|
|
187
|
+
if task_id == "extract":
|
|
188
|
+
boolean_fields = [
|
|
189
|
+
"extract_text",
|
|
190
|
+
"extract_images",
|
|
191
|
+
"extract_tables",
|
|
192
|
+
"extract_charts",
|
|
193
|
+
"extract_infographics",
|
|
194
|
+
"extract_page_as_image",
|
|
195
|
+
]
|
|
196
|
+
for field in boolean_fields:
|
|
197
|
+
if field in options:
|
|
198
|
+
value = options[field]
|
|
199
|
+
if isinstance(value, str):
|
|
200
|
+
if value.lower() in ("true", "1", "yes", "on"):
|
|
201
|
+
options[field] = True
|
|
202
|
+
elif value.lower() in ("false", "0", "no", "off"):
|
|
203
|
+
options[field] = False
|
|
204
|
+
else:
|
|
205
|
+
raise ValueError(
|
|
206
|
+
f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
return options
|
|
182
210
|
except json.JSONDecodeError as e:
|
|
183
211
|
error_message = (
|
|
184
212
|
f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
|
|
@@ -229,46 +257,170 @@ def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[
|
|
|
229
257
|
options: Dict[str, Any] = parse_task_options(task_id, json_options)
|
|
230
258
|
|
|
231
259
|
if task_id == "split":
|
|
232
|
-
task_options = check_schema(
|
|
260
|
+
task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
|
|
233
261
|
new_task_id = f"{task_id}"
|
|
234
262
|
new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
|
|
235
263
|
elif task_id == "extract":
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
264
|
+
# Map CLI parameters to API schema structure
|
|
265
|
+
method = options.pop("extract_method", None)
|
|
266
|
+
if method is None:
|
|
267
|
+
method = "pdfium" # Default fallback
|
|
268
|
+
|
|
269
|
+
# Build params dict for API schema
|
|
270
|
+
params = {k: v for k, v in options.items() if k != "document_type"}
|
|
271
|
+
|
|
272
|
+
# Validate with API schema
|
|
273
|
+
api_options = {
|
|
274
|
+
"document_type": options.get("document_type"),
|
|
275
|
+
"method": method,
|
|
276
|
+
"params": params,
|
|
277
|
+
}
|
|
278
|
+
task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
|
|
279
|
+
new_task_id = f"{task_id}_{task_options.document_type.value}"
|
|
280
|
+
|
|
281
|
+
# Create ExtractTask with original CLI parameters
|
|
282
|
+
extract_task_params = {
|
|
283
|
+
"document_type": task_options.document_type,
|
|
284
|
+
"extract_method": task_options.method,
|
|
285
|
+
**task_options.params,
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
# Start with the main extract task
|
|
289
|
+
new_task = [(new_task_id, ExtractTask(**extract_task_params))]
|
|
290
|
+
|
|
291
|
+
# Add ChartExtractionTask if extract_charts is True
|
|
292
|
+
if task_options.params.get("extract_charts", False):
|
|
293
|
+
from nv_ingest_client.primitives.tasks import ChartExtractionTask
|
|
294
|
+
|
|
295
|
+
chart_task_id = "chart_data_extract"
|
|
296
|
+
chart_params = {"params": {}} # ChartExtractionTask takes params dict
|
|
297
|
+
new_task.append((chart_task_id, ChartExtractionTask(chart_params)))
|
|
298
|
+
|
|
299
|
+
# Add TableExtractionTask if extract_tables is True
|
|
300
|
+
if task_options.params.get("extract_tables", False):
|
|
301
|
+
from nv_ingest_client.primitives.tasks import TableExtractionTask
|
|
302
|
+
|
|
303
|
+
table_task_id = "table_data_extract"
|
|
304
|
+
new_task.append((table_task_id, TableExtractionTask()))
|
|
239
305
|
elif task_id == "store":
|
|
240
|
-
task_options = check_schema(
|
|
306
|
+
task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
|
|
241
307
|
new_task_id = f"{task_id}"
|
|
242
308
|
new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
|
|
243
309
|
elif task_id == "store_embedding":
|
|
244
|
-
task_options = check_schema(
|
|
310
|
+
task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
|
|
245
311
|
new_task_id = f"{task_id}"
|
|
246
312
|
new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
|
|
247
313
|
elif task_id == "caption":
|
|
248
|
-
task_options = check_schema(
|
|
314
|
+
task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
|
|
249
315
|
new_task_id = f"{task_id}"
|
|
250
|
-
|
|
316
|
+
# Extract individual parameters from API schema for CaptionTask constructor
|
|
317
|
+
caption_params = {
|
|
318
|
+
"api_key": task_options.api_key,
|
|
319
|
+
"endpoint_url": task_options.endpoint_url,
|
|
320
|
+
"prompt": task_options.prompt,
|
|
321
|
+
"model_name": task_options.model_name,
|
|
322
|
+
}
|
|
323
|
+
new_task = [(new_task_id, CaptionTask(**caption_params))]
|
|
251
324
|
elif task_id == "dedup":
|
|
252
|
-
task_options = check_schema(
|
|
325
|
+
task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
|
|
253
326
|
new_task_id = f"{task_id}"
|
|
254
|
-
|
|
327
|
+
# Extract individual parameters from API schema for DedupTask constructor
|
|
328
|
+
dedup_params = {
|
|
329
|
+
"content_type": task_options.content_type,
|
|
330
|
+
"filter": task_options.params.filter,
|
|
331
|
+
}
|
|
332
|
+
new_task = [(new_task_id, DedupTask(**dedup_params))]
|
|
255
333
|
elif task_id == "filter":
|
|
256
|
-
task_options = check_schema(
|
|
334
|
+
task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
|
|
257
335
|
new_task_id = f"{task_id}"
|
|
258
|
-
|
|
336
|
+
# Extract individual parameters from API schema for FilterTask constructor
|
|
337
|
+
filter_params = {
|
|
338
|
+
"content_type": task_options.content_type,
|
|
339
|
+
"min_size": task_options.params.min_size,
|
|
340
|
+
"max_aspect_ratio": task_options.params.max_aspect_ratio,
|
|
341
|
+
"min_aspect_ratio": task_options.params.min_aspect_ratio,
|
|
342
|
+
"filter": task_options.params.filter,
|
|
343
|
+
}
|
|
344
|
+
new_task = [(new_task_id, FilterTask(**filter_params))]
|
|
259
345
|
elif task_id == "embed":
|
|
260
|
-
task_options = check_schema(
|
|
346
|
+
task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
|
|
261
347
|
new_task_id = f"{task_id}"
|
|
262
348
|
new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
|
|
349
|
+
elif task_id == "infographic":
|
|
350
|
+
task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
|
|
351
|
+
new_task_id = f"{task_id}"
|
|
352
|
+
new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
|
|
353
|
+
elif task_id == "udf":
|
|
354
|
+
# Validate mutual exclusivity of target_stage and phase
|
|
355
|
+
has_target_stage = "target_stage" in options and options["target_stage"] is not None
|
|
356
|
+
has_phase = "phase" in options and options["phase"] is not None
|
|
357
|
+
|
|
358
|
+
if has_target_stage and has_phase:
|
|
359
|
+
raise ValueError(
|
|
360
|
+
"UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
|
|
361
|
+
)
|
|
362
|
+
elif not has_target_stage and not has_phase:
|
|
363
|
+
raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")
|
|
364
|
+
|
|
365
|
+
# Pre-process UDF task options to convert phase names to integers
|
|
366
|
+
if "phase" in options and isinstance(options["phase"], str):
|
|
367
|
+
# Convert phase string to integer using the same logic as UDFTask
|
|
368
|
+
phase_str = options["phase"].upper()
|
|
369
|
+
phase_aliases = {
|
|
370
|
+
"PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
|
|
371
|
+
"PREPROCESSING": PipelinePhase.PRE_PROCESSING,
|
|
372
|
+
"PRE": PipelinePhase.PRE_PROCESSING,
|
|
373
|
+
"EXTRACTION": PipelinePhase.EXTRACTION,
|
|
374
|
+
"EXTRACT": PipelinePhase.EXTRACTION,
|
|
375
|
+
"POST_PROCESSING": PipelinePhase.POST_PROCESSING,
|
|
376
|
+
"POSTPROCESSING": PipelinePhase.POST_PROCESSING,
|
|
377
|
+
"POST": PipelinePhase.POST_PROCESSING,
|
|
378
|
+
"MUTATION": PipelinePhase.MUTATION,
|
|
379
|
+
"MUTATE": PipelinePhase.MUTATION,
|
|
380
|
+
"TRANSFORM": PipelinePhase.TRANSFORM,
|
|
381
|
+
"RESPONSE": PipelinePhase.RESPONSE,
|
|
382
|
+
"RESP": PipelinePhase.RESPONSE,
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
if phase_str in phase_aliases:
|
|
386
|
+
options["phase"] = phase_aliases[phase_str].value
|
|
387
|
+
else:
|
|
388
|
+
raise ValueError(f"Invalid phase name: {options['phase']}")
|
|
389
|
+
|
|
390
|
+
# Try to infer udf_function_name if not provided
|
|
391
|
+
if "udf_function_name" not in options or not options["udf_function_name"]:
|
|
392
|
+
udf_function = options.get("udf_function", "")
|
|
393
|
+
if udf_function:
|
|
394
|
+
inferred_name = infer_udf_function_name(udf_function)
|
|
395
|
+
if inferred_name:
|
|
396
|
+
options["udf_function_name"] = inferred_name
|
|
397
|
+
logger.info(f"Inferred UDF function name: {inferred_name}")
|
|
398
|
+
else:
|
|
399
|
+
raise ValueError(
|
|
400
|
+
f"Could not infer UDF function name from '{udf_function}'. "
|
|
401
|
+
"Please specify 'udf_function_name' explicitly."
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
|
|
405
|
+
new_task_id = f"{task_id}"
|
|
406
|
+
new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
|
|
263
407
|
else:
|
|
264
408
|
raise ValueError(f"Unsupported task type: {task_id}")
|
|
265
409
|
|
|
410
|
+
# Check for duplicate tasks - now allowing multiple tasks of the same type
|
|
266
411
|
if new_task_id in validated_tasks:
|
|
267
|
-
|
|
412
|
+
logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")
|
|
268
413
|
|
|
269
414
|
logger.debug("Adding task: %s", new_task_id)
|
|
270
415
|
for task_tuple in new_task:
|
|
271
|
-
|
|
416
|
+
if task_tuple[0] in validated_tasks:
|
|
417
|
+
# Convert single task to list if needed, then append
|
|
418
|
+
existing_task = validated_tasks[task_tuple[0]]
|
|
419
|
+
if not isinstance(existing_task, list):
|
|
420
|
+
validated_tasks[task_tuple[0]] = [existing_task]
|
|
421
|
+
validated_tasks[task_tuple[0]].append(task_tuple[1])
|
|
422
|
+
else:
|
|
423
|
+
validated_tasks[task_tuple[0]] = task_tuple[1]
|
|
272
424
|
except ValueError as e:
|
|
273
425
|
validation_errors.append(str(e))
|
|
274
426
|
|