nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic. Click here for more details.

Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -12,23 +12,30 @@ from pprint import pprint
12
12
  from typing import Union, List, Any, Dict
13
13
 
14
14
  import click
15
+
16
+ from nv_ingest_api.internal.enums.common import PipelinePhase
17
+ from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
15
18
  from nv_ingest_client.util.processing import check_schema
16
19
  from nv_ingest_client.primitives.tasks import CaptionTask
17
20
  from nv_ingest_client.primitives.tasks import DedupTask
18
21
  from nv_ingest_client.primitives.tasks import EmbedTask
19
22
  from nv_ingest_client.primitives.tasks import ExtractTask
20
23
  from nv_ingest_client.primitives.tasks import FilterTask
24
+ from nv_ingest_client.primitives.tasks import InfographicExtractionTask
21
25
  from nv_ingest_client.primitives.tasks import SplitTask
22
26
  from nv_ingest_client.primitives.tasks import StoreEmbedTask
23
27
  from nv_ingest_client.primitives.tasks import StoreTask
24
- from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
25
- from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
26
- from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
27
- from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
28
- from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
29
- from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
30
- from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
31
- from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
28
+ from nv_ingest_client.primitives.tasks import UDFTask
29
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
30
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
31
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
32
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
33
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
34
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
35
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
36
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
37
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
38
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
32
39
  from nv_ingest_client.util.util import generate_matching_files
33
40
 
34
41
  logger = logging.getLogger(__name__)
@@ -78,12 +85,6 @@ class ClientType(str, Enum):
78
85
  KAFKA = "KAFKA"
79
86
 
80
87
 
81
- # Example TaskId validation set
82
- VALID_TASK_IDS = {"task1", "task2", "task3"}
83
-
84
- _MODULE_UNDER_TEST = "nv_ingest_client.cli.util.click"
85
-
86
-
87
88
  def debug_print_click_options(ctx: click.Context) -> None:
88
89
  """
89
90
  Retrieves all options from the Click context and pretty prints them.
@@ -149,9 +150,11 @@ TaskType = Union[
149
150
  EmbedTask,
150
151
  ExtractTask,
151
152
  FilterTask,
153
+ InfographicExtractionTask,
152
154
  SplitTask,
153
155
  StoreEmbedTask,
154
156
  StoreTask,
157
+ UDFTask,
155
158
  ]
156
159
 
157
160
 
@@ -178,7 +181,32 @@ def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
178
181
  the error details (e.g., expected property format), and show the input that was provided.
179
182
  """
180
183
  try:
181
- return json.loads(options_str)
184
+ options = json.loads(options_str)
185
+
186
+ # Convert string boolean values to actual booleans for extract tasks
187
+ if task_id == "extract":
188
+ boolean_fields = [
189
+ "extract_text",
190
+ "extract_images",
191
+ "extract_tables",
192
+ "extract_charts",
193
+ "extract_infographics",
194
+ "extract_page_as_image",
195
+ ]
196
+ for field in boolean_fields:
197
+ if field in options:
198
+ value = options[field]
199
+ if isinstance(value, str):
200
+ if value.lower() in ("true", "1", "yes", "on"):
201
+ options[field] = True
202
+ elif value.lower() in ("false", "0", "no", "off"):
203
+ options[field] = False
204
+ else:
205
+ raise ValueError(
206
+ f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
207
+ )
208
+
209
+ return options
182
210
  except json.JSONDecodeError as e:
183
211
  error_message = (
184
212
  f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
@@ -229,46 +257,170 @@ def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[
229
257
  options: Dict[str, Any] = parse_task_options(task_id, json_options)
230
258
 
231
259
  if task_id == "split":
232
- task_options = check_schema(SplitTaskSchema, options, task_id, json_options)
260
+ task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
233
261
  new_task_id = f"{task_id}"
234
262
  new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
235
263
  elif task_id == "extract":
236
- task_options = check_schema(ExtractTaskSchema, options, task_id, json_options)
237
- new_task_id = f"{task_id}_{task_options.document_type}"
238
- new_task = [(new_task_id, ExtractTask(**task_options.model_dump()))]
264
+ # Map CLI parameters to API schema structure
265
+ method = options.pop("extract_method", None)
266
+ if method is None:
267
+ method = "pdfium" # Default fallback
268
+
269
+ # Build params dict for API schema
270
+ params = {k: v for k, v in options.items() if k != "document_type"}
271
+
272
+ # Validate with API schema
273
+ api_options = {
274
+ "document_type": options.get("document_type"),
275
+ "method": method,
276
+ "params": params,
277
+ }
278
+ task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
279
+ new_task_id = f"{task_id}_{task_options.document_type.value}"
280
+
281
+ # Create ExtractTask with original CLI parameters
282
+ extract_task_params = {
283
+ "document_type": task_options.document_type,
284
+ "extract_method": task_options.method,
285
+ **task_options.params,
286
+ }
287
+
288
+ # Start with the main extract task
289
+ new_task = [(new_task_id, ExtractTask(**extract_task_params))]
290
+
291
+ # Add ChartExtractionTask if extract_charts is True
292
+ if task_options.params.get("extract_charts", False):
293
+ from nv_ingest_client.primitives.tasks import ChartExtractionTask
294
+
295
+ chart_task_id = "chart_data_extract"
296
+ chart_params = {"params": {}} # ChartExtractionTask takes params dict
297
+ new_task.append((chart_task_id, ChartExtractionTask(chart_params)))
298
+
299
+ # Add TableExtractionTask if extract_tables is True
300
+ if task_options.params.get("extract_tables", False):
301
+ from nv_ingest_client.primitives.tasks import TableExtractionTask
302
+
303
+ table_task_id = "table_data_extract"
304
+ new_task.append((table_task_id, TableExtractionTask()))
239
305
  elif task_id == "store":
240
- task_options = check_schema(StoreTaskSchema, options, task_id, json_options)
306
+ task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
241
307
  new_task_id = f"{task_id}"
242
308
  new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
243
309
  elif task_id == "store_embedding":
244
- task_options = check_schema(StoreEmbedTaskSchema, options, task_id, json_options)
310
+ task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
245
311
  new_task_id = f"{task_id}"
246
312
  new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
247
313
  elif task_id == "caption":
248
- task_options = check_schema(CaptionTaskSchema, options, task_id, json_options)
314
+ task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
249
315
  new_task_id = f"{task_id}"
250
- new_task = [(new_task_id, CaptionTask(**task_options.model_dump()))]
316
+ # Extract individual parameters from API schema for CaptionTask constructor
317
+ caption_params = {
318
+ "api_key": task_options.api_key,
319
+ "endpoint_url": task_options.endpoint_url,
320
+ "prompt": task_options.prompt,
321
+ "model_name": task_options.model_name,
322
+ }
323
+ new_task = [(new_task_id, CaptionTask(**caption_params))]
251
324
  elif task_id == "dedup":
252
- task_options = check_schema(DedupTaskSchema, options, task_id, json_options)
325
+ task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
253
326
  new_task_id = f"{task_id}"
254
- new_task = [(new_task_id, DedupTask(**task_options.model_dump()))]
327
+ # Extract individual parameters from API schema for DedupTask constructor
328
+ dedup_params = {
329
+ "content_type": task_options.content_type,
330
+ "filter": task_options.params.filter,
331
+ }
332
+ new_task = [(new_task_id, DedupTask(**dedup_params))]
255
333
  elif task_id == "filter":
256
- task_options = check_schema(FilterTaskSchema, options, task_id, json_options)
334
+ task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
257
335
  new_task_id = f"{task_id}"
258
- new_task = [(new_task_id, FilterTask(**task_options.model_dump()))]
336
+ # Extract individual parameters from API schema for FilterTask constructor
337
+ filter_params = {
338
+ "content_type": task_options.content_type,
339
+ "min_size": task_options.params.min_size,
340
+ "max_aspect_ratio": task_options.params.max_aspect_ratio,
341
+ "min_aspect_ratio": task_options.params.min_aspect_ratio,
342
+ "filter": task_options.params.filter,
343
+ }
344
+ new_task = [(new_task_id, FilterTask(**filter_params))]
259
345
  elif task_id == "embed":
260
- task_options = check_schema(EmbedTaskSchema, options, task_id, json_options)
346
+ task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
261
347
  new_task_id = f"{task_id}"
262
348
  new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
349
+ elif task_id == "infographic":
350
+ task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
351
+ new_task_id = f"{task_id}"
352
+ new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
353
+ elif task_id == "udf":
354
+ # Validate mutual exclusivity of target_stage and phase
355
+ has_target_stage = "target_stage" in options and options["target_stage"] is not None
356
+ has_phase = "phase" in options and options["phase"] is not None
357
+
358
+ if has_target_stage and has_phase:
359
+ raise ValueError(
360
+ "UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
361
+ )
362
+ elif not has_target_stage and not has_phase:
363
+ raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")
364
+
365
+ # Pre-process UDF task options to convert phase names to integers
366
+ if "phase" in options and isinstance(options["phase"], str):
367
+ # Convert phase string to integer using the same logic as UDFTask
368
+ phase_str = options["phase"].upper()
369
+ phase_aliases = {
370
+ "PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
371
+ "PREPROCESSING": PipelinePhase.PRE_PROCESSING,
372
+ "PRE": PipelinePhase.PRE_PROCESSING,
373
+ "EXTRACTION": PipelinePhase.EXTRACTION,
374
+ "EXTRACT": PipelinePhase.EXTRACTION,
375
+ "POST_PROCESSING": PipelinePhase.POST_PROCESSING,
376
+ "POSTPROCESSING": PipelinePhase.POST_PROCESSING,
377
+ "POST": PipelinePhase.POST_PROCESSING,
378
+ "MUTATION": PipelinePhase.MUTATION,
379
+ "MUTATE": PipelinePhase.MUTATION,
380
+ "TRANSFORM": PipelinePhase.TRANSFORM,
381
+ "RESPONSE": PipelinePhase.RESPONSE,
382
+ "RESP": PipelinePhase.RESPONSE,
383
+ }
384
+
385
+ if phase_str in phase_aliases:
386
+ options["phase"] = phase_aliases[phase_str].value
387
+ else:
388
+ raise ValueError(f"Invalid phase name: {options['phase']}")
389
+
390
+ # Try to infer udf_function_name if not provided
391
+ if "udf_function_name" not in options or not options["udf_function_name"]:
392
+ udf_function = options.get("udf_function", "")
393
+ if udf_function:
394
+ inferred_name = infer_udf_function_name(udf_function)
395
+ if inferred_name:
396
+ options["udf_function_name"] = inferred_name
397
+ logger.info(f"Inferred UDF function name: {inferred_name}")
398
+ else:
399
+ raise ValueError(
400
+ f"Could not infer UDF function name from '{udf_function}'. "
401
+ "Please specify 'udf_function_name' explicitly."
402
+ )
403
+
404
+ task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
405
+ new_task_id = f"{task_id}"
406
+ new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
263
407
  else:
264
408
  raise ValueError(f"Unsupported task type: {task_id}")
265
409
 
410
+ # Check for duplicate tasks - now allowing multiple tasks of the same type
266
411
  if new_task_id in validated_tasks:
267
- raise ValueError(f"Duplicate task detected: {new_task_id}")
412
+ logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")
268
413
 
269
414
  logger.debug("Adding task: %s", new_task_id)
270
415
  for task_tuple in new_task:
271
- validated_tasks[task_tuple[0]] = task_tuple[1]
416
+ if task_tuple[0] in validated_tasks:
417
+ # Convert single task to list if needed, then append
418
+ existing_task = validated_tasks[task_tuple[0]]
419
+ if not isinstance(existing_task, list):
420
+ validated_tasks[task_tuple[0]] = [existing_task]
421
+ validated_tasks[task_tuple[0]].append(task_tuple[1])
422
+ else:
423
+ validated_tasks[task_tuple[0]] = task_tuple[1]
272
424
  except ValueError as e:
273
425
  validation_errors.append(str(e))
274
426