nv-ingest-client 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic.

Files changed (28)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/client/interface.py +209 -26
  3. nv_ingest_client/nv_ingest_cli.py +16 -0
  4. nv_ingest_client/primitives/jobs/job_spec.py +29 -9
  5. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  6. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  7. nv_ingest_client/primitives/tasks/caption.py +10 -16
  8. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  9. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  10. nv_ingest_client/primitives/tasks/embed.py +21 -76
  11. nv_ingest_client/primitives/tasks/extract.py +67 -168
  12. nv_ingest_client/primitives/tasks/filter.py +21 -27
  13. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  14. nv_ingest_client/primitives/tasks/split.py +17 -18
  15. nv_ingest_client/primitives/tasks/store.py +29 -29
  16. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  17. nv_ingest_client/primitives/tasks/task_factory.py +2 -0
  18. nv_ingest_client/primitives/tasks/udf.py +352 -0
  19. nv_ingest_client/util/vdb/milvus.py +1 -0
  20. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/METADATA +1 -1
  21. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/RECORD +25 -27
  22. nv_ingest_client/cli/util/tasks.py +0 -3
  23. nv_ingest_client/primitives/exceptions.py +0 -0
  24. nv_ingest_client/primitives/tasks/transform.py +0 -0
  25. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/WHEEL +0 -0
  26. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/entry_points.txt +0 -0
  27. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/licenses/LICENSE +0 -0
  28. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/top_level.txt +0 -0
@@ -12,23 +12,30 @@ from pprint import pprint
12
12
  from typing import Union, List, Any, Dict
13
13
 
14
14
  import click
15
+
16
+ from nv_ingest_api.internal.enums.common import PipelinePhase
17
+ from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
15
18
  from nv_ingest_client.util.processing import check_schema
16
19
  from nv_ingest_client.primitives.tasks import CaptionTask
17
20
  from nv_ingest_client.primitives.tasks import DedupTask
18
21
  from nv_ingest_client.primitives.tasks import EmbedTask
19
22
  from nv_ingest_client.primitives.tasks import ExtractTask
20
23
  from nv_ingest_client.primitives.tasks import FilterTask
24
+ from nv_ingest_client.primitives.tasks import InfographicExtractionTask
21
25
  from nv_ingest_client.primitives.tasks import SplitTask
22
26
  from nv_ingest_client.primitives.tasks import StoreEmbedTask
23
27
  from nv_ingest_client.primitives.tasks import StoreTask
24
- from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
25
- from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
26
- from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
27
- from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
28
- from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
29
- from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
30
- from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
31
- from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
28
+ from nv_ingest_client.primitives.tasks import UDFTask
29
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
30
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
31
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
32
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
33
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
34
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskInfographicExtraction
35
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
36
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
37
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
38
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
32
39
  from nv_ingest_client.util.util import generate_matching_files
33
40
 
34
41
  logger = logging.getLogger(__name__)
@@ -78,12 +85,6 @@ class ClientType(str, Enum):
78
85
  KAFKA = "KAFKA"
79
86
 
80
87
 
81
- # Example TaskId validation set
82
- VALID_TASK_IDS = {"task1", "task2", "task3"}
83
-
84
- _MODULE_UNDER_TEST = "nv_ingest_client.cli.util.click"
85
-
86
-
87
88
  def debug_print_click_options(ctx: click.Context) -> None:
88
89
  """
89
90
  Retrieves all options from the Click context and pretty prints them.
@@ -149,9 +150,11 @@ TaskType = Union[
149
150
  EmbedTask,
150
151
  ExtractTask,
151
152
  FilterTask,
153
+ InfographicExtractionTask,
152
154
  SplitTask,
153
155
  StoreEmbedTask,
154
156
  StoreTask,
157
+ UDFTask,
155
158
  ]
156
159
 
157
160
 
@@ -178,7 +181,32 @@ def parse_task_options(task_id: str, options_str: str) -> Dict[str, Any]:
178
181
  the error details (e.g., expected property format), and show the input that was provided.
179
182
  """
180
183
  try:
181
- return json.loads(options_str)
184
+ options = json.loads(options_str)
185
+
186
+ # Convert string boolean values to actual booleans for extract tasks
187
+ if task_id == "extract":
188
+ boolean_fields = [
189
+ "extract_text",
190
+ "extract_images",
191
+ "extract_tables",
192
+ "extract_charts",
193
+ "extract_infographics",
194
+ "extract_page_as_image",
195
+ ]
196
+ for field in boolean_fields:
197
+ if field in options:
198
+ value = options[field]
199
+ if isinstance(value, str):
200
+ if value.lower() in ("true", "1", "yes", "on"):
201
+ options[field] = True
202
+ elif value.lower() in ("false", "0", "no", "off"):
203
+ options[field] = False
204
+ else:
205
+ raise ValueError(
206
+ f"Invalid boolean value for {field}: '{value}'. Use true/false, 1/0, yes/no, or on/off."
207
+ )
208
+
209
+ return options
182
210
  except json.JSONDecodeError as e:
183
211
  error_message = (
184
212
  f"Invalid JSON format for task '{task_id}': {e.msg} at line {e.lineno} column {e.colno} (char {e.pos}). "
@@ -229,46 +257,170 @@ def click_validate_task(ctx: click.Context, param: click.Parameter, value: List[
229
257
  options: Dict[str, Any] = parse_task_options(task_id, json_options)
230
258
 
231
259
  if task_id == "split":
232
- task_options = check_schema(SplitTaskSchema, options, task_id, json_options)
260
+ task_options = check_schema(IngestTaskSplitSchema, options, task_id, json_options)
233
261
  new_task_id = f"{task_id}"
234
262
  new_task = [(new_task_id, SplitTask(**task_options.model_dump()))]
235
263
  elif task_id == "extract":
236
- task_options = check_schema(ExtractTaskSchema, options, task_id, json_options)
237
- new_task_id = f"{task_id}_{task_options.document_type}"
238
- new_task = [(new_task_id, ExtractTask(**task_options.model_dump()))]
264
+ # Map CLI parameters to API schema structure
265
+ method = options.pop("extract_method", None)
266
+ if method is None:
267
+ method = "pdfium" # Default fallback
268
+
269
+ # Build params dict for API schema
270
+ params = {k: v for k, v in options.items() if k != "document_type"}
271
+
272
+ # Validate with API schema
273
+ api_options = {
274
+ "document_type": options.get("document_type"),
275
+ "method": method,
276
+ "params": params,
277
+ }
278
+ task_options = check_schema(IngestTaskExtractSchema, api_options, task_id, json_options)
279
+ new_task_id = f"{task_id}_{task_options.document_type.value}"
280
+
281
+ # Create ExtractTask with original CLI parameters
282
+ extract_task_params = {
283
+ "document_type": task_options.document_type,
284
+ "extract_method": task_options.method,
285
+ **task_options.params,
286
+ }
287
+
288
+ # Start with the main extract task
289
+ new_task = [(new_task_id, ExtractTask(**extract_task_params))]
290
+
291
+ # Add ChartExtractionTask if extract_charts is True
292
+ if task_options.params.get("extract_charts", False):
293
+ from nv_ingest_client.primitives.tasks import ChartExtractionTask
294
+
295
+ chart_task_id = "chart_data_extract"
296
+ chart_params = {"params": {}} # ChartExtractionTask takes params dict
297
+ new_task.append((chart_task_id, ChartExtractionTask(chart_params)))
298
+
299
+ # Add TableExtractionTask if extract_tables is True
300
+ if task_options.params.get("extract_tables", False):
301
+ from nv_ingest_client.primitives.tasks import TableExtractionTask
302
+
303
+ table_task_id = "table_data_extract"
304
+ new_task.append((table_task_id, TableExtractionTask()))
239
305
  elif task_id == "store":
240
- task_options = check_schema(StoreTaskSchema, options, task_id, json_options)
306
+ task_options = check_schema(IngestTaskStoreSchema, options, task_id, json_options)
241
307
  new_task_id = f"{task_id}"
242
308
  new_task = [(new_task_id, StoreTask(**task_options.model_dump()))]
243
309
  elif task_id == "store_embedding":
244
- task_options = check_schema(StoreEmbedTaskSchema, options, task_id, json_options)
310
+ task_options = check_schema(IngestTaskStoreEmbedSchema, options, task_id, json_options)
245
311
  new_task_id = f"{task_id}"
246
312
  new_task = [(new_task_id, StoreEmbedTask(**task_options.model_dump()))]
247
313
  elif task_id == "caption":
248
- task_options = check_schema(CaptionTaskSchema, options, task_id, json_options)
314
+ task_options = check_schema(IngestTaskCaptionSchema, options, task_id, json_options)
249
315
  new_task_id = f"{task_id}"
250
- new_task = [(new_task_id, CaptionTask(**task_options.model_dump()))]
316
+ # Extract individual parameters from API schema for CaptionTask constructor
317
+ caption_params = {
318
+ "api_key": task_options.api_key,
319
+ "endpoint_url": task_options.endpoint_url,
320
+ "prompt": task_options.prompt,
321
+ "model_name": task_options.model_name,
322
+ }
323
+ new_task = [(new_task_id, CaptionTask(**caption_params))]
251
324
  elif task_id == "dedup":
252
- task_options = check_schema(DedupTaskSchema, options, task_id, json_options)
325
+ task_options = check_schema(IngestTaskDedupSchema, options, task_id, json_options)
253
326
  new_task_id = f"{task_id}"
254
- new_task = [(new_task_id, DedupTask(**task_options.model_dump()))]
327
+ # Extract individual parameters from API schema for DedupTask constructor
328
+ dedup_params = {
329
+ "content_type": task_options.content_type,
330
+ "filter": task_options.params.filter,
331
+ }
332
+ new_task = [(new_task_id, DedupTask(**dedup_params))]
255
333
  elif task_id == "filter":
256
- task_options = check_schema(FilterTaskSchema, options, task_id, json_options)
334
+ task_options = check_schema(IngestTaskFilterSchema, options, task_id, json_options)
257
335
  new_task_id = f"{task_id}"
258
- new_task = [(new_task_id, FilterTask(**task_options.model_dump()))]
336
+ # Extract individual parameters from API schema for FilterTask constructor
337
+ filter_params = {
338
+ "content_type": task_options.content_type,
339
+ "min_size": task_options.params.min_size,
340
+ "max_aspect_ratio": task_options.params.max_aspect_ratio,
341
+ "min_aspect_ratio": task_options.params.min_aspect_ratio,
342
+ "filter": task_options.params.filter,
343
+ }
344
+ new_task = [(new_task_id, FilterTask(**filter_params))]
259
345
  elif task_id == "embed":
260
- task_options = check_schema(EmbedTaskSchema, options, task_id, json_options)
346
+ task_options = check_schema(IngestTaskEmbedSchema, options, task_id, json_options)
261
347
  new_task_id = f"{task_id}"
262
348
  new_task = [(new_task_id, EmbedTask(**task_options.model_dump()))]
349
+ elif task_id == "infographic":
350
+ task_options = check_schema(IngestTaskInfographicExtraction, options, task_id, json_options)
351
+ new_task_id = f"{task_id}"
352
+ new_task = [(new_task_id, InfographicExtractionTask(**task_options.model_dump()))]
353
+ elif task_id == "udf":
354
+ # Validate mutual exclusivity of target_stage and phase
355
+ has_target_stage = "target_stage" in options and options["target_stage"] is not None
356
+ has_phase = "phase" in options and options["phase"] is not None
357
+
358
+ if has_target_stage and has_phase:
359
+ raise ValueError(
360
+ "UDF task cannot specify both 'target_stage' and 'phase'. Please specify only one."
361
+ )
362
+ elif not has_target_stage and not has_phase:
363
+ raise ValueError("UDF task must specify either 'target_stage' or 'phase'.")
364
+
365
+ # Pre-process UDF task options to convert phase names to integers
366
+ if "phase" in options and isinstance(options["phase"], str):
367
+ # Convert phase string to integer using the same logic as UDFTask
368
+ phase_str = options["phase"].upper()
369
+ phase_aliases = {
370
+ "PRE_PROCESSING": PipelinePhase.PRE_PROCESSING,
371
+ "PREPROCESSING": PipelinePhase.PRE_PROCESSING,
372
+ "PRE": PipelinePhase.PRE_PROCESSING,
373
+ "EXTRACTION": PipelinePhase.EXTRACTION,
374
+ "EXTRACT": PipelinePhase.EXTRACTION,
375
+ "POST_PROCESSING": PipelinePhase.POST_PROCESSING,
376
+ "POSTPROCESSING": PipelinePhase.POST_PROCESSING,
377
+ "POST": PipelinePhase.POST_PROCESSING,
378
+ "MUTATION": PipelinePhase.MUTATION,
379
+ "MUTATE": PipelinePhase.MUTATION,
380
+ "TRANSFORM": PipelinePhase.TRANSFORM,
381
+ "RESPONSE": PipelinePhase.RESPONSE,
382
+ "RESP": PipelinePhase.RESPONSE,
383
+ }
384
+
385
+ if phase_str in phase_aliases:
386
+ options["phase"] = phase_aliases[phase_str].value
387
+ else:
388
+ raise ValueError(f"Invalid phase name: {options['phase']}")
389
+
390
+ # Try to infer udf_function_name if not provided
391
+ if "udf_function_name" not in options or not options["udf_function_name"]:
392
+ udf_function = options.get("udf_function", "")
393
+ if udf_function:
394
+ inferred_name = infer_udf_function_name(udf_function)
395
+ if inferred_name:
396
+ options["udf_function_name"] = inferred_name
397
+ logger.info(f"Inferred UDF function name: {inferred_name}")
398
+ else:
399
+ raise ValueError(
400
+ f"Could not infer UDF function name from '{udf_function}'. "
401
+ "Please specify 'udf_function_name' explicitly."
402
+ )
403
+
404
+ task_options = check_schema(IngestTaskUDFSchema, options, task_id, json_options)
405
+ new_task_id = f"{task_id}"
406
+ new_task = [(new_task_id, UDFTask(**task_options.model_dump()))]
263
407
  else:
264
408
  raise ValueError(f"Unsupported task type: {task_id}")
265
409
 
410
+ # Check for duplicate tasks - now allowing multiple tasks of the same type
266
411
  if new_task_id in validated_tasks:
267
- raise ValueError(f"Duplicate task detected: {new_task_id}")
412
+ logger.debug(f"Multiple tasks detected for {new_task_id}, storing as list")
268
413
 
269
414
  logger.debug("Adding task: %s", new_task_id)
270
415
  for task_tuple in new_task:
271
- validated_tasks[task_tuple[0]] = task_tuple[1]
416
+ if task_tuple[0] in validated_tasks:
417
+ # Convert single task to list if needed, then append
418
+ existing_task = validated_tasks[task_tuple[0]]
419
+ if not isinstance(existing_task, list):
420
+ validated_tasks[task_tuple[0]] = [existing_task]
421
+ validated_tasks[task_tuple[0]].append(task_tuple[1])
422
+ else:
423
+ validated_tasks[task_tuple[0]] = task_tuple[1]
272
424
  except ValueError as e:
273
425
  validation_errors.append(str(e))
274
426
 
@@ -27,6 +27,16 @@ from typing import Union
27
27
  from urllib.parse import urlparse
28
28
 
29
29
  import fsspec
30
+ from nv_ingest_api.internal.enums.common import PipelinePhase
31
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
32
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
33
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
34
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskExtractSchema
35
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskFilterSchema
36
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskSplitSchema
37
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
38
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
39
+ from nv_ingest_api.util.introspection.function_inspect import infer_udf_function_name
30
40
  from nv_ingest_client.client.client import NvIngestClient
31
41
  from nv_ingest_client.client.util.processing import get_valid_filename
32
42
  from nv_ingest_client.client.util.processing import save_document_results_to_jsonl
@@ -38,16 +48,9 @@ from nv_ingest_client.primitives.tasks import EmbedTask
38
48
  from nv_ingest_client.primitives.tasks import ExtractTask
39
49
  from nv_ingest_client.primitives.tasks import FilterTask
40
50
  from nv_ingest_client.primitives.tasks import SplitTask
41
- from nv_ingest_client.primitives.tasks import StoreEmbedTask
42
51
  from nv_ingest_client.primitives.tasks import StoreTask
43
- from nv_ingest_client.primitives.tasks.caption import CaptionTaskSchema
44
- from nv_ingest_client.primitives.tasks.dedup import DedupTaskSchema
45
- from nv_ingest_client.primitives.tasks.embed import EmbedTaskSchema
46
- from nv_ingest_client.primitives.tasks.extract import ExtractTaskSchema
47
- from nv_ingest_client.primitives.tasks.filter import FilterTaskSchema
48
- from nv_ingest_client.primitives.tasks.split import SplitTaskSchema
49
- from nv_ingest_client.primitives.tasks.store import StoreEmbedTaskSchema
50
- from nv_ingest_client.primitives.tasks.store import StoreTaskSchema
52
+ from nv_ingest_client.primitives.tasks import StoreEmbedTask
53
+ from nv_ingest_client.primitives.tasks import UDFTask
51
54
  from nv_ingest_client.util.processing import check_schema
52
55
  from nv_ingest_client.util.system import ensure_directory_with_permissions
53
56
  from nv_ingest_client.util.util import filter_function_kwargs
@@ -436,7 +439,7 @@ class Ingestor:
436
439
 
437
440
  final_results_payload_list: Union[List[List[Dict[str, Any]]], List[LazyLoadedList]] = []
438
441
 
439
- # Lock for thread-safe appends to final_results_payload_list by I/O tasks
442
+ # Lock for thread-safe appending to final_results_payload_list by I/O tasks
440
443
  results_lock = threading.Lock() if self._output_config else None
441
444
 
442
445
  io_executor: Optional[ThreadPoolExecutor] = None
@@ -698,8 +701,23 @@ class Ingestor:
698
701
  Ingestor
699
702
  Returns self for chaining.
700
703
  """
701
- task_options = check_schema(DedupTaskSchema, kwargs, "dedup", json.dumps(kwargs))
702
- dedup_task = DedupTask(**task_options.model_dump())
704
+ # Extract content_type and build params dict for API schema
705
+ content_type = kwargs.pop("content_type", "text") # Default to "text" if not specified
706
+ params = kwargs # Remaining parameters go into params dict
707
+
708
+ # Validate with API schema
709
+ api_options = {
710
+ "content_type": content_type,
711
+ "params": params,
712
+ }
713
+ task_options = check_schema(IngestTaskDedupSchema, api_options, "dedup", json.dumps(api_options))
714
+
715
+ # Extract individual parameters from API schema for DedupTask constructor
716
+ dedup_params = {
717
+ "content_type": task_options.content_type,
718
+ "filter": task_options.params.filter,
719
+ }
720
+ dedup_task = DedupTask(**dedup_params)
703
721
  self._job_specs.add_task(dedup_task)
704
722
 
705
723
  return self
@@ -719,8 +737,14 @@ class Ingestor:
719
737
  Ingestor
720
738
  Returns self for chaining.
721
739
  """
722
- task_options = check_schema(EmbedTaskSchema, kwargs, "embed", json.dumps(kwargs))
723
- embed_task = EmbedTask(**task_options.model_dump())
740
+ # Filter out deprecated parameters before API schema validation
741
+ # The EmbedTask constructor handles these deprecated parameters with warnings
742
+ filtered_kwargs = {k: v for k, v in kwargs.items() if k not in ["text", "tables"]}
743
+
744
+ _ = check_schema(IngestTaskEmbedSchema, filtered_kwargs, "embed", json.dumps(filtered_kwargs))
745
+
746
+ # Pass original kwargs to EmbedTask constructor so it can handle deprecated parameters
747
+ embed_task = EmbedTask(**kwargs)
724
748
  self._job_specs.add_task(embed_task)
725
749
 
726
750
  return self
@@ -767,9 +791,52 @@ class Ingestor:
767
791
  extract_page_as_image=extract_page_as_image,
768
792
  **kwargs,
769
793
  )
770
- task_options = check_schema(ExtractTaskSchema, task_options, "extract", json.dumps(task_options))
771
794
 
772
- extract_task = ExtractTask(**task_options.model_dump())
795
+ # Extract method from task_options for API schema
796
+ method = task_options.pop("extract_method", None)
797
+ if method is None:
798
+ # Let ExtractTask constructor handle default method selection
799
+ method = "pdfium" # Default fallback
800
+
801
+ # Build params dict for API schema
802
+ params = {k: v for k, v in task_options.items() if k != "document_type"}
803
+
804
+ # Map document type to API schema expected values
805
+ # Handle common file extension to DocumentTypeEnum mapping
806
+ document_type_mapping = {
807
+ "txt": "text",
808
+ "md": "text",
809
+ "sh": "text",
810
+ "json": "text",
811
+ "jpg": "jpeg",
812
+ "jpeg": "jpeg",
813
+ "png": "png",
814
+ "pdf": "pdf",
815
+ "docx": "docx",
816
+ "pptx": "pptx",
817
+ "html": "html",
818
+ "bmp": "bmp",
819
+ "tiff": "tiff",
820
+ "svg": "svg",
821
+ "mp3": "mp3",
822
+ "wav": "wav",
823
+ }
824
+
825
+ # Use mapped document type for API schema validation
826
+ api_document_type = document_type_mapping.get(document_type.lower(), document_type)
827
+
828
+ # Validate with API schema
829
+ api_task_options = {
830
+ "document_type": api_document_type,
831
+ "method": method,
832
+ "params": params,
833
+ }
834
+
835
+ check_schema(IngestTaskExtractSchema, api_task_options, "extract", json.dumps(api_task_options))
836
+
837
+ # Create ExtractTask with mapped document type for API schema compatibility
838
+ extract_task_params = {"document_type": api_document_type, "extract_method": method, **params}
839
+ extract_task = ExtractTask(**extract_task_params)
773
840
  self._job_specs.add_task(extract_task, document_type=document_type)
774
841
 
775
842
  return self
@@ -789,8 +856,27 @@ class Ingestor:
789
856
  Ingestor
790
857
  Returns self for chaining.
791
858
  """
792
- task_options = check_schema(FilterTaskSchema, kwargs, "filter", json.dumps(kwargs))
793
- filter_task = FilterTask(**task_options.model_dump())
859
+ # Restructure parameters to match API schema structure
860
+ params_fields = {"min_size", "max_aspect_ratio", "min_aspect_ratio", "filter"}
861
+ params = {k: v for k, v in kwargs.items() if k in params_fields}
862
+ top_level = {k: v for k, v in kwargs.items() if k not in params_fields}
863
+
864
+ # Build API schema structure
865
+ api_kwargs = top_level.copy()
866
+ if params:
867
+ api_kwargs["params"] = params
868
+
869
+ task_options = check_schema(IngestTaskFilterSchema, api_kwargs, "filter", json.dumps(api_kwargs))
870
+
871
+ # Extract individual parameters from API schema for FilterTask constructor
872
+ filter_params = {
873
+ "content_type": task_options.content_type,
874
+ "min_size": task_options.params.min_size,
875
+ "max_aspect_ratio": task_options.params.max_aspect_ratio,
876
+ "min_aspect_ratio": task_options.params.min_aspect_ratio,
877
+ "filter": task_options.params.filter,
878
+ }
879
+ filter_task = FilterTask(**filter_params)
794
880
  self._job_specs.add_task(filter_task)
795
881
 
796
882
  return self
@@ -810,7 +896,7 @@ class Ingestor:
810
896
  Ingestor
811
897
  Returns self for chaining.
812
898
  """
813
- task_options = check_schema(SplitTaskSchema, kwargs, "split", json.dumps(kwargs))
899
+ task_options = check_schema(IngestTaskSplitSchema, kwargs, "split", json.dumps(kwargs))
814
900
  extract_task = SplitTask(**task_options.model_dump())
815
901
  self._job_specs.add_task(extract_task)
816
902
 
@@ -831,8 +917,24 @@ class Ingestor:
831
917
  Ingestor
832
918
  Returns self for chaining.
833
919
  """
834
- task_options = check_schema(StoreTaskSchema, kwargs, "store", json.dumps(kwargs))
835
- store_task = StoreTask(**task_options.model_dump())
920
+ # Handle parameter name mapping: store_method -> method for API schema
921
+ if "store_method" in kwargs:
922
+ kwargs["method"] = kwargs.pop("store_method")
923
+
924
+ # Provide default method if not specified (matching client StoreTask behavior)
925
+ if "method" not in kwargs:
926
+ kwargs["method"] = "minio"
927
+
928
+ task_options = check_schema(IngestTaskStoreSchema, kwargs, "store", json.dumps(kwargs))
929
+
930
+ # Map API schema fields back to StoreTask constructor parameters
931
+ store_params = {
932
+ "structured": task_options.structured,
933
+ "images": task_options.images,
934
+ "store_method": task_options.method, # Map method back to store_method
935
+ "params": task_options.params,
936
+ }
937
+ store_task = StoreTask(**store_params)
836
938
  self._job_specs.add_task(store_task)
837
939
 
838
940
  return self
@@ -840,24 +942,97 @@ class Ingestor:
840
942
  @ensure_job_specs
841
943
  def store_embed(self, **kwargs: Any) -> "Ingestor":
842
944
  """
843
- Adds a StoreTask to the batch job specification.
945
+ Adds a StoreEmbedTask to the batch job specification.
844
946
 
845
947
  Parameters
846
948
  ----------
847
949
  kwargs : dict
848
- Parameters specific to the StoreTask.
950
+ Parameters specific to the StoreEmbedTask.
849
951
 
850
952
  Returns
851
953
  -------
852
954
  Ingestor
853
955
  Returns self for chaining.
854
956
  """
855
- task_options = check_schema(StoreEmbedTaskSchema, kwargs, "store_embedding", json.dumps(kwargs))
957
+ task_options = check_schema(IngestTaskStoreEmbedSchema, kwargs, "store_embedding", json.dumps(kwargs))
856
958
  store_task = StoreEmbedTask(**task_options.model_dump())
857
959
  self._job_specs.add_task(store_task)
858
960
 
859
961
  return self
860
962
 
963
+ def udf(
964
+ self,
965
+ udf_function: str,
966
+ udf_function_name: Optional[str] = None,
967
+ phase: Optional[Union[PipelinePhase, int, str]] = None,
968
+ target_stage: Optional[str] = None,
969
+ run_before: bool = False,
970
+ run_after: bool = False,
971
+ ) -> "Ingestor":
972
+ """
973
+ Adds a UDFTask to the batch job specification.
974
+
975
+ Parameters
976
+ ----------
977
+ udf_function : str
978
+ UDF specification. Supports three formats:
979
+ 1. Inline function: 'def my_func(control_message): ...'
980
+ 2. Import path: 'my_module.my_function'
981
+ 3. File path: '/path/to/file.py:function_name'
982
+ udf_function_name : str, optional
983
+ Name of the function to execute from the UDF specification.
984
+ If not provided, attempts to infer from udf_function.
985
+ phase : Union[PipelinePhase, int, str], optional
986
+ Pipeline phase to execute UDF. Accepts phase names ('extract', 'split', 'embed', 'response')
987
+ or numbers (1-4). Cannot be used with target_stage.
988
+ target_stage : str, optional
989
+ Specific stage name to target for UDF execution. Cannot be used with phase.
990
+ run_before : bool, optional
991
+ If True and target_stage is specified, run UDF before the target stage. Default: False.
992
+ run_after : bool, optional
993
+ If True and target_stage is specified, run UDF after the target stage. Default: False.
994
+
995
+ Returns
996
+ -------
997
+ Ingestor
998
+ Returns self for chaining.
999
+
1000
+ Raises
1001
+ ------
1002
+ ValueError
1003
+ If udf_function_name cannot be inferred and is not provided explicitly,
1004
+ or if both phase and target_stage are specified, or if neither is specified.
1005
+ """
1006
+ # Validate mutual exclusivity of phase and target_stage
1007
+ if phase is not None and target_stage is not None:
1008
+ raise ValueError("Cannot specify both 'phase' and 'target_stage'. Please specify only one.")
1009
+ elif phase is None and target_stage is None:
1010
+ # Default to response phase for backward compatibility
1011
+ phase = PipelinePhase.RESPONSE
1012
+
1013
+ # Try to infer udf_function_name if not provided
1014
+ if udf_function_name is None:
1015
+ udf_function_name = infer_udf_function_name(udf_function)
1016
+ if udf_function_name is None:
1017
+ raise ValueError(
1018
+ f"Could not infer UDF function name from '{udf_function}'. "
1019
+ "Please specify 'udf_function_name' explicitly."
1020
+ )
1021
+ logger.info(f"Inferred UDF function name: {udf_function_name}")
1022
+
1023
+ # Use UDFTask constructor with explicit parameters
1024
+ udf_task = UDFTask(
1025
+ udf_function=udf_function,
1026
+ udf_function_name=udf_function_name,
1027
+ phase=phase,
1028
+ target_stage=target_stage,
1029
+ run_before=run_before,
1030
+ run_after=run_after,
1031
+ )
1032
+ self._job_specs.add_task(udf_task)
1033
+
1034
+ return self
1035
+
861
1036
  def vdb_upload(self, purge_results_after_upload: bool = True, **kwargs: Any) -> "Ingestor":
862
1037
  """
863
1038
  Adds a VdbUploadTask to the batch job specification.
@@ -986,8 +1161,16 @@ class Ingestor:
986
1161
  Ingestor
987
1162
  Returns self for chaining.
988
1163
  """
989
- task_options = check_schema(CaptionTaskSchema, kwargs, "caption", json.dumps(kwargs))
990
- caption_task = CaptionTask(**task_options.model_dump())
1164
+ task_options = check_schema(IngestTaskCaptionSchema, kwargs, "caption", json.dumps(kwargs))
1165
+
1166
+ # Extract individual parameters from API schema for CaptionTask constructor
1167
+ caption_params = {
1168
+ "api_key": task_options.api_key,
1169
+ "endpoint_url": task_options.endpoint_url,
1170
+ "prompt": task_options.prompt,
1171
+ "model_name": task_options.model_name,
1172
+ }
1173
+ caption_task = CaptionTask(**caption_params)
991
1174
  self._job_specs.add_task(caption_task)
992
1175
 
993
1176
  return self
@@ -169,6 +169,22 @@ Tasks and Options:
169
169
  - split_length (int): Segment length. No default.
170
170
  - split_overlap (int): Segment overlap. No default.
171
171
  \b
172
+ - udf: Executes user-defined functions (UDFs) for custom processing logic.
173
+ Options:
174
+ - udf_function (str): UDF specification. Supports three formats:
175
+ 1. Inline function: 'def my_func(control_message): ...'
176
+ 2. Import path: 'my_module.my_function'
177
+ 3. File path: '/path/to/file.py:function_name' or '/path/to/file.py' (assumes 'process' function)
178
+ - udf_function_name (str): Name of the function to execute from the UDF specification. Required.
179
+ - target_stage (str): Specific pipeline stage name to target for UDF execution (e.g.,
180
+ 'text_extractor', 'text_embedder', 'image_extractor'). Cannot be used with phase.
181
+ - run_before (bool): If True and target_stage is specified, run UDF before the target stage. Default: False.
182
+ - run_after (bool): If True and target_stage is specified, run UDF after the target stage. Default: False.
183
+ Examples:
184
+ --task 'udf:{"udf_function": "my_file.py:my_func", "target_stage": "text_embedder", "run_before": true}'
185
+ --task 'udf:{"udf_function": "def process(cm): return cm",
186
+ "target_stage": "image_extractor", "run_after": true}'
187
+ \b
172
188
  Note: The 'extract_method' automatically selects the optimal method based on 'document_type' if not explicitly stated.
173
189
  """,
174
190
  )