nv-ingest-client 2025.9.26.dev20250926__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-client might be problematic.

@@ -6,6 +6,7 @@
 
 import collections
 import glob
+import gzip
 import json
 import logging
 import os
@@ -53,7 +54,7 @@ from nv_ingest_client.primitives.tasks import StoreEmbedTask
 from nv_ingest_client.primitives.tasks import UDFTask
 from nv_ingest_client.util.processing import check_schema
 from nv_ingest_client.util.system import ensure_directory_with_permissions
-from nv_ingest_client.util.util import filter_function_kwargs
+from nv_ingest_client.util.util import filter_function_kwargs, apply_pdf_split_config_to_job_specs
 from nv_ingest_client.util.vdb import VDB, get_vdb_op_cls
 from tqdm import tqdm
 
@@ -93,17 +94,20 @@ def ensure_job_specs(func):
 
 
 class LazyLoadedList(collections.abc.Sequence):
-    def __init__(self, filepath: str, expected_len: Optional[int] = None):
+    def __init__(self, filepath: str, expected_len: Optional[int] = None, compression: Optional[str] = None):
         self.filepath = filepath
         self._len: Optional[int] = expected_len  # Store pre-calculated length
         self._offsets: Optional[List[int]] = None
+        self.compression = compression
 
         if self._len == 0:
             self._offsets = []
 
+        self._open = gzip.open if self.compression == "gzip" else open
+
     def __iter__(self) -> Iterator[Any]:
         try:
-            with open(self.filepath, "r", encoding="utf-8") as f:
+            with self._open(self.filepath, "rt", encoding="utf-8") as f:
                 for line in f:
                     yield json.loads(line)
         except FileNotFoundError:
@@ -120,7 +124,7 @@ class LazyLoadedList(collections.abc.Sequence):
         self._offsets = []
         line_count = 0
         try:
-            with open(self.filepath, "rb") as f:
+            with self._open(self.filepath, "rb") as f:
                 while True:
                     current_pos = f.tell()
                     line = f.readline()
@@ -144,10 +148,12 @@ class LazyLoadedList(collections.abc.Sequence):
     def __len__(self) -> int:
         if self._len is not None:
             return self._len
+
         if self._offsets is not None:
             self._len = len(self._offsets)
             return self._len
         self._build_index()
+
         return self._len if self._len is not None else 0
 
     def __getitem__(self, idx: int) -> Any:
@@ -170,7 +176,7 @@ class LazyLoadedList(collections.abc.Sequence):
             raise IndexError(f"Index {idx} out of range for {self.filepath} (len: {len(self._offsets)})")
 
         try:
-            with open(self.filepath, "rb") as f:
+            with self._open(self.filepath, "rb") as f:
                 f.seek(self._offsets[idx])
                 line_bytes = f.readline()
                 return json.loads(line_bytes.decode("utf-8"))
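The hunks above extend LazyLoadedList so it can lazily read gzip-compressed result files. A minimal usage sketch, assuming a results file already exists on disk; the import path and filename are illustrative and not confirmed by this diff:

# Sketch only: module path and filename are assumptions.
from nv_ingest_client.client.interface import LazyLoadedList

results = LazyLoadedList("report.pdf.results.jsonl.gz", compression="gzip")
print(len(results))       # builds the byte-offset index on first use
first_item = results[0]   # random access seeks to the stored offset
for item in results:      # streaming iteration; one JSON object per line
    pass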
@@ -396,15 +402,9 @@ class Ingestor:
         show_progress: bool = False,
         return_failures: bool = False,
         save_to_disk: bool = False,
+        return_traces: bool = False,
         **kwargs: Any,
-    ) -> Union[
-        List[List[Dict[str, Any]]],  # In-memory: List of (response['data'] for each doc)
-        List[LazyLoadedList],  # Disk: List of proxies, one per original doc
-        Tuple[
-            Union[List[List[Dict[str, Any]]], List[LazyLoadedList]],
-            List[Tuple[str, str]],
-        ],
-    ]:  # noqa: E501
+    ) -> Union[List[Any], Tuple[Any, ...]]:
         """
         Ingest documents by submitting jobs and fetching results concurrently.
 
@@ -414,22 +414,36 @@
             Whether to display a progress bar. Default is False.
         return_failures : bool, optional
             If True, return a tuple (results, failures); otherwise, return only results. Default is False.
+        save_to_disk : bool, optional
+            If True, save results to disk and return LazyLoadedList proxies. Default is False.
+        return_traces : bool, optional
+            If True, return trace metrics alongside results. Default is False.
+            Traces contain timing metrics (entry, exit, resident_time) for each stage.
         **kwargs : Any
-            Additional keyword arguments for the underlying client methods. Supported keys:
-            'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
-            'data_only', 'verbose'. Unrecognized keys are passed through to
-            process_jobs_concurrently.
+            Additional keyword arguments for the underlying client methods.
+            Optional flags include `include_parent_trace_ids=True` to also return
+            parent job trace identifiers (V2 API only).
 
         Returns
         -------
-        results : list of dict
-            List of successful job results when `return_failures` is False.
-        results, failures : tuple (list of dict, list of tuple of str)
-            Tuple containing successful results and failure information when `return_failures` is True.
+        list or tuple
+            Returns vary based on flags:
+            - Default: list of results
+            - return_failures=True: (results, failures)
+            - return_traces=True: (results, traces)
+            - return_failures=True, return_traces=True: (results, failures, traces)
+            - Additional combinations with include_parent_trace_ids kwarg
+
+        Notes
+        -----
+        Trace metrics include timing data for each processing stage. For detailed
+        usage and examples, see src/nv_ingest/api/v2/README.md
         """
         if save_to_disk and (not self._output_config):
             self.save_to_disk()
 
+        include_parent_trace_ids = bool(kwargs.pop("include_parent_trace_ids", False))
+
         self._prepare_ingest_run()
 
         # Add jobs locally first
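The docstring above enumerates the return shapes of Ingestor.ingest(). A sketch of the documented combinations, assuming `ingestor` is an already-configured Ingestor (construction and task setup are not shown in this diff):

results = ingestor.ingest()                                         # list of results
results, failures = ingestor.ingest(return_failures=True)           # (results, failures)
results, traces = ingestor.ingest(return_traces=True)               # (results, traces)
results, failures, traces = ingestor.ingest(return_failures=True, return_traces=True)

# V2 API only: parent job trace identifiers are appended last when requested.
results, failures, traces, parent_ids = ingestor.ingest(
    return_failures=True, return_traces=True, include_parent_trace_ids=True
)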
@@ -455,6 +469,8 @@
             clean_source_basename = get_valid_filename(os.path.basename(source_name))
             file_name, file_ext = os.path.splitext(clean_source_basename)
             file_suffix = f".{file_ext.strip('.')}.results.jsonl"
+            if self._output_config["compression"] == "gzip":
+                file_suffix += ".gz"
             jsonl_filepath = os.path.join(output_dir, safe_filename(output_dir, file_name, file_suffix))
 
             num_items_saved = save_document_results_to_jsonl(
@@ -462,10 +478,13 @@
                 jsonl_filepath,
                 source_name,
                 ensure_parent_dir_exists=False,
+                compression=self._output_config["compression"],
             )
 
             if num_items_saved > 0:
-                results = LazyLoadedList(jsonl_filepath, expected_len=num_items_saved)
+                results = LazyLoadedList(
+                    jsonl_filepath, expected_len=num_items_saved, compression=self._output_config["compression"]
+                )
                 if results_lock:
                     with results_lock:
                         final_results_payload_list.append(results)
@@ -538,7 +557,24 @@
 
         proc_kwargs = filter_function_kwargs(self._client.process_jobs_concurrently, **kwargs)
 
-        results, failures = self._client.process_jobs_concurrently(
+        # Telemetry controls (optional)
+        enable_telemetry: Optional[bool] = kwargs.pop("enable_telemetry", None)
+        show_telemetry: Optional[bool] = kwargs.pop("show_telemetry", None)
+        if show_telemetry is None:
+            # Fallback to env NV_INGEST_CLIENT_SHOW_TELEMETRY (0/1), default off
+            try:
+                show_telemetry = bool(int(os.getenv("NV_INGEST_CLIENT_SHOW_TELEMETRY", "0")))
+            except ValueError:
+                show_telemetry = False
+        # If user explicitly wants to show telemetry but did not specify enable_telemetry,
+        # ensure collection is enabled so summary isn't empty.
+        if enable_telemetry is None and show_telemetry:
+            enable_telemetry = True
+        if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
+            self._client.enable_telemetry(bool(enable_telemetry))
+
+        # Call process_jobs_concurrently
+        proc_result = self._client.process_jobs_concurrently(
             job_indices=self._job_ids,
             job_queue_id=self._job_queue_id,
             timeout=timeout,
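The telemetry block above reads its flags from ingest(**kwargs) or falls back to an environment variable. A sketch of both paths, assuming the same `ingestor` object as in the earlier example:

import os

# Environment fallback: "1" prints the summary after ingestion, "0" (default) does not.
os.environ["NV_INGEST_CLIENT_SHOW_TELEMETRY"] = "1"
results = ingestor.ingest()

# Explicit kwargs override the environment; requesting the summary also enables collection.
results = ingestor.ingest(show_telemetry=True)
results = ingestor.ingest(enable_telemetry=True, show_telemetry=False)  # collect silently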
@@ -547,9 +583,17 @@
             return_failures=True,
             stream_to_callback_only=stream_to_callback_only,
             verbose=verbose,
+            return_traces=return_traces,
             **proc_kwargs,
         )
 
+        # Unpack result based on return_traces flag
+        if return_traces:
+            results, failures, traces_list = proc_result
+        else:
+            results, failures = proc_result
+            traces_list = []  # Empty list when traces not requested
+
         if show_progress and pbar:
             pbar.close()
 
@@ -600,7 +644,30 @@
             logger.info("Purging saved results from disk after successful VDB upload.")
             self._purge_saved_results(results)
 
-        return (results, failures) if return_failures else results
+        # Print telemetry summary if requested
+        if show_telemetry:
+            try:
+                summary = self._client.summarize_telemetry()
+                # Print to stdout and log for convenience
+                print("NvIngestClient Telemetry Summary:", json.dumps(summary, indent=2))
+                logger.info("NvIngestClient Telemetry Summary: %s", json.dumps(summary, indent=2))
+            except Exception:
+                pass
+
+        parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
+
+        # Build return tuple based on requested outputs
+        # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
+        returns = [results]
+
+        if return_failures:
+            returns.append(failures)
+        if return_traces:
+            returns.append(traces_list)
+        if include_parent_trace_ids:
+            returns.append(parent_trace_ids)
+
+        return tuple(returns) if len(returns) > 1 else results
 
     def ingest_async(self, **kwargs: Any) -> Future:
         """
@@ -1068,6 +1135,7 @@
         self,
         output_directory: Optional[str] = None,
         cleanup: bool = True,
+        compression: Optional[str] = "gzip",
     ) -> "Ingestor":
         """Configures the Ingestor to save results to disk instead of memory.
 
@@ -1092,6 +1160,12 @@
             when the Ingestor's context is exited (i.e., when used in a `with`
             statement).
             Defaults to True.
+        compression : str, optional
+            The compression algorithm to use for the saved result files.
+            Currently, the only supported value is `'gzip'`. To disable
+            compression, set this parameter to `None`. Defaults to `'gzip'`,
+            which significantly reduces the disk space required for results.
+            When enabled, files are saved with a `.gz` suffix (e.g., `results.jsonl.gz`).
 
         Returns
         -------
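A short sketch of the new compression parameter on save_to_disk(); the output directory is illustrative:

ingestor = ingestor.save_to_disk(output_directory="./results")                    # gzip by default -> *.results.jsonl.gz
ingestor = ingestor.save_to_disk(output_directory="./results", compression=None)  # plain *.results.jsonl
results = ingestor.ingest(save_to_disk=True)   # returns LazyLoadedList proxies instead of in-memory lists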
@@ -1107,6 +1181,7 @@
         self._output_config = {
             "output_directory": output_directory,
             "cleanup": cleanup,
+            "compression": compression,
         }
         ensure_directory_with_permissions(output_directory)
 
@@ -1175,6 +1250,44 @@
 
         return self
 
+    @ensure_job_specs
+    def pdf_split_config(self, pages_per_chunk: int = 32) -> "Ingestor":
+        """
+        Configure PDF splitting behavior for V2 API.
+
+        Parameters
+        ----------
+        pages_per_chunk : int, optional
+            Number of pages per PDF chunk (default: 32)
+            Server enforces boundaries: min=1, max=128
+
+        Returns
+        -------
+        Ingestor
+            Self for method chaining
+
+        Notes
+        -----
+        - Only affects V2 API endpoints with PDF splitting support
+        - Server will clamp values outside [1, 128] range
+        - Smaller chunks = more parallelism but more overhead
+        - Larger chunks = less overhead but reduced concurrency
+        """
+        MIN_PAGES = 1
+        MAX_PAGES = 128
+
+        # Warn if value will be clamped by server
+        if pages_per_chunk < MIN_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} is below minimum. Server will clamp to {MIN_PAGES}.")
+        elif pages_per_chunk > MAX_PAGES:
+            logger.warning(f"pages_per_chunk={pages_per_chunk} exceeds maximum. Server will clamp to {MAX_PAGES}.")
+
+        # Flatten all job specs and apply PDF config using shared utility
+        all_job_specs = [spec for job_specs in self._job_specs._file_type_to_job_spec.values() for spec in job_specs]
+        apply_pdf_split_config_to_job_specs(all_job_specs, pages_per_chunk)
+
+        return self
+
     def _count_job_states(self, job_states: set[JobStateEnum]) -> int:
         """
         Counts the jobs in specified states.
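A sketch of the new pdf_split_config() step in a chained call; because of the @ensure_job_specs decorator it must run after documents have been added, and `ingestor` is assumed to be configured already:

ingestor = ingestor.pdf_split_config(pages_per_chunk=64)  # values outside [1, 128] are clamped server-side
results = ingestor.ingest()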
@@ -1,3 +1,4 @@
+import gzip
 import io
 import json
 import logging
@@ -6,6 +7,7 @@ import re
 from typing import Any
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Tuple
 
 try:
@@ -33,6 +35,7 @@ def save_document_results_to_jsonl(
     jsonl_output_filepath: str,
     original_source_name_for_log: str,
     ensure_parent_dir_exists: bool = True,
+    compression: Optional[str] = None,
 ) -> Tuple[int, Dict[str, str]]:
     """
     Saves a list of extraction items (for a single source document) to a JSON Lines file.
@@ -50,6 +53,13 @@
         if parent_dir:
             os.makedirs(parent_dir, exist_ok=True)
 
+    if compression == "gzip":
+        open_func = gzip.open
+    elif compression is None:
+        open_func = open
+    else:
+        raise ValueError(f"Unsupported compression type: {compression}")
+
     with io.BytesIO() as buffer:
         for extraction_item in doc_response_data:
             if USING_ORJSON:
@@ -60,7 +70,7 @@
 
     count_items_written = len(doc_response_data)
 
-    with open(jsonl_output_filepath, "wb") as f_jsonl:
+    with open_func(jsonl_output_filepath, "wb") as f_jsonl:
         f_jsonl.write(full_byte_content)
 
     logger.info(
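A sketch of calling save_document_results_to_jsonl with the new compression parameter. The data variable and file names are illustrative, and the first (data) argument is passed positionally because its parameter name is not visible in this diff:

save_document_results_to_jsonl(
    extraction_items,                                     # list of JSON-serializable dicts for one source document
    jsonl_output_filepath="report.pdf.results.jsonl.gz",
    original_source_name_for_log="report.pdf",
    ensure_parent_dir_exists=True,
    compression="gzip",   # None writes uncompressed JSONL; any other value raises ValueError
)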
@@ -25,10 +25,10 @@ from nv_ingest_client.cli.util.click import click_match_and_validate_files
 from nv_ingest_client.cli.util.click import click_validate_batch_size
 from nv_ingest_client.cli.util.click import click_validate_file_exists
 from nv_ingest_client.cli.util.click import click_validate_task
-from nv_ingest_client.cli.util.processing import create_and_process_jobs
 from nv_ingest_client.cli.util.processing import report_statistics
 from nv_ingest_client.cli.util.system import configure_logging
 from nv_ingest_client.client import NvIngestClient
+from nv_ingest_client.client.ingest_job_handler import IngestJobHandler
 from nv_ingest_client.util.dataset import get_dataset_files
 from nv_ingest_client.util.dataset import get_dataset_statistics
 from nv_ingest_client.util.system import ensure_directory_with_permissions
@@ -74,6 +74,12 @@ logger = logging.getLogger(__name__)
 @click.option("--client_host", default="localhost", help="DNS name or URL for the endpoint.")
 @click.option("--client_port", default=7670, type=int, help="Port for the client endpoint.")
 @click.option("--client_kwargs", help="Additional arguments to pass to the client.", default="{}")
+@click.option(
+    "--api_version",
+    default="v1",
+    type=click.Choice(["v1", "v2"], case_sensitive=False),
+    help="API version to use (v1 or v2). V2 required for PDF split page count feature.",
+)
 @click.option(
     "--client_type",
     default="rest",
@@ -119,6 +125,8 @@ Example:
     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
     --task 'embed'
     --task 'caption:{}'
+    --pdf_split_page_count 64  # Configure PDF splitting (requires --api_version v2)
+    --api_version v2  # Use V2 API for PDF splitting support
 
 \b
 Tasks and Options:
@@ -207,6 +215,12 @@ for locating portions of the system that might be bottlenecks for the overall ru
 )
 @click.option("--zipkin_host", default="localhost", help="DNS name or Zipkin API.")
 @click.option("--zipkin_port", default=9411, type=int, help="Port for the Zipkin trace API")
+@click.option(
+    "--pdf_split_page_count",
+    default=None,
+    type=int,
+    help="Number of pages per PDF chunk for splitting. Allows per-request tuning of PDF split size in v2 api.",
+)
 @click.option("--version", is_flag=True, help="Show version.")
 @click.pass_context
 def main(
  def main(
@@ -215,6 +229,7 @@ def main(
215
229
  client_host: str,
216
230
  client_kwargs: str,
217
231
  client_port: int,
232
+ api_version: str,
218
233
  client_type: str,
219
234
  concurrency_n: int,
220
235
  dataset: str,
@@ -228,6 +243,7 @@
     collect_profiling_traces: bool,
     zipkin_host: str,
     zipkin_port: int,
+    pdf_split_page_count: int,
     task: [str],
     version: [bool],
 ):
@@ -268,6 +284,10 @@
         _client_kwargs_obj = json.loads(client_kwargs)
     except Exception:
         _client_kwargs_obj = {"raw": client_kwargs}
+
+    # Merge api_version into client_kwargs
+    _client_kwargs_obj["api_version"] = api_version
+
     _sanitized_client_kwargs = sanitize_for_logging(_client_kwargs_obj)
     logging.debug(
         f"Creating message client: {client_host} and port: {client_port} -> "
@@ -285,20 +305,24 @@
         message_client_allocator=client_allocator,
         message_client_hostname=client_host,
         message_client_port=client_port,
-        message_client_kwargs=json.loads(client_kwargs),
+        message_client_kwargs=_client_kwargs_obj,
         worker_pool_size=concurrency_n,
     )
 
     start_time_ns = time.time_ns()
-    (total_files, trace_times, pages_processed, trace_ids) = create_and_process_jobs(
-        files=docs,
+    handler = IngestJobHandler(
         client=ingest_client,
+        files=docs,
         tasks=task,
         output_directory=output_directory,
        batch_size=batch_size,
         fail_on_error=fail_on_error,
         save_images_separately=save_images_separately,
+        show_progress=True,
+        show_telemetry=True,
+        pdf_split_page_count=pdf_split_page_count,
     )
+    (total_files, trace_times, pages_processed, trace_ids) = handler.run()
 
     report_statistics(start_time_ns, trace_times, pages_processed, total_files)
 
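An example invocation combining the new CLI options. The executable name and flags other than those shown in this diff are assumed from the package's existing nv-ingest-cli interface (document-input flags are omitted here):

nv-ingest-cli \
  --client_host localhost \
  --client_port 7670 \
  --api_version v2 \
  --pdf_split_page_count 64 \
  --task 'extract:{"document_type":"pdf", "extract_text":true}' \
  --task 'embed'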
@@ -110,6 +110,7 @@ class JobSpec:
             "job_id": str(self._job_id),
             "tasks": [task.to_dict() for task in self._tasks],
             "tracing_options": self._extended_options.get("tracing_options", {}),
+            "pdf_config": self._extended_options.get("pdf_config", {}),
         }
 
     @property
@@ -36,6 +36,8 @@ class EmbedTask(Task):
         image_elements_modality: Optional[str] = None,
         structured_elements_modality: Optional[str] = None,
         audio_elements_modality: Optional[str] = None,
+        custom_content_field: Optional[str] = None,
+        result_target_field: Optional[str] = None,
     ) -> None:
         """
         Initialize the EmbedTask configuration.
@@ -76,6 +78,8 @@
             image_elements_modality=image_elements_modality,
             structured_elements_modality=structured_elements_modality,
             audio_elements_modality=audio_elements_modality,
+            custom_content_field=custom_content_field,
+            result_target_field=result_target_field,
         )
 
         self._endpoint_url = validated_data.endpoint_url
@@ -86,6 +90,8 @@
         self._image_elements_modality = validated_data.image_elements_modality
         self._structured_elements_modality = validated_data.structured_elements_modality
         self._audio_elements_modality = validated_data.audio_elements_modality
+        self._custom_content_field = validated_data.custom_content_field
+        self._result_target_field = validated_data.result_target_field
 
     def __str__(self) -> str:
         """
@@ -114,6 +120,10 @@
             info += f"  structured_elements_modality: {self._structured_elements_modality}\n"
         if self._audio_elements_modality:
             info += f"  audio_elements_modality: {self._audio_elements_modality}\n"
+        if self._custom_content_field:
+            info += f"  custom_content_field: {self._custom_content_field}\n"
+        if self._result_target_field:
+            info += f"  result_target_field: {self.result_target_field}\n"
         return info
 
     def to_dict(self) -> Dict[str, Any]:
@@ -149,4 +159,10 @@
         if self._audio_elements_modality:
             task_properties["audio_elements_modality"] = self._audio_elements_modality
 
+        if self._custom_content_field:
+            task_properties["custom_content_field"] = self._custom_content_field
+
+        if self._result_target_field:
+            task_properties["result_target_field"] = self.result_target_field
+
         return {"type": "embed", "task_properties": task_properties}
@@ -86,7 +86,7 @@ class ExtractTask(Task):
         extract_page_as_image: bool = False,
         text_depth: str = "document",
         paddle_output_format: str = "pseudo_markdown",
-        table_output_format: str = "pseudo_markdown",
+        table_output_format: str = "markdown",
     ) -> None:
         """
         Setup Extract Task Config
@@ -31,7 +31,7 @@ class FilterTask(Task):
         min_size: int = 128,
         max_aspect_ratio: Union[int, float] = 5.0,
         min_aspect_ratio: Union[int, float] = 0.2,
-        filter: bool = False,
+        filter: bool = True,
     ) -> None:
         """
         Setup Filter Task Config
@@ -8,18 +8,15 @@ from typing import Dict
 from typing import Type
 from typing import Union
 
-from .caption import CaptionTask
-from .dedup import DedupTask
-from .embed import EmbedTask
-from .extract import ExtractTask
-from .filter import FilterTask
-from .split import SplitTask
-from .store import StoreEmbedTask
-from .store import StoreTask
-from .task_base import Task
-from .task_base import TaskType
-from .task_base import is_valid_task_type
-from .udf import UDFTask
+from nv_ingest_client.primitives.tasks.task_base import Task, TaskType, is_valid_task_type
+from nv_ingest_client.primitives.tasks.caption import CaptionTask
+from nv_ingest_client.primitives.tasks.dedup import DedupTask
+from nv_ingest_client.primitives.tasks.embed import EmbedTask
+from nv_ingest_client.primitives.tasks.extract import ExtractTask
+from nv_ingest_client.primitives.tasks.filter import FilterTask
+from nv_ingest_client.primitives.tasks.split import SplitTask
+from nv_ingest_client.primitives.tasks.store import StoreEmbedTask, StoreTask
+from nv_ingest_client.primitives.tasks.udf import UDFTask
 
 
 class TaskUnimplemented(Task):
@@ -11,6 +11,7 @@ import logging
 import importlib
 import inspect
 import ast
+import re
 from typing import Dict, Optional, Union
 
 from nv_ingest_api.internal.enums.common import PipelinePhase
@@ -122,54 +123,50 @@ def _resolve_udf_function(udf_function_spec: str) -> str:
     3. File path: '/path/to/file.py:my_function'
     4. Legacy import path: 'my_module.my_function' (function name only, no imports)
     """
-    if udf_function_spec.strip().startswith("def "):
-        # Already an inline function string
-        return udf_function_spec
+    # Default to treating as inline unless it clearly matches a
+    # module/file specification. This avoids misclassifying inline code that
+    # contains colons, imports, or annotations before the def line.
 
-    elif ".py:" in udf_function_spec:
-        # File path format: /path/to/file.py:function_name
-        file_path, function_name = udf_function_spec.split(":", 1)
+    spec = udf_function_spec.strip()
+
+    # 1) File path with function: /path/to/file.py:function_name
+    if ".py:" in spec:
+        file_path, function_name = spec.split(":", 1)
         return _extract_function_with_context(file_path, function_name)
 
-    elif udf_function_spec.endswith(".py"):
-        # File path format without function name - this is an error
+    # 2) File path without function name is an explicit error
+    if spec.endswith(".py"):
         raise ValueError(
-            f"File path '{udf_function_spec}' is missing function name. "
-            f"Use format 'file.py:function_name' to specify which function to use."
+            f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
         )
 
-    elif ":" in udf_function_spec and ".py:" not in udf_function_spec:
-        # Module path format with colon: my_module.submodule:function_name
-        # This preserves imports and module context
-        module_path, function_name = udf_function_spec.split(":", 1)
-
+    # 3) Module path with colon: my.module:function
+    # Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
+    # no whitespace/newlines.
+    module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
+    if module_colon_pattern.match(spec):
+        module_path, function_name = spec.split(":", 1)
        try:
-            # Import the module to get its file path
             module = importlib.import_module(module_path)
             module_file = inspect.getfile(module)
-
-            # Extract the function with full module context
             return _extract_function_with_context(module_file, function_name)
-
         except ImportError as e:
             raise ValueError(f"Failed to import module '{module_path}': {e}")
         except Exception as e:
             raise ValueError(f"Failed to resolve module path '{module_path}': {e}")
 
-    elif "." in udf_function_spec:
-        # Legacy import path format: module.submodule.function
-        # This only extracts the function source without imports (legacy behavior)
-        func = _load_function_from_import_path(udf_function_spec)
-
-        # Get the source code of the function only
+    # 4) Legacy import path: my.module.function (no colon)
+    legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
+    if legacy_import_pattern.match(spec):
+        func = _load_function_from_import_path(spec)
         try:
             source = inspect.getsource(func)
             return source
         except (OSError, TypeError) as e:
             raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")
 
-    else:
-        raise ValueError(f"Invalid UDF function specification: {udf_function_spec}")
+    # 5) Default: treat as inline UDF source (entire string)
+    return udf_function_spec
 
 
 class UDFTask(Task):
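The rewritten resolver above defaults to inline source and only routes clearly shaped specs to the file/module branches. A sketch of the accepted formats, calling the module-internal helper shown in the hunk (paths, module names, and the inline function body are illustrative):

_resolve_udf_function("/path/to/udfs.py:my_udf")    # 1) file path + function name
# _resolve_udf_function("/path/to/udfs.py")         # 2) raises ValueError: missing function name
_resolve_udf_function("my_pkg.udfs:my_udf")         # 3) importable module:function, keeps module context
_resolve_udf_function("my_pkg.udfs.my_udf")         # 4) legacy dotted path, function source only
_resolve_udf_function(
    "def my_udf(control_message):\n    return control_message"
)                                                   # 5) anything else is treated as inline UDF source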
@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
 
 
 def analyze_document_chunks(
-    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]]
+    results: Union[List[List[Dict[str, Any]]], List[Dict[str, Any]]],
 ) -> Dict[str, Dict[str, Dict[str, int]]]:
     """
     Analyze ingestor results to count elements by type and page for each document.