nv-ingest-client 2025.10.18.dev20251018__tar.gz → 2025.11.14.dev20251114__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. {nv_ingest_client-2025.10.18.dev20251018/src/nv_ingest_client.egg-info → nv_ingest_client-2025.11.14.dev20251114}/PKG-INFO +1 -1
  2. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/client.py +83 -9
  3. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/ingest_job_handler.py +28 -6
  4. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/interface.py +128 -29
  5. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_spec.py +3 -0
  6. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/embed.py +24 -0
  7. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/filter.py +1 -1
  8. nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client/primitives/tasks/ocr_extraction.py +55 -0
  9. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/udf.py +24 -27
  10. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/extract.py +4 -0
  11. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/milvus.py +44 -20
  12. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114/src/nv_ingest_client.egg-info}/PKG-INFO +1 -1
  13. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/SOURCES.txt +1 -0
  14. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/LICENSE +0 -0
  15. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/MANIFEST.in +0 -0
  16. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/README.md +0 -0
  17. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/pyproject.toml +0 -0
  18. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/setup.cfg +0 -0
  19. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/__init__.py +0 -0
  20. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/__init__.py +0 -0
  21. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/__init__.py +0 -0
  22. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/click.py +0 -0
  23. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/processing.py +0 -0
  24. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/cli/util/system.py +0 -0
  25. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/__init__.py +0 -0
  26. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/client/util/processing.py +0 -0
  27. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/nv_ingest_cli.py +0 -0
  28. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/__init__.py +0 -0
  29. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/__init__.py +0 -0
  30. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/jobs/job_state.py +0 -0
  31. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/__init__.py +0 -0
  32. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/audio_extraction.py +0 -0
  33. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/caption.py +0 -0
  34. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/chart_extraction.py +0 -0
  35. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/dedup.py +0 -0
  36. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/extract.py +0 -0
  37. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/infographic_extraction.py +0 -0
  38. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/split.py +0 -0
  39. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/store.py +0 -0
  40. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/table_extraction.py +0 -0
  41. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_base.py +0 -0
  42. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/task_factory.py +0 -0
  43. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/primitives/tasks/vdb_upload.py +0 -0
  44. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/__init__.py +0 -0
  45. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/dataset.py +0 -0
  46. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/document_analysis.py +0 -0
  47. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/file_processing/__init__.py +0 -0
  48. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/image_disk_utils.py +0 -0
  49. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/milvus.py +0 -0
  50. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/process_json_files.py +0 -0
  51. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/processing.py +0 -0
  52. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/system.py +0 -0
  53. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/transport.py +0 -0
  54. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/util.py +0 -0
  55. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/__init__.py +0 -0
  56. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/adt_vdb.py +0 -0
  57. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/vdb/opensearch.py +0 -0
  58. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client/util/zipkin.py +0 -0
  59. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/dependency_links.txt +0 -0
  60. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/entry_points.txt +0 -0
  61. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/requires.txt +0 -0
  62. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/nv_ingest_client.egg-info/top_level.txt +0 -0
  63. {nv_ingest_client-2025.10.18.dev20251018 → nv_ingest_client-2025.11.14.dev20251114}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.10.18.dev20251018
3
+ Version: 2025.11.14.dev20251114
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -44,6 +44,50 @@ from nv_ingest_client.util.util import (
44
44
  logger = logging.getLogger(__name__)
45
45
 
46
46
 
47
+ def _compute_resident_times(trace_dict: Dict[str, Any]) -> Dict[str, Any]:
48
+ """
49
+ Compute resident_time entries from entry/exit pairs if not already present.
50
+
51
+ This ensures consistency between split jobs (where server computes resident_time)
52
+ and non-split jobs (where we compute it client-side).
53
+
54
+ Parameters
55
+ ----------
56
+ trace_dict : Dict[str, Any]
57
+ Trace dictionary with entry/exit pairs
58
+
59
+ Returns
60
+ -------
61
+ Dict[str, Any]
62
+ Trace dictionary with resident_time entries added
63
+ """
64
+ if not trace_dict or not isinstance(trace_dict, dict):
65
+ return trace_dict
66
+
67
+ # Check if resident_time already exists (server-computed for split jobs)
68
+ has_resident = any(k.startswith("trace::resident_time::") for k in trace_dict.keys())
69
+ if has_resident:
70
+ return trace_dict # Already computed by server
71
+
72
+ # Compute resident_time from entry/exit pairs
73
+ result = dict(trace_dict)
74
+ stages = set()
75
+
76
+ # Find all unique stages
77
+ for key in trace_dict:
78
+ if key.startswith("trace::entry::"):
79
+ stages.add(key.replace("trace::entry::", ""))
80
+
81
+ # Compute resident_time for each stage
82
+ for stage in stages:
83
+ entry_key = f"trace::entry::{stage}"
84
+ exit_key = f"trace::exit::{stage}"
85
+ if entry_key in trace_dict and exit_key in trace_dict:
86
+ result[f"trace::resident_time::{stage}"] = trace_dict[exit_key] - trace_dict[entry_key]
87
+
88
+ return result
89
+
90
+
47
91
  class DataDecodeException(Exception):
48
92
  """
49
93
  Exception raised for errors in decoding data.
@@ -87,6 +131,7 @@ class _ConcurrentProcessor:
87
131
  stream_to_callback_only: bool,
88
132
  return_full_response: bool,
89
133
  verbose: bool = False,
134
+ return_traces: bool = False,
90
135
  ):
91
136
  """
92
137
  Initializes the concurrent processor.
@@ -120,6 +165,8 @@ class _ConcurrentProcessor:
120
165
  initiating job submission or fetching fails for a batch.
121
166
  verbose : bool, optional
122
167
  If True, enables detailed debug logging. Default is False.
168
+ return_traces : bool, optional
169
+ If True, parent-level trace data for each completed job is stored.
123
170
 
124
171
  Raises
125
172
  ------
@@ -142,12 +189,14 @@ class _ConcurrentProcessor:
142
189
  self.stream_to_callback_only = stream_to_callback_only
143
190
  self.return_full_response = return_full_response
144
191
  self.verbose = verbose
192
+ self.return_traces = return_traces
145
193
 
146
194
  # State variables managed across batch cycles
147
195
  self.retry_job_ids: List[str] = []
148
196
  self.retry_counts: Dict[str, int] = defaultdict(int)
149
197
  self.results: List[Dict[str, Any]] = [] # Stores successful results (full dicts)
150
198
  self.failures: List[Tuple[str, str]] = [] # (job_index, error_message)
199
+ self.traces: List[Optional[Dict[str, Any]]] = []
151
200
 
152
201
  # --- Initial Checks ---
153
202
  if not self.job_queue_id:
@@ -247,6 +296,14 @@ class _ConcurrentProcessor:
247
296
  # When requested, return the full response envelope (includes 'trace' and 'annotations')
248
297
  self.results.append(result_data if self.return_full_response else result_data.get("data"))
249
298
 
299
+ # Extract trace data for all successful (non-failed) jobs
300
+ if self.return_traces and not is_failed:
301
+ trace_payload = result_data.get("trace") if result_data else None
302
+ # Compute resident_time if not already present (for consistency)
303
+ if trace_payload:
304
+ trace_payload = _compute_resident_times(trace_payload)
305
+ self.traces.append(trace_payload if trace_payload else None)
306
+
250
307
  # Cleanup retry count if it exists
251
308
  if job_index in self.retry_counts:
252
309
  del self.retry_counts[job_index]
@@ -438,7 +495,7 @@ class _ConcurrentProcessor:
438
495
 
439
496
  return batch_futures_dict, normalized_job_indices
440
497
 
441
- def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
498
+ def run(self) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]]:
442
499
  """
443
500
  Executes the main processing loop in batches.
444
501
 
@@ -581,7 +638,7 @@ class _ConcurrentProcessor:
581
638
  # --- Final Logging ---
582
639
  self._log_final_status(total_jobs)
583
640
 
584
- return self.results, self.failures
641
+ return self.results, self.failures, self.traces if self.return_traces else []
585
642
 
586
643
 
587
644
  class NvIngestClient:
@@ -1212,7 +1269,12 @@ class NvIngestClient:
1212
1269
  stream_to_callback_only: bool = False,
1213
1270
  return_full_response: bool = False,
1214
1271
  verbose: bool = False,
1215
- ) -> Union[List[Any], Tuple[List[Any], List[Tuple[str, str]]]]:
1272
+ return_traces: bool = False,
1273
+ ) -> Union[
1274
+ List[Any],
1275
+ Tuple[List[Any], List[Tuple[str, str]]],
1276
+ Tuple[List[Any], List[Tuple[str, str]], List[Optional[Dict[str, Any]]]],
1277
+ ]:
1216
1278
  """
1217
1279
  Submit and fetch multiple jobs concurrently.
1218
1280
 
@@ -1247,6 +1309,8 @@ class NvIngestClient:
1247
1309
  Ignored when stream_to_callback_only=True. Default is False.
1248
1310
  verbose : bool, optional
1249
1311
  If True, enable debug logging. Default is False.
1312
+ return_traces : bool, optional
1313
+ If True, parent-level aggregated trace metrics are extracted and returned. Default is False.
1250
1314
 
1251
1315
  Returns
1252
1316
  -------
@@ -1254,6 +1318,9 @@ class NvIngestClient:
1254
1318
  List of successful job results when `return_failures` is False.
1255
1319
  results, failures : tuple
1256
1320
  Tuple of (successful results, failure tuples) when `return_failures` is True.
1321
+ results, failures, traces : tuple
1322
+ Tuple of (successful results, failure tuples, trace dicts) when both
1323
+ `return_failures` and `return_traces` are True.
1257
1324
 
1258
1325
  Raises
1259
1326
  ------
@@ -1266,7 +1333,12 @@ class NvIngestClient:
1266
1333
 
1267
1334
  # Handle empty input
1268
1335
  if not job_indices:
1269
- return ([], []) if return_failures else []
1336
+ if return_failures and return_traces:
1337
+ return [], [], []
1338
+ elif return_failures:
1339
+ return [], []
1340
+ else:
1341
+ return []
1270
1342
 
1271
1343
  # Validate and set batch_size
1272
1344
  validated_batch_size = self._validate_batch_size(batch_size)
@@ -1289,12 +1361,17 @@ class NvIngestClient:
1289
1361
  stream_to_callback_only=stream_to_callback_only,
1290
1362
  return_full_response=return_full_response,
1291
1363
  verbose=verbose,
1364
+ return_traces=return_traces,
1292
1365
  )
1293
1366
 
1294
- results, failures = processor.run()
1367
+ results, failures, traces = processor.run()
1295
1368
 
1296
- if return_failures:
1369
+ if return_failures and return_traces:
1370
+ return results, failures, traces
1371
+ elif return_failures:
1297
1372
  return results, failures
1373
+ elif return_traces:
1374
+ return results, traces
1298
1375
 
1299
1376
  if failures:
1300
1377
  logger.warning(f"{len(failures)} job(s) failed during concurrent processing." " Check logs for details.")
@@ -1628,9 +1705,6 @@ class NvIngestClient:
1628
1705
  )
1629
1706
  logger.error(error_msg)
1630
1707
  failures.append((self._job_index_to_job_spec[job_id].source_id, str(e)))
1631
- finally:
1632
- # Clean up the job spec mapping
1633
- del self._job_index_to_job_spec[job_id]
1634
1708
 
1635
1709
  if return_failures:
1636
1710
  return results, failures
@@ -323,18 +323,40 @@ class IngestJobHandler:
323
323
 
324
324
  futures_dict: Dict[Any, str] = self.client.fetch_job_result_async(self._job_ids_batch, data_only=False)
325
325
  for future in as_completed(futures_dict.keys()):
326
+ pages_per_sec = None
326
327
  try:
327
328
  # Block as each future completes; this mirrors CLI behavior
328
329
  future_response, trace_id = self._handle_future_result(future)
329
330
  job_id: str = futures_dict[future]
330
331
  trace_ids[job_id_map[job_id]] = trace_id
331
332
 
332
- first_page_metadata = future_response["data"][0]["metadata"]
333
- file_page_counts: Dict[str, int] = {
334
- first_page_metadata["source_metadata"]["source_name"]: first_page_metadata[
335
- "content_metadata"
336
- ]["hierarchy"]["page_count"]
337
- }
333
+ # Extract page count: prefer V2 metadata location, fall back to V1
334
+ page_count = None
335
+ source_name = None
336
+
337
+ # Try V2 metadata location first (top-level metadata.total_pages)
338
+ if "metadata" in future_response and future_response["metadata"]:
339
+ response_metadata = future_response["metadata"]
340
+ page_count = response_metadata.get("total_pages")
341
+ source_name = response_metadata.get("original_source_name")
342
+
343
+ # Fall back to V1 location (first data element's hierarchy.page_count)
344
+ if page_count is None and future_response.get("data"):
345
+ try:
346
+ first_page_metadata = future_response["data"][0]["metadata"]
347
+ page_count = first_page_metadata["content_metadata"]["hierarchy"]["page_count"]
348
+ source_name = first_page_metadata["source_metadata"]["source_name"]
349
+ except (KeyError, IndexError, TypeError):
350
+ # If we can't extract from V1 location, use defaults
351
+ pass
352
+
353
+ # Use extracted values or defaults
354
+ if page_count is None:
355
+ page_count = 0 # Default if not found
356
+ if source_name is None:
357
+ source_name = "unknown_source"
358
+
359
+ file_page_counts: Dict[str, int] = {source_name: page_count}
338
360
 
339
361
  if self.output_directory:
340
362
  self._save_response_data(
@@ -402,16 +402,9 @@ class Ingestor:
402
402
  show_progress: bool = False,
403
403
  return_failures: bool = False,
404
404
  save_to_disk: bool = False,
405
+ return_traces: bool = False,
405
406
  **kwargs: Any,
406
- ) -> Union[
407
- List[List[Dict[str, Any]]], # In-memory: List of response['data'] for each doc
408
- List[Dict[str, Any]], # In-memory: Full response envelopes when return_full_response=True
409
- List[LazyLoadedList], # Disk: List of proxies, one per original doc
410
- Tuple[
411
- Union[List[List[Dict[str, Any]]], List[Dict[str, Any]], List[LazyLoadedList]],
412
- List[Tuple[str, str]],
413
- ],
414
- ]: # noqa: E501
407
+ ) -> Union[List[Any], Tuple[Any, ...]]:
415
408
  """
416
409
  Ingest documents by submitting jobs and fetching results concurrently.
417
410
 
@@ -421,24 +414,30 @@ class Ingestor:
421
414
  Whether to display a progress bar. Default is False.
422
415
  return_failures : bool, optional
423
416
  If True, return a tuple (results, failures); otherwise, return only results. Default is False.
417
+ save_to_disk : bool, optional
418
+ If True, save results to disk and return LazyLoadedList proxies. Default is False.
419
+ return_traces : bool, optional
420
+ If True, return trace metrics alongside results. Default is False.
421
+ Traces contain timing metrics (entry, exit, resident_time) for each stage.
424
422
  **kwargs : Any
425
- Additional keyword arguments for the underlying client methods. Supported keys:
426
- 'concurrency_limit', 'timeout', 'max_job_retries', 'retry_delay',
427
- 'data_only', 'return_full_response', 'verbose'. Unrecognized keys are passed
428
- through to process_jobs_concurrently.
423
+ Additional keyword arguments for the underlying client methods.
429
424
  Optional flags include `include_parent_trace_ids=True` to also return
430
- parent job trace identifiers gathered during ingestion.
425
+ parent job trace identifiers (V2 API only).
431
426
 
432
427
  Returns
433
428
  -------
434
- results : list of dict
435
- List of successful job results when `return_failures` is False.
429
+ list or tuple
430
+ Returns vary based on flags:
431
+ - Default: list of results
432
+ - return_failures=True: (results, failures)
433
+ - return_traces=True: (results, traces)
434
+ - return_failures=True, return_traces=True: (results, failures, traces)
435
+ - Additional combinations with include_parent_trace_ids kwarg
436
436
 
437
- results, failures : tuple (list of dict, list of tuple of str)
438
- Tuple containing successful results and failure information when `return_failures` is True.
439
-
440
- If `include_parent_trace_ids=True` is provided via kwargs, an additional
441
- list of parent trace IDs is appended to the return value.
437
+ Notes
438
+ -----
439
+ Trace metrics include timing data for each processing stage. For detailed
440
+ usage and examples, see src/nv_ingest/api/v2/README.md
442
441
  """
443
442
  if save_to_disk and (not self._output_config):
444
443
  self.save_to_disk()
@@ -574,7 +573,8 @@ class Ingestor:
574
573
  if enable_telemetry is not None and hasattr(self._client, "enable_telemetry"):
575
574
  self._client.enable_telemetry(bool(enable_telemetry))
576
575
 
577
- results, failures = self._client.process_jobs_concurrently(
576
+ # Call process_jobs_concurrently
577
+ proc_result = self._client.process_jobs_concurrently(
578
578
  job_indices=self._job_ids,
579
579
  job_queue_id=self._job_queue_id,
580
580
  timeout=timeout,
@@ -583,9 +583,17 @@ class Ingestor:
583
583
  return_failures=True,
584
584
  stream_to_callback_only=stream_to_callback_only,
585
585
  verbose=verbose,
586
+ return_traces=return_traces,
586
587
  **proc_kwargs,
587
588
  )
588
589
 
590
+ # Unpack result based on return_traces flag
591
+ if return_traces:
592
+ results, failures, traces_list = proc_result
593
+ else:
594
+ results, failures = proc_result
595
+ traces_list = [] # Empty list when traces not requested
596
+
589
597
  if show_progress and pbar:
590
598
  pbar.close()
591
599
 
@@ -648,13 +656,18 @@ class Ingestor:
648
656
 
649
657
  parent_trace_ids = self._client.consume_completed_parent_trace_ids() if include_parent_trace_ids else []
650
658
 
651
- if return_failures and include_parent_trace_ids:
652
- return results, failures, parent_trace_ids
659
+ # Build return tuple based on requested outputs
660
+ # Order: results, failures (if requested), traces (if requested), parent_trace_ids (if requested)
661
+ returns = [results]
662
+
653
663
  if return_failures:
654
- return results, failures
664
+ returns.append(failures)
665
+ if return_traces:
666
+ returns.append(traces_list)
655
667
  if include_parent_trace_ids:
656
- return results, parent_trace_ids
657
- return results
668
+ returns.append(parent_trace_ids)
669
+
670
+ return tuple(returns) if len(returns) > 1 else results
658
671
 
659
672
  def ingest_async(self, **kwargs: Any) -> Future:
660
673
  """
@@ -681,6 +694,7 @@ class Ingestor:
681
694
  submitted_futures = set(future_to_job_id.keys())
682
695
  completed_futures = set()
683
696
  future_results = []
697
+ vdb_future = None
684
698
 
685
699
  def _done_callback(future):
686
700
  job_id = future_to_job_id[future]
@@ -702,9 +716,10 @@ class Ingestor:
702
716
  future.add_done_callback(_done_callback)
703
717
 
704
718
  if self._vdb_bulk_upload:
705
- self._vdb_bulk_upload.run(combined_future.result())
719
+ executor = ThreadPoolExecutor(max_workers=1)
720
+ vdb_future = executor.submit(self._vdb_bulk_upload.run_async, combined_future)
706
721
 
707
- return combined_future
722
+ return combined_future if not vdb_future else vdb_future
708
723
 
709
724
  @ensure_job_specs
710
725
  def _prepare_ingest_run(self):
@@ -821,6 +836,7 @@ class Ingestor:
821
836
  extract_tables = kwargs.pop("extract_tables", True)
822
837
  extract_charts = kwargs.pop("extract_charts", True)
823
838
  extract_page_as_image = kwargs.pop("extract_page_as_image", False)
839
+ table_output_format = kwargs.pop("table_output_format", "markdown")
824
840
 
825
841
  # Defaulting to False since enabling infographic extraction reduces throughput.
826
842
  # Users have to set to True if infographic extraction is required.
@@ -843,6 +859,7 @@ class Ingestor:
843
859
  extract_charts=extract_charts,
844
860
  extract_infographics=extract_infographics,
845
861
  extract_page_as_image=extract_page_as_image,
862
+ table_output_format=table_output_format,
846
863
  **kwargs,
847
864
  )
848
865
 
@@ -1346,3 +1363,85 @@ class Ingestor:
1346
1363
  terminal_jobs = self.completed_jobs() + self.failed_jobs() + self.cancelled_jobs()
1347
1364
 
1348
1365
  return len(self._job_states) - terminal_jobs
1366
+
1367
+ def get_status(self) -> Dict[str, str]:
1368
+ """
1369
+ Returns a dictionary mapping document identifiers to their current status in the pipeline.
1370
+
1371
+ This method is designed for use with async ingestion to poll the status of submitted jobs.
1372
+ For each document submitted to the ingestor, the method returns its current processing state.
1373
+
1374
+ Returns
1375
+ -------
1376
+ Dict[str, str]
1377
+ A dictionary where:
1378
+ - Keys are document identifiers (source names or source IDs)
1379
+ - Values are status strings representing the current state:
1380
+ * "pending": Job created but not yet submitted
1381
+ * "submitted": Job submitted and waiting for processing
1382
+ * "processing": Job is currently being processed
1383
+ * "completed": Job finished successfully
1384
+ * "failed": Job encountered an error
1385
+ * "cancelled": Job was cancelled
1386
+ * "unknown": Job state could not be determined (initial state)
1387
+
1388
+ Examples
1389
+ --------
1390
+ >>> ingestor = Ingestor(documents=["doc1.pdf", "doc2.pdf"], client=client)
1391
+ >>> ingestor.extract().embed()
1392
+ >>> future = ingestor.ingest_async()
1393
+ >>>
1394
+ >>> # Poll status while processing
1395
+ >>> status = ingestor.get_status()
1396
+ >>> print(status)
1397
+ {'doc1.pdf': 'processing', 'doc2.pdf': 'submitted'}
1398
+ >>>
1399
+ >>> # Check again after some time
1400
+ >>> status = ingestor.get_status()
1401
+ >>> print(status)
1402
+ {'doc1.pdf': 'completed', 'doc2.pdf': 'processing'}
1403
+
1404
+ Notes
1405
+ -----
1406
+ - This method is most useful when called after `ingest_async()` to track progress
1407
+ - If called before any jobs are submitted, returns an empty dictionary or
1408
+ documents with "unknown" status
1409
+ - The method accesses internal job state from the client, so it reflects
1410
+ the most current known state
1411
+ """
1412
+ status_dict = {}
1413
+
1414
+ if not self._job_states:
1415
+ # If job states haven't been initialized yet (before ingest_async is called)
1416
+ # Return unknown status for all documents
1417
+ for doc in self._documents:
1418
+ doc_name = os.path.basename(doc) if isinstance(doc, str) else str(doc)
1419
+ status_dict[doc_name] = "unknown"
1420
+ return status_dict
1421
+
1422
+ # Map job IDs to their states and source identifiers
1423
+ for job_id, job_state in self._job_states.items():
1424
+ # Get the job spec to find the source identifier
1425
+ job_spec = self._client._job_index_to_job_spec.get(job_id)
1426
+
1427
+ if job_spec:
1428
+ # Use source_name as the key (the document name)
1429
+ source_identifier = job_spec.source_name
1430
+ else:
1431
+ # Fallback to job_id if we can't find the spec
1432
+ source_identifier = f"job_{job_id}"
1433
+
1434
+ # Map the JobStateEnum to a user-friendly string
1435
+ state_mapping = {
1436
+ JobStateEnum.PENDING: "pending",
1437
+ JobStateEnum.SUBMITTED_ASYNC: "submitted",
1438
+ JobStateEnum.SUBMITTED: "submitted",
1439
+ JobStateEnum.PROCESSING: "processing",
1440
+ JobStateEnum.COMPLETED: "completed",
1441
+ JobStateEnum.FAILED: "failed",
1442
+ JobStateEnum.CANCELLED: "cancelled",
1443
+ }
1444
+
1445
+ status_dict[source_identifier] = state_mapping.get(job_state.state, "unknown")
1446
+
1447
+ return status_dict
@@ -18,6 +18,7 @@ from nv_ingest_client.primitives.tasks.audio_extraction import AudioExtractionTa
18
18
  from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
19
19
  from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
20
20
  from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
21
+ from nv_ingest_client.primitives.tasks.ocr_extraction import OCRExtractionTask
21
22
  from nv_ingest_client.util.dataset import get_dataset_files
22
23
  from nv_ingest_client.util.dataset import get_dataset_statistics
23
24
 
@@ -199,6 +200,8 @@ class JobSpec:
199
200
  self._tasks.append(ChartExtractionTask())
200
201
  if isinstance(task, ExtractTask) and (task._extract_infographics is True):
201
202
  self._tasks.append(InfographicExtractionTask())
203
+ if isinstance(task, ExtractTask) and (task._extract_method in {"ocr"}):
204
+ self._tasks.append(OCRExtractionTask())
202
205
  if isinstance(task, ExtractTask) and (task._extract_method == "audio"):
203
206
  extract_audio_params = task._extract_audio_params or {}
204
207
  self._tasks.append(AudioExtractionTask(**extract_audio_params))
@@ -36,6 +36,9 @@ class EmbedTask(Task):
36
36
  image_elements_modality: Optional[str] = None,
37
37
  structured_elements_modality: Optional[str] = None,
38
38
  audio_elements_modality: Optional[str] = None,
39
+ custom_content_field: Optional[str] = None,
40
+ result_target_field: Optional[str] = None,
41
+ dimensions: Optional[int] = None,
39
42
  ) -> None:
40
43
  """
41
44
  Initialize the EmbedTask configuration.
@@ -76,6 +79,9 @@ class EmbedTask(Task):
76
79
  image_elements_modality=image_elements_modality,
77
80
  structured_elements_modality=structured_elements_modality,
78
81
  audio_elements_modality=audio_elements_modality,
82
+ custom_content_field=custom_content_field,
83
+ result_target_field=result_target_field,
84
+ dimensions=dimensions,
79
85
  )
80
86
 
81
87
  self._endpoint_url = validated_data.endpoint_url
@@ -86,6 +92,9 @@ class EmbedTask(Task):
86
92
  self._image_elements_modality = validated_data.image_elements_modality
87
93
  self._structured_elements_modality = validated_data.structured_elements_modality
88
94
  self._audio_elements_modality = validated_data.audio_elements_modality
95
+ self._custom_content_field = validated_data.custom_content_field
96
+ self._result_target_field = validated_data.result_target_field
97
+ self._dimensions = validated_data.dimensions
89
98
 
90
99
  def __str__(self) -> str:
91
100
  """
@@ -114,6 +123,12 @@ class EmbedTask(Task):
114
123
  info += f" structured_elements_modality: {self._structured_elements_modality}\n"
115
124
  if self._audio_elements_modality:
116
125
  info += f" audio_elements_modality: {self._audio_elements_modality}\n"
126
+ if self._custom_content_field:
127
+ info += f" custom_content_field: {self._custom_content_field}\n"
128
+ if self._result_target_field:
129
+ info += f" result_target_field: {self.result_target_field}\n"
130
+ if self._dimensions:
131
+ info += f" dimensions: {self._dimensions}\n"
117
132
  return info
118
133
 
119
134
  def to_dict(self) -> Dict[str, Any]:
@@ -149,4 +164,13 @@ class EmbedTask(Task):
149
164
  if self._audio_elements_modality:
150
165
  task_properties["audio_elements_modality"] = self._audio_elements_modality
151
166
 
167
+ if self._custom_content_field:
168
+ task_properties["custom_content_field"] = self._custom_content_field
169
+
170
+ if self._result_target_field:
171
+ task_properties["result_target_field"] = self._result_target_field
172
+
173
+ if self._dimensions:
174
+ task_properties["dimensions"] = self._dimensions
175
+
152
176
  return {"type": "embed", "task_properties": task_properties}
@@ -31,7 +31,7 @@ class FilterTask(Task):
31
31
  min_size: int = 128,
32
32
  max_aspect_ratio: Union[int, float] = 5.0,
33
33
  min_aspect_ratio: Union[int, float] = 0.2,
34
- filter: bool = False,
34
+ filter: bool = True,
35
35
  ) -> None:
36
36
  """
37
37
  Setup Filter Task Config
@@ -0,0 +1,55 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ # pylint: disable=too-few-public-methods
7
+ # pylint: disable=too-many-arguments
8
+
9
+ import logging
10
+ from typing import Dict
11
+
12
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskOCRExtraction
13
+ from nv_ingest_client.primitives.tasks.task_base import Task
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class OCRExtractionTask(Task):
    """
    Object for ocr extraction task
    """

    def __init__(self, params: dict = None) -> None:
        """
        Setup OCR Extraction Task Config
        """
        super().__init__()

        # Normalize a missing params argument to an empty dict so callers
        # that pass nothing keep working (backward compatibility).
        normalized = {} if params is None else params

        # Delegate validation to the shared API schema and keep only the
        # validated payload.
        self._params = IngestTaskOCRExtraction(params=normalized).params

    def __str__(self) -> str:
        """
        Returns a string with the object's config and run time state
        """
        parts = [
            "OCR Extraction Task:\n",
            f" params: {self._params}\n",
        ]
        return "".join(parts)

    def to_dict(self) -> Dict:
        """
        Convert to a dict for submission to redis
        """
        return {
            "type": "ocr_data_extract",
            "task_properties": {"params": self._params},
        }
@@ -11,6 +11,7 @@ import logging
11
11
  import importlib
12
12
  import inspect
13
13
  import ast
14
+ import re
14
15
  from typing import Dict, Optional, Union
15
16
 
16
17
  from nv_ingest_api.internal.enums.common import PipelinePhase
@@ -122,54 +123,50 @@ def _resolve_udf_function(udf_function_spec: str) -> str:
122
123
  3. File path: '/path/to/file.py:my_function'
123
124
  4. Legacy import path: 'my_module.my_function' (function name only, no imports)
124
125
  """
125
- if udf_function_spec.strip().startswith("def "):
126
- # Already an inline function string
127
- return udf_function_spec
126
+ # Default to treating as inline unless it clearly matches a
127
+ # module/file specification. This avoids misclassifying inline code that
128
+ # contains colons, imports, or annotations before the def line.
128
129
 
129
- elif ".py:" in udf_function_spec:
130
- # File path format: /path/to/file.py:function_name
131
- file_path, function_name = udf_function_spec.split(":", 1)
130
+ spec = udf_function_spec.strip()
131
+
132
+ # 1) File path with function: /path/to/file.py:function_name
133
+ if ".py:" in spec:
134
+ file_path, function_name = spec.split(":", 1)
132
135
  return _extract_function_with_context(file_path, function_name)
133
136
 
134
- elif udf_function_spec.endswith(".py"):
135
- # File path format without function name - this is an error
137
+ # 2) File path without function name is an explicit error
138
+ if spec.endswith(".py"):
136
139
  raise ValueError(
137
- f"File path '{udf_function_spec}' is missing function name. "
138
- f"Use format 'file.py:function_name' to specify which function to use."
140
+ f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
139
141
  )
140
142
 
141
- elif ":" in udf_function_spec and ".py:" not in udf_function_spec:
142
- # Module path format with colon: my_module.submodule:function_name
143
- # This preserves imports and module context
144
- module_path, function_name = udf_function_spec.split(":", 1)
145
-
143
+ # 3) Module path with colon: my.module:function
144
+ # Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
145
+ # no whitespace/newlines.
146
+ module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
147
+ if module_colon_pattern.match(spec):
148
+ module_path, function_name = spec.split(":", 1)
146
149
  try:
147
- # Import the module to get its file path
148
150
  module = importlib.import_module(module_path)
149
151
  module_file = inspect.getfile(module)
150
-
151
- # Extract the function with full module context
152
152
  return _extract_function_with_context(module_file, function_name)
153
-
154
153
  except ImportError as e:
155
154
  raise ValueError(f"Failed to import module '{module_path}': {e}")
156
155
  except Exception as e:
157
156
  raise ValueError(f"Failed to resolve module path '{module_path}': {e}")
158
157
 
159
- elif "." in udf_function_spec:
160
- # Legacy import path format: module.submodule.function
161
- # This only extracts the function source without imports (legacy behavior)
162
- func = _load_function_from_import_path(udf_function_spec)
163
-
164
- # Get the source code of the function only
158
+ # 4) Legacy import path: my.module.function (no colon)
159
+ legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
160
+ if legacy_import_pattern.match(spec):
161
+ func = _load_function_from_import_path(spec)
165
162
  try:
166
163
  source = inspect.getsource(func)
167
164
  return source
168
165
  except (OSError, TypeError) as e:
169
166
  raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")
170
167
 
171
- else:
172
- raise ValueError(f"Invalid UDF function specification: {udf_function_spec}")
168
+ # 5) Default: treat as inline UDF source (entire string)
169
+ return udf_function_spec
173
170
 
174
171
 
175
172
  class UDFTask(Task):
@@ -51,6 +51,10 @@ EXTENSION_TO_DOCUMENT_TYPE = {
51
51
  "txt": DocumentTypeEnum.TXT,
52
52
  "mp3": DocumentTypeEnum.MP3,
53
53
  "wav": DocumentTypeEnum.WAV,
54
+ "mp4": DocumentTypeEnum.MP4,
55
+ "mov": DocumentTypeEnum.MOV,
56
+ "avi": DocumentTypeEnum.AVI,
57
+ "mkv": DocumentTypeEnum.MKV,
54
58
  # Add more as needed
55
59
  }
56
60
 
@@ -44,6 +44,7 @@ from scipy.sparse import csr_array
44
44
  logger = logging.getLogger(__name__)
45
45
 
46
46
  CONSISTENCY = CONSISTENCY_BOUNDED
47
+ DENSE_INDEX_NAME = "dense_index"
47
48
 
48
49
  pandas_reader_map = {
49
50
  ".json": pd.read_json,
@@ -93,7 +94,7 @@ def create_meta_collection(
93
94
  index_params = MilvusClient.prepare_index_params()
94
95
  index_params.add_index(
95
96
  field_name="vector",
96
- index_name="dense_index",
97
+ index_name=DENSE_INDEX_NAME,
97
98
  index_type="FLAT",
98
99
  metric_type="L2",
99
100
  )
@@ -313,7 +314,7 @@ def create_nvingest_index_params(
313
314
  if local_index:
314
315
  index_params.add_index(
315
316
  field_name="vector",
316
- index_name="dense_index",
317
+ index_name=DENSE_INDEX_NAME,
317
318
  index_type="FLAT",
318
319
  metric_type="L2",
319
320
  )
@@ -321,7 +322,7 @@ def create_nvingest_index_params(
321
322
  if gpu_index:
322
323
  index_params.add_index(
323
324
  field_name="vector",
324
- index_name="dense_index",
325
+ index_name=DENSE_INDEX_NAME,
325
326
  index_type="GPU_CAGRA",
326
327
  metric_type="L2",
327
328
  params={
@@ -335,7 +336,7 @@ def create_nvingest_index_params(
335
336
  else:
336
337
  index_params.add_index(
337
338
  field_name="vector",
338
- index_name="dense_index",
339
+ index_name=DENSE_INDEX_NAME,
339
340
  index_type="HNSW",
340
341
  metric_type="L2",
341
342
  params={"M": 64, "efConstruction": 512},
@@ -493,7 +494,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
493
494
  if isinstance(indexes, dict):
494
495
  # Old Milvus behavior (< 2.5.6)
495
496
  for k, v in indexes.items():
496
- if k[1] == "dense_index" and hasattr(v, "_index_type"):
497
+ if k[1] == DENSE_INDEX_NAME and hasattr(v, "_index_type"):
497
498
  d_idx = v._index_type
498
499
  if sparse and k[1] == "sparse_index" and hasattr(v, "_index_type"):
499
500
  s_idx = v._index_type
@@ -504,7 +505,7 @@ def _get_index_types(index_params: IndexParams, sparse: bool = False) -> Tuple[s
504
505
  index_name = getattr(idx, "index_name", None)
505
506
  index_type = getattr(idx, "index_type", None)
506
507
 
507
- if index_name == "dense_index":
508
+ if index_name == DENSE_INDEX_NAME:
508
509
  d_idx = index_type
509
510
  if sparse and index_name == "sparse_index":
510
511
  s_idx = index_type
@@ -776,13 +777,13 @@ def bulk_insert_milvus(
776
777
  t_bulk_start = time.time()
777
778
  task_ids = []
778
779
 
779
- task_ids.append(
780
- utility.do_bulk_insert(
780
+ for files in writer.batch_files:
781
+ task_id = utility.do_bulk_insert(
781
782
  collection_name=collection_name,
782
- files=[file for files in writer.batch_files for file in files],
783
+ files=files,
783
784
  consistency_level=CONSISTENCY,
784
785
  )
785
- )
786
+ task_ids.append(task_id)
786
787
 
787
788
  while len(task_ids) > 0:
788
789
  time.sleep(1)
@@ -900,30 +901,32 @@ def wait_for_index(collection_name: str, num_elements: int, client: MilvusClient
900
901
  (refer to MilvusClient.refresh_load for bulk inserts).
901
902
  """
902
903
  client.flush(collection_name)
903
- index_names = utility.list_indexes(collection_name)
904
+ # index_names = utility.list_indexes(collection_name)
904
905
  indexed_rows = 0
905
- for index_name in index_names:
906
+ # observe dense_index, all indexes get populated simultaneously
907
+ for index_name in [DENSE_INDEX_NAME]:
906
908
  indexed_rows = 0
907
- while indexed_rows < num_elements:
909
+ expected_rows = client.describe_index(collection_name, index_name)["indexed_rows"] + num_elements
910
+ while indexed_rows < expected_rows:
908
911
  pos_movement = 10 # number of iteration allowed without noticing an increase in indexed_rows
909
912
  for i in range(20):
910
- new_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
913
+ current_indexed_rows = client.describe_index(collection_name, index_name)["indexed_rows"]
911
914
  time.sleep(1)
912
915
  logger.info(
913
- f"polling for indexed rows, {collection_name}, {index_name} - {new_indexed_rows} / {num_elements}"
916
+ f"Indexed rows, {collection_name}, {index_name} - {current_indexed_rows} / {expected_rows}"
914
917
  )
915
- if new_indexed_rows == num_elements:
916
- indexed_rows = new_indexed_rows
918
+ if current_indexed_rows == expected_rows:
919
+ indexed_rows = current_indexed_rows
917
920
  break
918
921
  # check if indexed_rows is staying the same, too many times means something is wrong
919
- if new_indexed_rows == indexed_rows:
922
+ if current_indexed_rows == indexed_rows:
920
923
  pos_movement -= 1
921
924
  else:
922
925
  pos_movement = 10
923
926
  # if pos_movement is 0, raise an error, means the rows are not getting indexed as expected
924
927
  if pos_movement == 0:
925
- raise ValueError("Rows are not getting indexed as expected")
926
- indexed_rows = new_indexed_rows
928
+ raise ValueError(f"Rows are not getting indexed as expected for: {index_name} - {collection_name}")
929
+ indexed_rows = current_indexed_rows
927
930
  return indexed_rows
928
931
 
929
932
 
@@ -2057,3 +2060,24 @@ class Milvus(VDB):
2057
2060
  self.write_to_index(records, collection_name=coll_name, **sub_write_params)
2058
2061
  else:
2059
2062
  raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
2063
+ return records
2064
+
2065
+ def run_async(self, records):
2066
+ collection_name, create_params = self.get_connection_params()
2067
+ _, write_params = self.get_write_params()
2068
+ if isinstance(collection_name, str):
2069
+ logger.info(f"creating index - {collection_name}")
2070
+ self.create_index(collection_name=collection_name, **create_params)
2071
+ records = records.result()
2072
+ logger.info(f"writing to index, for collection - {collection_name}")
2073
+ self.write_to_index(records, **write_params)
2074
+ elif isinstance(collection_name, dict):
2075
+ split_params_list = _dict_to_params(collection_name, write_params)
2076
+ for sub_params in split_params_list:
2077
+ coll_name, sub_write_params = sub_params
2078
+ sub_write_params.pop("collection_name", None)
2079
+ self.create_index(collection_name=coll_name, **create_params)
2080
+ self.write_to_index(records, collection_name=coll_name, **sub_write_params)
2081
+ else:
2082
+ raise ValueError(f"Unsupported type for collection_name detected: {type(collection_name)}")
2083
+ return records
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-client
3
- Version: 2025.10.18.dev20251018
3
+ Version: 2025.11.14.dev20251114
4
4
  Summary: Python client for the nv-ingest service
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -34,6 +34,7 @@ src/nv_ingest_client/primitives/tasks/embed.py
34
34
  src/nv_ingest_client/primitives/tasks/extract.py
35
35
  src/nv_ingest_client/primitives/tasks/filter.py
36
36
  src/nv_ingest_client/primitives/tasks/infographic_extraction.py
37
+ src/nv_ingest_client/primitives/tasks/ocr_extraction.py
37
38
  src/nv_ingest_client/primitives/tasks/split.py
38
39
  src/nv_ingest_client/primitives/tasks/store.py
39
40
  src/nv_ingest_client/primitives/tasks/table_extraction.py