nv-ingest 2025.11.10.dev20251110__py3-none-any.whl → 2025.12.5.dev20251205__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,8 @@ import os
 import time
 import uuid
 import random
+from pathlib import Path
+import fsspec
 
 from fastapi import APIRouter, Request, Response
 from fastapi import HTTPException
@@ -21,6 +23,8 @@ from redis import RedisError
 
 from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
 from nv_ingest_api.util.service_clients.client_base import FetchMode
+from nv_ingest_api.util.dataloader.dataloader import DataLoader
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
 
 # For PDF splitting
 import pypdfium2 as pdfium
@@ -188,28 +192,42 @@ def get_pdf_page_count(pdf_content: bytes) -> int:
         return 1  # Assume single page on error
 
 
-def _prepare_chunk_submission(
+def _create_subjob_dict(
+    job_id: str,
+    job_payload: Dict[str, Any],
     job_spec_template: Dict[str, Any],
-    chunk: Dict[str, Any],
-    *,
-    parent_uuid: uuid.UUID,
-    parent_job_id: str,
     current_trace_id: int,
-    original_source_id: str,
-    original_source_name: str,
-) -> Tuple[str, MessageWrapper]:
-    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
-
-    chunk_number = chunk["chunk_index"] + 1
-    start_page = chunk["start_page"]
-    end_page = chunk["end_page"]
-
-    subjob_spec = {
+    parent_job_id: str,
+    start_key: Dict[str, Any],
+) -> Dict[str, Any]:
+    job_spec = {
         key: value
         for key, value in job_spec_template.items()
         if key not in {"job_payload", "job_id", "tracing_options"}
     }
+    job_spec["job_payload"] = job_payload
+    job_spec["job_id"] = job_id
 
+    base_tracing_options = job_spec_template.get("tracing_options") or {}
+    tracing_options = dict(base_tracing_options)
+    tracing_options.setdefault("trace", True)
+    tracing_options["trace_id"] = str(current_trace_id)
+    tracing_options["ts_send"] = int(time.time() * 1000)
+    tracing_options["parent_job_id"] = parent_job_id
+    for key, value in start_key.items():
+        tracing_options[key] = value
+
+    job_spec["tracing_options"] = tracing_options
+    return job_spec
+
+
+def _create_payload_dict(
+    job_spec_template: Dict[str, Any],
+    content: str,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Dict[str, Any]:
     subjob_payload_template = job_spec_template.get("job_payload", {})
     subjob_payload = {
         key: value
@@ -217,27 +235,40 @@ def _prepare_chunk_submission(
         if key not in {"content", "source_id", "source_name"}
     }
 
-    chunk_bytes = chunk["bytes"]
-    subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+    subjob_payload["content"] = [content]
 
-    page_suffix = f"page_{start_page}" if start_page == end_page else f"pages_{start_page}-{end_page}"
-    subjob_payload["source_id"] = [f"{original_source_id}#{page_suffix}"]
-    subjob_payload["source_name"] = [f"{original_source_name}#{page_suffix}"]
+    subjob_payload["source_id"] = [source_id]
+    subjob_payload["source_name"] = [source_name]
+    subjob_payload["document_type"] = [document_type]
+    return subjob_payload
+
+
+def _prepare_chunk_submission(
+    job_spec_template: Dict[str, Any],
+    chunk: Dict[str, Any],
+    *,
+    parent_uuid: uuid.UUID,
+    parent_job_id: str,
+    current_trace_id: int,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Tuple[str, MessageWrapper]:
+    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+    chunk_number = chunk["chunk_index"] + 1
 
     subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
     subjob_id = str(subjob_uuid)
-    subjob_spec["job_payload"] = subjob_payload
-    subjob_spec["job_id"] = subjob_id
 
-    base_tracing_options = job_spec_template.get("tracing_options") or {}
-    tracing_options = dict(base_tracing_options)
-    tracing_options.setdefault("trace", True)
-    tracing_options["trace_id"] = str(current_trace_id)
-    tracing_options["ts_send"] = int(time.time() * 1000)
-    tracing_options["parent_job_id"] = parent_job_id
-    tracing_options["page_num"] = start_page
+    subjob_payload_template = job_spec_template.get("job_payload", {})
+    chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
+    subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
+    start = chunk["start_page"] if "start_page" in chunk else chunk["start"]
 
-    subjob_spec["tracing_options"] = tracing_options
+    subjob_spec = _create_subjob_dict(
+        subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
+    )
 
     return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
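
For readers skimming the refactor above: the two new helpers split what `_prepare_chunk_submission` used to do inline. Below is a rough, self-contained sketch of how they compose into a subjob spec; every value is a placeholder invented for the example, not output from the service.

```python
# Illustrative sketch of the _create_payload_dict / _create_subjob_dict composition
# shown in the diff. All literals are made up for the example.
import base64
import json
import time
import uuid

job_spec_template = {
    "job_payload": {"content": ["<parent pdf>"], "source_id": ["doc.pdf"], "source_name": ["doc.pdf"]},
    "job_id": "parent-id",
    "tracing_options": {"trace": True},
    "tasks": [],
}
chunk = {"chunk_index": 0, "start_page": 1, "end_page": 16, "bytes": b"%PDF-..."}

# Payload: copy everything except content/source fields, then overwrite them per chunk.
payload = {k: v for k, v in job_spec_template["job_payload"].items()
           if k not in {"content", "source_id", "source_name"}}
payload["content"] = [base64.b64encode(chunk["bytes"]).decode("utf-8")]
payload["source_id"] = ["doc.pdf#pages_1-16"]
payload["source_name"] = ["doc.pdf#pages_1-16"]
payload["document_type"] = ["pdf"]

# Subjob spec: copy the template minus payload/id/tracing, then attach fresh ones.
subjob_id = str(uuid.uuid5(uuid.uuid4(), "chunk-1"))
subjob = {k: v for k, v in job_spec_template.items()
          if k not in {"job_payload", "job_id", "tracing_options"}}
subjob.update(job_payload=payload, job_id=subjob_id)
subjob["tracing_options"] = {
    **job_spec_template.get("tracing_options", {}),
    "trace_id": "123",
    "ts_send": int(time.time() * 1000),
    "parent_job_id": "parent-id",
    "page_num": chunk["start_page"],
}
print(json.dumps(subjob, indent=2))
```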
@@ -801,6 +832,8 @@ async def submit_job_v2(
     request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
 ):
     span = trace.get_current_span()
+    source_id = None
+    document_type = None
     try:
         span.add_event("Submitting file for processing (V2)")
 
@@ -827,7 +860,19 @@
 
         # Track page count for all PDFs (used for both splitting logic and metadata)
         pdf_page_count_cache = None
-
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        subjob_ids: List[str] = []
+        subjob_descriptors: List[Dict[str, Any]] = []
+        parent_metadata: Dict[str, Any] = {}
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        try:
+            parent_uuid = uuid.UUID(parent_job_id)
+        except ValueError:
+            logger.warning(
+                "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                parent_job_id,
+            )
+            parent_uuid = uuid.uuid4()
         # Check if this is a PDF that needs splitting
         if document_types and payloads and document_types[0].lower() == "pdf":
             # Decode the payload to check page count
@@ -836,6 +881,7 @@
             pdf_page_count_cache = page_count  # Cache for later use
             qos_tier = get_qos_tier_for_page_count(page_count)
             pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
+            document_type = DocumentTypeEnum.PDF
 
             # Split if the document has more pages than our chunk size
             if page_count > pages_per_chunk:
@@ -846,13 +892,11 @@
                     page_count,
                     qos_tier,
                 )
-
                 chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
 
                 subjob_ids: List[str] = []
                 subjob_descriptors: List[Dict[str, Any]] = []
                 submission_items: List[Tuple[str, MessageWrapper]] = []
-
                 try:
                     parent_uuid = uuid.UUID(parent_job_id)
                 except ValueError:
@@ -863,14 +907,20 @@
                     parent_uuid = uuid.uuid4()
 
                 for chunk in chunks:
+                    start = chunk["start_page"]
+                    end = chunk["end_page"]
+                    page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
+                    source_id = f"{original_source_id}#{page_suffix}"
+                    source_name = f"{original_source_name}#{page_suffix}"
                     subjob_id, subjob_wrapper = _prepare_chunk_submission(
                         job_spec_dict,
                         chunk,
+                        document_type=DocumentTypeEnum.PDF,
                         parent_uuid=parent_uuid,
                         parent_job_id=parent_job_id,
                         current_trace_id=current_trace_id,
-                        original_source_id=original_source_id,
-                        original_source_name=original_source_name,
+                        source_id=source_id,
+                        source_name=source_name,
                     )
 
                     # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
@@ -895,38 +945,98 @@
                             "page_count": chunk.get("page_count"),
                         }
                     )
+                parent_metadata.update(
+                    {
+                        "total_pages": page_count,
+                        "pages_per_chunk": pages_per_chunk,
+                        "original_source_id": original_source_id,
+                        "original_source_name": original_source_name,
+                        "document_type": document_types[0] if document_types else "pdf",
+                        "subjob_order": subjob_ids,
+                    }
+                )
+        elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
+            document_type = document_types[0]
+            upload_path = f"./{Path(original_source_id).name}"
+            # dump the payload to a file, just came from client
+            with fsspec.open(upload_path, "wb") as f:
+                f.write(base64.b64decode(payloads[0]))
+            dataloader = DataLoader(
+                path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
+            )
+            document_type = DocumentTypeEnum.MP3
+
+            parent_uuid = uuid.UUID(parent_job_id)
+            for task in job_spec_dict["tasks"]:
+                if "task_properties" in task and "document_type" in task["task_properties"]:
+                    task["task_properties"]["document_type"] = document_type
+            end = 0
+            for idx, (file_path, duration) in enumerate(dataloader.files_completed):
+                start = end
+                end = int(start + duration)
+                chunk = {
+                    "bytes": file_path.encode("utf-8"),
+                    "chunk_index": idx,
+                    "start": start,
+                    "end": end,
+                }
 
-                if submission_items:
-                    burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
-                    await _submit_subjobs_in_bursts(
-                        submission_items,
-                        ingest_service,
-                        burst_size=burst_size,
-                        pause_ms=pause_ms,
-                        jitter_ms=jitter_ms,
-                    )
+                subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                    job_spec_dict,
+                    chunk,
+                    parent_uuid=parent_uuid,
+                    parent_job_id=parent_job_id,
+                    current_trace_id=current_trace_id,
+                    source_id=file_path,
+                    source_name=upload_path,
+                    document_type=document_type,
+                )
 
-                parent_metadata: Dict[str, Any] = {
-                    "total_pages": page_count,
-                    "pages_per_chunk": pages_per_chunk,
+                submission_items.append((subjob_id, subjob_wrapper))
+                subjob_ids.append(subjob_id)
+                subjob_descriptors.append(
+                    {
+                        "job_id": subjob_id,
+                        "chunk_index": idx + 1,
+                        "start_page": chunk.get("start"),
+                        "end_page": chunk.get("end"),
+                        "page_count": chunk.get("page_count", 0),
+                    }
+                )
+            logger.error(f"Removing uploaded file {upload_path}")
+            os.remove(upload_path)
+
+        if submission_items:
+            burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
+            await _submit_subjobs_in_bursts(
+                submission_items,
+                ingest_service,
+                burst_size=burst_size,
+                pause_ms=pause_ms,
+                jitter_ms=jitter_ms,
+            )
+
+            parent_metadata.update(
+                {
                     "original_source_id": original_source_id,
                     "original_source_name": original_source_name,
-                    "document_type": document_types[0] if document_types else "pdf",
+                    "document_type": document_type,
                     "subjob_order": subjob_ids,
                 }
+            )
+            # raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+            await ingest_service.set_parent_job_mapping(
+                parent_job_id,
+                subjob_ids,
+                parent_metadata,
+                subjob_descriptors=subjob_descriptors,
+            )
 
-                await ingest_service.set_parent_job_mapping(
-                    parent_job_id,
-                    subjob_ids,
-                    parent_metadata,
-                    subjob_descriptors=subjob_descriptors,
-                )
-
-                await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+            await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
 
-                span.add_event(f"Split into {len(subjob_ids)} subjobs")
-                response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
-                return parent_job_id
+            span.add_event(f"Split into {len(subjob_ids)} subjobs")
+            response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+            return parent_job_id
 
         # For non-PDFs or cases where splitting is not required, submit as normal
         if "tracing_options" not in job_spec_dict:
@@ -982,8 +1092,8 @@
         return parent_job_id
 
     except Exception as ex:
-        logger.exception(f"Error submitting job: {str(ex)}")
-        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+        logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
+        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")
 
 
 # GET /v2/fetch_job
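
The new media branch in `submit_job_v2` writes the uploaded payload to disk with fsspec, splits it with the package's `DataLoader`, and wraps each resulting file as a subjob. Below is a minimal sketch of that flow; the loader is a stand-in, since only the constructor call and the `(file_path, duration)` iteration contract are visible in this diff.

```python
# Sketch only: FakeLoader stands in for nv_ingest_api's DataLoader, which does the
# actual audio splitting. The write/iterate pattern mirrors the diff above.
import base64
import fsspec


class FakeLoader:
    def __init__(self, path: str):
        # Pretend the media file was split into two 30-second chunks.
        self.files_completed = [(f"{path}.part0", 30.0), (f"{path}.part1", 30.0)]


payload_b64 = base64.b64encode(b"fake media bytes").decode("utf-8")
upload_path = "./recording.mp3"

with fsspec.open(upload_path, "wb") as f:  # same write pattern as the endpoint
    f.write(base64.b64decode(payload_b64))

end = 0
for idx, (file_path, duration) in enumerate(FakeLoader(upload_path).files_completed):
    start, end = end, int(end + duration)
    chunk = {"bytes": file_path.encode("utf-8"), "chunk_index": idx, "start": start, "end": end}
    print(chunk)  # each chunk would then go through _prepare_chunk_submission
```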
@@ -5,7 +5,6 @@
 
 import logging
 from typing import Optional
-
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
         self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
-
         # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
         self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import ray
+
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
+from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from typing import Optional
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class OCRExtractorStage(RayActorStage):
+    """
+    A Ray actor stage that extracts text data from image content.
+
+    It expects an IngestControlMessage containing a DataFrame with image data. It then:
+      1. Removes the "text_data_extract" task from the message.
+      2. Calls the text extraction logic using a validated configuration.
+      3. Updates the message payload with the extracted text DataFrame.
+    """
+
+    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        try:
+            self.validated_config = config
+            self._logger.info("OCRExtractorStage configuration validated successfully.")
+        except Exception as e:
+            self._logger.exception(f"Error validating Text extractor config: {e}")
+            raise
+
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
+    @filter_by_task(required_tasks=["ocr_data_extract"])
+    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+        # Extract DataFrame payload
+        df_ledger = control_message.payload()
+        if df_ledger.empty:
+            return control_message
+
+        # Remove the "text_data_extract" task from the message
+        task_config = remove_task_by_type(control_message, "ocr_data_extract")
+
+        execution_trace_log = {}
+        new_df, extraction_info = extract_text_data_from_image_internal(
+            df_extraction_ledger=df_ledger,
+            task_config=task_config,
+            extraction_config=self.validated_config,
+            execution_trace_log=execution_trace_log,
+        )
+
+        control_message.payload(new_df)
+        control_message.set_metadata("ocr_extraction_info", extraction_info)
+
+        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
+        if do_trace_tagging and execution_trace_log:
+            parent_name = self.stage_name if self.stage_name else "ocr_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
+
+        return control_message
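
The new `OCRExtractorStage` follows the same Ray actor pattern as the other extractor stages. As a rough sketch of how such a stage would be driven (the default-constructed schema and the commented-out control message are assumptions for illustration, not package fixtures):

```python
# Hypothetical driver sketch: shows the generic Ray actor-stage call pattern only.
# OCRExtractorSchema's fields and the IngestControlMessage construction are assumed
# here, not taken from the package.
import ray

from nv_ingest.framework.orchestration.ray.stages.extractors.ocr_extractor import OCRExtractorStage
from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema

ray.init(ignore_reinit_error=True)

config = OCRExtractorSchema()  # assumes defaults are acceptable
stage = OCRExtractorStage.remote(config, stage_name="ocr_extractor")

# `msg` would be an IngestControlMessage carrying a DataFrame of image rows and an
# "ocr_data_extract" task; messages without that task pass through untouched.
# result = ray.get(stage.on_data.remote(msg))
```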
@@ -3,7 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
+import os
 from typing import Dict, Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 import ray
@@ -26,7 +28,8 @@ logger = logging.getLogger(__name__)
 @ray.remote
 class ImageStorageStage(RayActorStage):
     """
-    A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
+    A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
+    metadata with storage URLs.
 
     This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
     payload and updates the control message accordingly.
@@ -69,8 +72,16 @@
         task_config = remove_task_by_type(control_message, "store")
         # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
 
-        store_structured: bool = task_config.get("structured", True)
-        store_unstructured: bool = task_config.get("images", False)
+        stage_defaults = {
+            "structured": self.validated_config.structured,
+            "images": self.validated_config.images,
+            "storage_uri": self.validated_config.storage_uri,
+            "storage_options": self.validated_config.storage_options,
+            "public_base_url": self.validated_config.public_base_url,
+        }
+
+        store_structured: bool = task_config.get("structured", stage_defaults["structured"])
+        store_unstructured: bool = task_config.get("images", stage_defaults["images"])
 
         content_types: Dict[Any, Any] = {}
         if store_structured:
@@ -80,14 +91,34 @@
             content_types[ContentTypeEnum.IMAGE] = store_unstructured
 
         params: Dict[str, Any] = task_config.get("params", {})
-        params["content_types"] = content_types
 
-        logger.debug(f"Processing storage task with parameters: {params}")
+        storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
+        storage_options = {
+            **(stage_defaults["storage_options"] or {}),
+            **(task_config.get("storage_options") or {}),
+            **params.get("storage_options", {}),
+        }
+        if "public_base_url" in task_config:
+            public_base_url = task_config["public_base_url"]
+        else:
+            public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
+
+        storage_options = self._inject_storage_defaults(storage_uri, storage_options)
+
+        storage_params: Dict[str, Any] = {
+            "content_types": content_types,
+            "storage_uri": storage_uri,
+            "storage_options": storage_options,
+        }
+        if public_base_url:
+            storage_params["public_base_url"] = public_base_url
+
+        logger.debug("Processing storage task with parameters: %s", storage_params)
 
         # Store images or structured content.
         df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
             df_storage_ledger=df_payload,
-            task_config=params,
+            task_config=storage_params,
             storage_config={},
             execution_trace_log=None,
         )
@@ -98,3 +129,38 @@
         control_message.payload(df_storage_ledger)
 
         return control_message
+
+    @staticmethod
+    def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
+        """
+        parsed_scheme = urlparse(storage_uri).scheme.lower()
+        merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
+
+        if parsed_scheme not in {"s3", "s3a", "s3n"}:
+            return merged_options
+
+        def _set_if_absent(key: str, env_var: str) -> None:
+            if key not in merged_options and env_var in os.environ:
+                merged_options[key] = os.environ[env_var]
+
+        _set_if_absent("key", "MINIO_ACCESS_KEY")
+        _set_if_absent("secret", "MINIO_SECRET_KEY")
+        if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
+            merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
+
+        client_kwargs = dict(merged_options.get("client_kwargs", {}))
+        endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
+        if not endpoint:
+            endpoint = "http://minio:9000"
+        if endpoint and not endpoint.startswith(("http://", "https://")):
+            endpoint = f"http://{endpoint}"
+        client_kwargs.setdefault("endpoint_url", endpoint)
+        region = os.environ.get("MINIO_REGION")
+        if region:
+            client_kwargs.setdefault("region_name", region)
+        if client_kwargs:
+            merged_options["client_kwargs"] = client_kwargs
+
+        return merged_options
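
A small illustration of how the reworked `ImageStorageStage` resolves its storage settings: task-level values win over `params`, which win over the stage defaults, and S3-style URIs then pick up MinIO credentials and an endpoint from the environment. All values below are made up for the example.

```python
# Sketch of the option-merge precedence used by the stage; not the packaged code.
import os

stage_defaults = {"storage_uri": "s3://nv-ingest/artifacts/store/images", "storage_options": {}}
task_config = {"storage_options": {"key": "task-level-access-key"}}
params = {"storage_options": {"anon": False}}

storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
storage_options = {
    **(stage_defaults["storage_options"] or {}),
    **(task_config.get("storage_options") or {}),
    **params.get("storage_options", {}),
}

# With MINIO_SECRET_KEY set and no MINIO_INTERNAL_ADDRESS, the defaults-injection step
# would add the secret and point client_kwargs at http://minio:9000 for this S3 URI.
os.environ.setdefault("MINIO_SECRET_KEY", "example-secret")
print(storage_uri)
print(storage_options)  # {'key': 'task-level-access-key', 'anon': False}
```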
@@ -128,6 +128,13 @@ stages:
         ]
         yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
         auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
+      pdfium_config:
+        yolox_endpoints: [
+          $YOLOX_GRPC_ENDPOINT|"",
+          $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
+        ]
+        yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
+        auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
     replicas:
       min_replicas: 0
       max_replicas:
@@ -149,6 +156,13 @@ stages:
         ]
         yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
         auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
+      pdfium_config:
+        yolox_endpoints: [
+          $YOLOX_GRPC_ENDPOINT|"",
+          $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
+        ]
+        yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
+        auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
     replicas:
       min_replicas: 0
       max_replicas:
@@ -201,7 +215,7 @@ stages:
       endpoint_config:
         ocr_endpoints: [
           $OCR_GRPC_ENDPOINT|"",
-          $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
+          $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
         ]
         ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
         auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -227,7 +241,7 @@ stages:
         yolox_infer_protocol: $YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL|"http"
         ocr_endpoints: [
           $OCR_GRPC_ENDPOINT|"",
-          $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
+          $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
         ]
         ocr_infer_protocol: $PADDLE_INFER_PROTOCOL|"http"
         auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -254,7 +268,7 @@ stages:
         yolox_infer_protocol: $YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL|"http"
         ocr_endpoints: [
           $OCR_GRPC_ENDPOINT|"",
-          $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
+          $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
         ]
         ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
         auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -123,7 +123,14 @@ stages:
       docx_extraction_config:
         yolox_endpoints: [
           $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
-          $YOLOX_HTTP_ENDPOINT|"",
+          $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
+        ]
+        yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
+        auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
+      pdfium_config:
+        yolox_endpoints: [
+          $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
+          $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
         ]
         yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
         auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -148,6 +155,13 @@ stages:
         ]
         yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
         auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
+      pdfium_config:
+        yolox_endpoints: [
+          $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
+          $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
+        ]
+        yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
+        auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
     replicas:
       min_replicas: 0
       max_replicas:
@@ -192,6 +206,27 @@ stages:
         strategy: "static"
         value: 1
 
+  - name: "ocr_extractor"
+    type: "stage"
+    phase: 1 # EXTRACTION
+    actor: "nv_ingest.framework.orchestration.ray.stages.extractors.ocr_extractor:OCRExtractorStage"
+    config:
+      endpoint_config:
+        ocr_endpoints: [
+          $OCR_GRPC_ENDPOINT|"ocr:8001",
+          $OCR_HTTP_ENDPOINT|"http://ocr:8000/v1/infer",
+        ]
+        ocr_infer_protocol: $OCR_INFER_PROTOCOL|grpc
+        auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
+    replicas:
+      min_replicas: 0
+      max_replicas:
+        strategy: "static"
+        value: 4
+      static_replicas:
+        strategy: "static"
+        value: 3
+
   - name: "infographic_extractor"
     type: "stage"
     phase: 1 # EXTRACTION
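
The stage configs above use a `$VAR|default` placeholder convention for endpoints and tokens. A simplified sketch of how such a placeholder could resolve is shown below; it does not handle the chained `$NGC_API_KEY|$NVIDIA_API_KEY` form and is not the package's actual config loader.

```python
# Illustration of the "$VAR|default" pattern: use the environment variable if set,
# otherwise fall back to the literal default. Simplified, single-fallback version.
import os


def resolve(placeholder: str) -> str:
    name, _, default = placeholder.partition("|")
    return os.environ.get(name.lstrip("$"), default.strip('"'))


os.environ.pop("OCR_HTTP_ENDPOINT", None)
print(resolve('$OCR_HTTP_ENDPOINT|"http://ocr:8000/v1/infer"'))  # -> http://ocr:8000/v1/infer

os.environ["OCR_HTTP_ENDPOINT"] = "http://my-ocr:9000/v1/infer"
print(resolve('$OCR_HTTP_ENDPOINT|"http://ocr:8000/v1/infer"'))  # -> http://my-ocr:9000/v1/infer
```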
@@ -351,6 +386,9 @@ stages:
     type: "stage"
     phase: 5 # RESPONSE
     actor: "nv_ingest.framework.orchestration.ray.stages.storage.image_storage:ImageStorageStage"
+    config:
+      storage_uri: $IMAGE_STORAGE_URI|"s3://nv-ingest/artifacts/store/images"
+      public_base_url: $IMAGE_STORAGE_PUBLIC_BASE_URL|""
     replicas:
       min_replicas: 0
       max_replicas:
@@ -461,6 +499,9 @@ edges:
     to: "chart_extractor"
     queue_size: 4
   - from: "chart_extractor"
+    to: "ocr_extractor"
+    queue_size: 8
+  - from: "ocr_extractor"
     to: "image_filter"
     queue_size: 4
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.11.10.dev20251110
+Version: 2025.12.5.dev20251205
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -219,6 +219,8 @@ Requires-Dist: diskcache>=5.6.3
 Requires-Dist: fastapi>=0.115.6
 Requires-Dist: fastparquet>=2024.11.0
 Requires-Dist: fsspec>=2024.10.0
+Requires-Dist: universal_pathlib>=0.2.6
+Requires-Dist: s3fs>=2024.10.0
 Requires-Dist: gunicorn
 Requires-Dist: h11>=0.16.0
 Requires-Dist: httpx>=0.28.1
@@ -226,7 +228,6 @@ Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
 Requires-Dist: librosa>=0.10.2
-Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -9,7 +9,7 @@ nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19
 nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
 nv_ingest/api/v2/README.md,sha256=VhpdjEmCyr3qIOhwqISFx9C5WezJFcxYc-NB9S98HMg,7562
 nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/api/v2/ingest.py,sha256=CFLRw9y0N0AklQWsH1wYDHUjxrfkvOmE97aFcaBViWw,48525
+nv_ingest/api/v2/ingest.py,sha256=ikbZE2eAjSnFmt5CcpTduY1t9DsUQBhnBQlsd3HaBww,53103
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -38,12 +38,13 @@ nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=t9lf6zTj
 nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=GGY6_i6_g5xTFzdo9Qmsu9i4knMTq6pJfgm-aaPEt_o,17226
 nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=4SdgvzI9oJ_OK5oWGir9wXVIPV4Pont2EKv9mwcWMC0,3631
+nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=UVp_kDmkaBlfO0Mbl_IxKq6imzLvs4-DKHgUHJIh3mo,3629
 nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py,sha256=rfaDx6PqRCguhSYkJI6iVmMMtAlJNxzKfUrLmw_fKqs,4381
 nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py,sha256=R4vshPcAUN2U6BIv8BCZQ862wLx8RJhCGXfpQ3K09Bs,3627
 nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py,sha256=7JrZSVIrK4_wr2s7TOTss7pgTY2F9GPQ7Ze3F_WFlKU,3642
 nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py,sha256=iY9fEfucfgCmO2ixX6qwn418J97nJz_FQGh7B6yziVo,3980
 nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py,sha256=v5J7dnJBEaDfjoTz_N_yC3RAt6lwMLgLT28V-ahquLE,3261
+nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py,sha256=pwVoA5-CF9GVWusoFZOMGBvSyW5udD9bdxVJXA_SghE,3188
 nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=QagIA99AsHLihjRbXm-2BphdoQGHwzOHlqLyz7oDOSk,4992
 nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py,sha256=RMbbl7Cuj4BT-TcgUx_0k8R-DLdw-o3fHxcIBIgrWt4,3776
 nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py,sha256=p71ktv6v5T-9npYpCbgbwW6-fS-65UWS7rCm8OWr2Bc,4170
@@ -63,7 +64,7 @@ nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py,s
 nv_ingest/framework/orchestration/ray/stages/sources/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=LrqaWpWyuiAHlpXWKYSyHZJBFegGXfNlpCXrucbK5NM,24067
 nv_ingest/framework/orchestration/ray/stages/storage/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=WZN_-3Li-izDaPtk8IMrtn2os1ckT3U8Rb2PsfOWrcI,4009
+nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=f1iA7rjYFA1G1EXqFM6URUi_QRql1Y1OrnMPKONsSqo,6907
 nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py,sha256=EUtwhSDf-qGLVEhWEInr1VaLsvpcHUSyzCmHQVai-Ps,3547
 nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py,sha256=jEtEUibqs6IS6QakrzWY9zmxSUzuBpg_hzXy2R-I10Y,2870
@@ -110,15 +111,15 @@ nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uN
 nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
 nv_ingest/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=yNJtjfHQyxtasGa1hQrvgX7UrPa7BAd0oog8EIN8Y_w,15592
-nv_ingest/pipeline/default_pipeline_impl.py,sha256=vQvP6VMEOPoFMtHDaMEhBMXQWI8L8iYh-vM6i_EVmBI,15339
+nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=M31VN1xVTdoiNdjaSSPKEZr-yKhXDSwQ1hAVIkpJZLw,16232
+nv_ingest/pipeline/default_pipeline_impl.py,sha256=TW9N9UcgsBL5SG1pxuSdgBIyFpBORskbHCmvJBmIIuw,16770
 nv_ingest/pipeline/ingest_pipeline.py,sha256=wHAJhqAM2s8nbY-8itVogmSU-yVN4PZONGWcKnhzgfg,17794
 nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3ZEMkkoBcg,17940
 nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
 nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
-nv_ingest-2025.11.10.dev20251110.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.11.10.dev20251110.dist-info/METADATA,sha256=XOhTXZjQHdZ0AD9hFmZTZaqukc7fqM2C-Qv0cHZYtHw,15122
-nv_ingest-2025.11.10.dev20251110.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest-2025.11.10.dev20251110.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.11.10.dev20251110.dist-info/RECORD,,
+nv_ingest-2025.12.5.dev20251205.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.12.5.dev20251205.dist-info/METADATA,sha256=xmWQacIeqshaQ9LxIv90A2xCEFrCGXzjYYIQBwKyHf0,15162
+nv_ingest-2025.12.5.dev20251205.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.12.5.dev20251205.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.12.5.dev20251205.dist-info/RECORD,,