nv-ingest 2025.11.10.dev20251110__py3-none-any.whl → 2025.11.19.dev20251119__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they were published.
@@ -13,6 +13,8 @@ import os
 import time
 import uuid
 import random
+from pathlib import Path
+import fsspec

 from fastapi import APIRouter, Request, Response
 from fastapi import HTTPException
@@ -21,6 +23,8 @@ from redis import RedisError

 from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
 from nv_ingest_api.util.service_clients.client_base import FetchMode
+from nv_ingest_api.util.dataloader.dataloader import DataLoader
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum

 # For PDF splitting
 import pypdfium2 as pdfium
@@ -188,28 +192,42 @@ def get_pdf_page_count(pdf_content: bytes) -> int:
         return 1 # Assume single page on error


-def _prepare_chunk_submission(
+def _create_subjob_dict(
+    job_id: str,
+    job_payload: Dict[str, Any],
     job_spec_template: Dict[str, Any],
-    chunk: Dict[str, Any],
-    *,
-    parent_uuid: uuid.UUID,
-    parent_job_id: str,
     current_trace_id: int,
-    original_source_id: str,
-    original_source_name: str,
-) -> Tuple[str, MessageWrapper]:
-    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
-
-    chunk_number = chunk["chunk_index"] + 1
-    start_page = chunk["start_page"]
-    end_page = chunk["end_page"]
-
-    subjob_spec = {
+    parent_job_id: str,
+    start_key: Dict[str, Any],
+) -> Dict[str, Any]:
+    job_spec = {
         key: value
         for key, value in job_spec_template.items()
         if key not in {"job_payload", "job_id", "tracing_options"}
     }
+    job_spec["job_payload"] = job_payload
+    job_spec["job_id"] = job_id

+    base_tracing_options = job_spec_template.get("tracing_options") or {}
+    tracing_options = dict(base_tracing_options)
+    tracing_options.setdefault("trace", True)
+    tracing_options["trace_id"] = str(current_trace_id)
+    tracing_options["ts_send"] = int(time.time() * 1000)
+    tracing_options["parent_job_id"] = parent_job_id
+    for key, value in start_key.items():
+        tracing_options[key] = value
+
+    job_spec["tracing_options"] = tracing_options
+    return job_spec
+
+
+def _create_payload_dict(
+    job_spec_template: Dict[str, Any],
+    content: str,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Dict[str, Any]:
     subjob_payload_template = job_spec_template.get("job_payload", {})
     subjob_payload = {
         key: value
@@ -217,27 +235,40 @@ def _prepare_chunk_submission(
         if key not in {"content", "source_id", "source_name"}
     }

-    chunk_bytes = chunk["bytes"]
-    subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+    subjob_payload["content"] = [content]

-    page_suffix = f"page_{start_page}" if start_page == end_page else f"pages_{start_page}-{end_page}"
-    subjob_payload["source_id"] = [f"{original_source_id}#{page_suffix}"]
-    subjob_payload["source_name"] = [f"{original_source_name}#{page_suffix}"]
+    subjob_payload["source_id"] = [source_id]
+    subjob_payload["source_name"] = [source_name]
+    subjob_payload["document_type"] = [document_type]
+    return subjob_payload
+
+
+def _prepare_chunk_submission(
+    job_spec_template: Dict[str, Any],
+    chunk: Dict[str, Any],
+    *,
+    parent_uuid: uuid.UUID,
+    parent_job_id: str,
+    current_trace_id: int,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Tuple[str, MessageWrapper]:
+    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+    chunk_number = chunk["chunk_index"] + 1

     subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
     subjob_id = str(subjob_uuid)
-    subjob_spec["job_payload"] = subjob_payload
-    subjob_spec["job_id"] = subjob_id

-    base_tracing_options = job_spec_template.get("tracing_options") or {}
-    tracing_options = dict(base_tracing_options)
-    tracing_options.setdefault("trace", True)
-    tracing_options["trace_id"] = str(current_trace_id)
-    tracing_options["ts_send"] = int(time.time() * 1000)
-    tracing_options["parent_job_id"] = parent_job_id
-    tracing_options["page_num"] = start_page
+    subjob_payload_template = job_spec_template.get("job_payload", {})
+    chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
+    subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
+    start = chunk["start_page"] if "start_page" in chunk else chunk["start"]

-    subjob_spec["tracing_options"] = tracing_options
+    subjob_spec = _create_subjob_dict(
+        subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
+    )

     return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))

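The net effect of the refactor is that _prepare_chunk_submission becomes a thin wrapper: it derives a deterministic subjob id, builds the payload with _create_payload_dict, and assembles the spec plus tracing options with _create_subjob_dict. A minimal sketch of driving the wrapper directly, assuming these private helpers stay importable from nv_ingest.api.v2.ingest; the template, chunk, and identifiers below are invented for illustration:

import uuid

from nv_ingest.api.v2.ingest import _prepare_chunk_submission

# Hypothetical job spec template and PDF chunk; in the service these come from the V2 submit path.
job_spec_template = {
    "job_payload": {"content": ["..."], "source_id": ["report.pdf"], "source_name": ["report.pdf"]},
    "tracing_options": {"trace": True},
}
chunk = {"chunk_index": 0, "start_page": 1, "end_page": 4, "bytes": b"%PDF-1.7 ...", "page_count": 4}

parent_job_id = "1f0c9a3e-6c2d-5e1a-9b7f-0123456789ab"  # placeholder UUID string
subjob_id, wrapper = _prepare_chunk_submission(
    job_spec_template,
    chunk,
    parent_uuid=uuid.UUID(parent_job_id),
    parent_job_id=parent_job_id,
    current_trace_id=0x1234,
    source_id="report.pdf#pages_1-4",
    source_name="report.pdf#pages_1-4",
    document_type="pdf",
)

# subjob_id is uuid5(parent_uuid, "chunk-1"), so the same parent always yields the same
# chunk identifiers; wrapper.payload carries the JSON-encoded subjob spec.
print(subjob_id, len(wrapper.payload))
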
@@ -801,6 +832,8 @@ async def submit_job_v2(
     request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
 ):
     span = trace.get_current_span()
+    source_id = None
+    document_type = None
     try:
         span.add_event("Submitting file for processing (V2)")

@@ -827,7 +860,19 @@

         # Track page count for all PDFs (used for both splitting logic and metadata)
         pdf_page_count_cache = None
-
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        subjob_ids: List[str] = []
+        subjob_descriptors: List[Dict[str, Any]] = []
+        parent_metadata: Dict[str, Any] = {}
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        try:
+            parent_uuid = uuid.UUID(parent_job_id)
+        except ValueError:
+            logger.warning(
+                "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                parent_job_id,
+            )
+            parent_uuid = uuid.uuid4()
         # Check if this is a PDF that needs splitting
         if document_types and payloads and document_types[0].lower() == "pdf":
             # Decode the payload to check page count
@@ -836,6 +881,7 @@
             pdf_page_count_cache = page_count # Cache for later use
             qos_tier = get_qos_tier_for_page_count(page_count)
             pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
+            document_type = DocumentTypeEnum.PDF

             # Split if the document has more pages than our chunk size
             if page_count > pages_per_chunk:
@@ -846,13 +892,11 @@
                     page_count,
                     qos_tier,
                 )
-
                 chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)

                 subjob_ids: List[str] = []
                 subjob_descriptors: List[Dict[str, Any]] = []
                 submission_items: List[Tuple[str, MessageWrapper]] = []
-
                 try:
                     parent_uuid = uuid.UUID(parent_job_id)
                 except ValueError:
@@ -863,14 +907,20 @@
                     parent_uuid = uuid.uuid4()

                 for chunk in chunks:
+                    start = chunk["start_page"]
+                    end = chunk["end_page"]
+                    page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
+                    source_id = f"{original_source_id}#{page_suffix}"
+                    source_name = f"{original_source_name}#{page_suffix}"
                     subjob_id, subjob_wrapper = _prepare_chunk_submission(
                         job_spec_dict,
                         chunk,
+                        document_type=DocumentTypeEnum.PDF,
                         parent_uuid=parent_uuid,
                         parent_job_id=parent_job_id,
                         current_trace_id=current_trace_id,
-                        original_source_id=original_source_id,
-                        original_source_name=original_source_name,
+                        source_id=source_id,
+                        source_name=source_name,
                     )

                     # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
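The per-chunk source identifiers now get their page-range suffix here in the caller rather than inside the helper. The naming scheme is easy to check in isolation; the function below simply mirrors the suffix expression used in the loop above, and the file name is invented:

def page_suffix(start: int, end: int) -> str:
    # Same expression as the split loop above: single pages vs. inclusive ranges.
    return f"page_{start}" if start == end else f"pages_{start}-{end}"

assert page_suffix(3, 3) == "page_3"
assert page_suffix(1, 4) == "pages_1-4"

# A 10-page "report.pdf" split 4 pages per chunk would therefore be labelled
# report.pdf#pages_1-4, report.pdf#pages_5-8, report.pdf#pages_9-10
# (assuming 1-indexed, inclusive page ranges from split_pdf_to_chunks).
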
@@ -895,38 +945,98 @@ async def submit_job_v2(
                             "page_count": chunk.get("page_count"),
                         }
                     )
+                parent_metadata.update(
+                    {
+                        "total_pages": page_count,
+                        "pages_per_chunk": pages_per_chunk,
+                        "original_source_id": original_source_id,
+                        "original_source_name": original_source_name,
+                        "document_type": document_types[0] if document_types else "pdf",
+                        "subjob_order": subjob_ids,
+                    }
+                )
+        elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
+            document_type = document_types[0]
+            upload_path = f"./{Path(original_source_id).name}"
+            # dump the payload to a file, just came from client
+            with fsspec.open(upload_path, "wb") as f:
+                f.write(base64.b64decode(payloads[0]))
+            dataloader = DataLoader(
+                path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
+            )
+            document_type = DocumentTypeEnum.MP3
+
+            parent_uuid = uuid.UUID(parent_job_id)
+            for task in job_spec_dict["tasks"]:
+                if "task_properties" in task and "document_type" in task["task_properties"]:
+                    task["task_properties"]["document_type"] = document_type
+            end = 0
+            for idx, (file_path, duration) in enumerate(dataloader.files_completed):
+                start = end
+                end = int(start + duration)
+                chunk = {
+                    "bytes": file_path.encode("utf-8"),
+                    "chunk_index": idx,
+                    "start": start,
+                    "end": end,
+                }

-                if submission_items:
-                    burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
-                    await _submit_subjobs_in_bursts(
-                        submission_items,
-                        ingest_service,
-                        burst_size=burst_size,
-                        pause_ms=pause_ms,
-                        jitter_ms=jitter_ms,
-                    )
+                subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                    job_spec_dict,
+                    chunk,
+                    parent_uuid=parent_uuid,
+                    parent_job_id=parent_job_id,
+                    current_trace_id=current_trace_id,
+                    source_id=file_path,
+                    source_name=upload_path,
+                    document_type=document_type,
+                )

-                parent_metadata: Dict[str, Any] = {
-                    "total_pages": page_count,
-                    "pages_per_chunk": pages_per_chunk,
+                submission_items.append((subjob_id, subjob_wrapper))
+                subjob_ids.append(subjob_id)
+                subjob_descriptors.append(
+                    {
+                        "job_id": subjob_id,
+                        "chunk_index": idx + 1,
+                        "start_page": chunk.get("start"),
+                        "end_page": chunk.get("end"),
+                        "page_count": chunk.get("page_count", 0),
+                    }
+                )
+            logger.error(f"Removing uploaded file {upload_path}")
+            os.remove(upload_path)
+
+        if submission_items:
+            burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
+            await _submit_subjobs_in_bursts(
+                submission_items,
+                ingest_service,
+                burst_size=burst_size,
+                pause_ms=pause_ms,
+                jitter_ms=jitter_ms,
+            )
+
+            parent_metadata.update(
+                {
                     "original_source_id": original_source_id,
                     "original_source_name": original_source_name,
-                    "document_type": document_types[0] if document_types else "pdf",
+                    "document_type": document_type,
                     "subjob_order": subjob_ids,
                 }
+            )
+            # raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+            await ingest_service.set_parent_job_mapping(
+                parent_job_id,
+                subjob_ids,
+                parent_metadata,
+                subjob_descriptors=subjob_descriptors,
+            )

-                await ingest_service.set_parent_job_mapping(
-                    parent_job_id,
-                    subjob_ids,
-                    parent_metadata,
-                    subjob_descriptors=subjob_descriptors,
-                )
-
-                await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+            await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)

-                span.add_event(f"Split into {len(subjob_ids)} subjobs")
-                response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
-                return parent_job_id
+            span.add_event(f"Split into {len(subjob_ids)} subjobs")
+            response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+            return parent_job_id

         # For non-PDFs or cases where splitting is not required, submit as normal
         if "tracing_options" not in job_spec_dict:
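Both the PDF and the new media branch funnel their subjobs through _submit_subjobs_in_bursts, whose implementation is not part of this hunk. The pacing idea it is named after looks roughly like the sketch below; this is an illustration of the pattern only, not the packaged code, and the default parameter values are invented:

import asyncio
import random


async def submit_in_bursts(items, submit_one, *, burst_size=16, pause_ms=50, jitter_ms=25):
    """Submit (job_id, wrapper) pairs in fixed-size bursts, sleeping briefly between bursts."""
    for offset in range(0, len(items), burst_size):
        burst = items[offset : offset + burst_size]
        await asyncio.gather(*(submit_one(job_id, wrapper) for job_id, wrapper in burst))
        if offset + burst_size < len(items):
            # Pause with jitter so many large documents submitted at once do not hit Redis in lockstep.
            await asyncio.sleep((pause_ms + random.uniform(0, jitter_ms)) / 1000.0)
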
@@ -982,8 +1092,8 @@ async def submit_job_v2(
         return parent_job_id

     except Exception as ex:
-        logger.exception(f"Error submitting job: {str(ex)}")
-        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+        logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
+        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")


 # GET /v2/fetch_job
@@ -5,7 +5,6 @@

 import logging
 from typing import Optional
-
 import ray

 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
         self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
-
         # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
         self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import ray
+
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
+from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from typing import Optional
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class OCRExtractorStage(RayActorStage):
+    """
+    A Ray actor stage that extracts text data from image content.
+
+    It expects an IngestControlMessage containing a DataFrame with image data. It then:
+    1. Removes the "text_data_extract" task from the message.
+    2. Calls the text extraction logic using a validated configuration.
+    3. Updates the message payload with the extracted text DataFrame.
+    """
+
+    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        try:
+            self.validated_config = config
+            self._logger.info("OCRExtractorStage configuration validated successfully.")
+        except Exception as e:
+            self._logger.exception(f"Error validating Text extractor config: {e}")
+            raise
+
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
+    @filter_by_task(required_tasks=["ocr_data_extract"])
+    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+        # Extract DataFrame payload
+        df_ledger = control_message.payload()
+        if df_ledger.empty:
+            return control_message
+
+        # Remove the "text_data_extract" task from the message
+        task_config = remove_task_by_type(control_message, "ocr_data_extract")
+
+        execution_trace_log = {}
+        new_df, extraction_info = extract_text_data_from_image_internal(
+            df_extraction_ledger=df_ledger,
+            task_config=task_config,
+            extraction_config=self.validated_config,
+            execution_trace_log=execution_trace_log,
+        )
+
+        control_message.payload(new_df)
+        control_message.set_metadata("ocr_extraction_info", extraction_info)
+
+        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
+        if do_trace_tagging and execution_trace_log:
+            parent_name = self.stage_name if self.stage_name else "ocr_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
+
+        return control_message
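Like the other extractor stages, OCRExtractorStage is a @ray.remote actor whose on_data only does work when the incoming message carries an "ocr_data_extract" task. A minimal sketch of standing the actor up outside the pipeline; the no-argument OCRExtractorSchema() and the control_message are placeholders, since in practice the pipeline config supplies both:

import ray

from nv_ingest.framework.orchestration.ray.stages.extractors.ocr_extractor import OCRExtractorStage
from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema

ray.init(ignore_reinit_error=True)

# Placeholder config; real deployments populate endpoint_config from the pipeline YAML.
config = OCRExtractorSchema()
actor = OCRExtractorStage.remote(config, stage_name="ocr_extractor")

# control_message must be an IngestControlMessage whose task list includes "ocr_data_extract";
# messages without that task are expected to pass through the @filter_by_task guard unchanged.
# result = ray.get(actor.on_data.remote(control_message))
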
@@ -192,6 +192,27 @@ stages:
         strategy: "static"
         value: 1

+  - name: "ocr_extractor"
+    type: "stage"
+    phase: 1 # EXTRACTION
+    actor: "nv_ingest.framework.orchestration.ray.stages.extractors.ocr_extractor:OCRExtractorStage"
+    config:
+      endpoint_config:
+        ocr_endpoints: [
+          $OCR_GRPC_ENDPOINT|"ocr:8001",
+          $OCR_HTTP_ENDPOINT|"http://ocr:8000/v1/infer",
+        ]
+        ocr_infer_protocol: $OCR_INFER_PROTOCOL|grpc
+        auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
+    replicas:
+      min_replicas: 0
+      max_replicas:
+        strategy: "static"
+        value: 4
+      static_replicas:
+        strategy: "static"
+        value: 3
+
   - name: "infographic_extractor"
     type: "stage"
     phase: 1 # EXTRACTION
@@ -461,6 +482,9 @@ edges:
     to: "chart_extractor"
     queue_size: 4
   - from: "chart_extractor"
+    to: "ocr_extractor"
+    queue_size: 8
+  - from: "ocr_extractor"
     to: "image_filter"
     queue_size: 4

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.11.10.dev20251110
+Version: 2025.11.19.dev20251119
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -226,7 +226,6 @@ Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
 Requires-Dist: librosa>=0.10.2
-Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -9,7 +9,7 @@ nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19
 nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
 nv_ingest/api/v2/README.md,sha256=VhpdjEmCyr3qIOhwqISFx9C5WezJFcxYc-NB9S98HMg,7562
 nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/api/v2/ingest.py,sha256=CFLRw9y0N0AklQWsH1wYDHUjxrfkvOmE97aFcaBViWw,48525
+nv_ingest/api/v2/ingest.py,sha256=ikbZE2eAjSnFmt5CcpTduY1t9DsUQBhnBQlsd3HaBww,53103
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -38,12 +38,13 @@ nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=t9lf6zTj
 nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=GGY6_i6_g5xTFzdo9Qmsu9i4knMTq6pJfgm-aaPEt_o,17226
 nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=4SdgvzI9oJ_OK5oWGir9wXVIPV4Pont2EKv9mwcWMC0,3631
+nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=UVp_kDmkaBlfO0Mbl_IxKq6imzLvs4-DKHgUHJIh3mo,3629
 nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py,sha256=rfaDx6PqRCguhSYkJI6iVmMMtAlJNxzKfUrLmw_fKqs,4381
 nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py,sha256=R4vshPcAUN2U6BIv8BCZQ862wLx8RJhCGXfpQ3K09Bs,3627
 nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py,sha256=7JrZSVIrK4_wr2s7TOTss7pgTY2F9GPQ7Ze3F_WFlKU,3642
 nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py,sha256=iY9fEfucfgCmO2ixX6qwn418J97nJz_FQGh7B6yziVo,3980
 nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py,sha256=v5J7dnJBEaDfjoTz_N_yC3RAt6lwMLgLT28V-ahquLE,3261
+nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py,sha256=pwVoA5-CF9GVWusoFZOMGBvSyW5udD9bdxVJXA_SghE,3188
 nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=QagIA99AsHLihjRbXm-2BphdoQGHwzOHlqLyz7oDOSk,4992
 nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py,sha256=RMbbl7Cuj4BT-TcgUx_0k8R-DLdw-o3fHxcIBIgrWt4,3776
 nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py,sha256=p71ktv6v5T-9npYpCbgbwW6-fS-65UWS7rCm8OWr2Bc,4170
@@ -111,14 +112,14 @@ nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusX
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
 nv_ingest/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=yNJtjfHQyxtasGa1hQrvgX7UrPa7BAd0oog8EIN8Y_w,15592
-nv_ingest/pipeline/default_pipeline_impl.py,sha256=vQvP6VMEOPoFMtHDaMEhBMXQWI8L8iYh-vM6i_EVmBI,15339
+nv_ingest/pipeline/default_pipeline_impl.py,sha256=DhClC17lWUvtBIi2mCC4WkLWT0lxY-CFY0n6nriAxas,16017
 nv_ingest/pipeline/ingest_pipeline.py,sha256=wHAJhqAM2s8nbY-8itVogmSU-yVN4PZONGWcKnhzgfg,17794
 nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3ZEMkkoBcg,17940
 nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
 nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
-nv_ingest-2025.11.10.dev20251110.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.11.10.dev20251110.dist-info/METADATA,sha256=XOhTXZjQHdZ0AD9hFmZTZaqukc7fqM2C-Qv0cHZYtHw,15122
-nv_ingest-2025.11.10.dev20251110.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest-2025.11.10.dev20251110.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.11.10.dev20251110.dist-info/RECORD,,
+nv_ingest-2025.11.19.dev20251119.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.11.19.dev20251119.dist-info/METADATA,sha256=arJTf3Axy2qKAFDlP4lsKCftTw4vnJp3EECP6hmylYU,15092
+nv_ingest-2025.11.19.dev20251119.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.11.19.dev20251119.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.11.19.dev20251119.dist-info/RECORD,,