nv-ingest 2025.10.22.dev20251022__py3-none-any.whl → 2025.11.19.dev20251119__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -12,6 +12,9 @@ import logging
  import os
  import time
  import uuid
+ import random
+ from pathlib import Path
+ import fsspec

  from fastapi import APIRouter, Request, Response
  from fastapi import HTTPException
@@ -20,6 +23,8 @@ from redis import RedisError

  from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
  from nv_ingest_api.util.service_clients.client_base import FetchMode
+ from nv_ingest_api.util.dataloader.dataloader import DataLoader
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum

  # For PDF splitting
  import pypdfium2 as pdfium
@@ -44,6 +49,42 @@ router = APIRouter()

  DEFAULT_PDF_SPLIT_PAGE_COUNT = 32

+ # Default QoS thresholds (pages). Tunable via environment variables:
+ # QOS_MAX_PAGES_MICRO, QOS_MAX_PAGES_SMALL, QOS_MAX_PAGES_MEDIUM
+ _QOS_DEFAULTS = {
+     "micro": 8,
+     "small": 64,
+     "medium": 256,
+ }
+
+
+ def get_qos_tier_for_page_count(page_count: int) -> str:
+     """
+     Select QoS tier for a document based on its total page count.
+     Tiers: 'micro', 'small', 'medium', 'large', 'default'
+     Thresholds can be tuned via environment variables:
+     - QOS_MAX_PAGES_MICRO (default: 8)
+     - QOS_MAX_PAGES_SMALL (default: 64)
+     - QOS_MAX_PAGES_MEDIUM (default: 256)
+     Anything above MEDIUM is 'large'. Non-positive page_count returns 'default'.
+     """
+     try:
+         micro_max = int(os.getenv("QOS_MAX_PAGES_MICRO", str(_QOS_DEFAULTS["micro"])))
+         small_max = int(os.getenv("QOS_MAX_PAGES_SMALL", str(_QOS_DEFAULTS["small"])))
+         medium_max = int(os.getenv("QOS_MAX_PAGES_MEDIUM", str(_QOS_DEFAULTS["medium"])))
+     except ValueError:
+         micro_max, small_max, medium_max = _QOS_DEFAULTS["micro"], _QOS_DEFAULTS["small"], _QOS_DEFAULTS["medium"]
+
+     if page_count <= 0:
+         return "default"
+     if page_count <= micro_max:
+         return "micro"
+     if page_count <= small_max:
+         return "small"
+     if page_count <= medium_max:
+         return "medium"
+     return "large"
+

  def get_pdf_split_page_count(client_override: Optional[int] = None) -> int:
      """
@@ -151,28 +192,42 @@ def get_pdf_page_count(pdf_content: bytes) -> int:
          return 1 # Assume single page on error


- def _prepare_chunk_submission(
+ def _create_subjob_dict(
+     job_id: str,
+     job_payload: Dict[str, Any],
      job_spec_template: Dict[str, Any],
-     chunk: Dict[str, Any],
-     *,
-     parent_uuid: uuid.UUID,
-     parent_job_id: str,
      current_trace_id: int,
-     original_source_id: str,
-     original_source_name: str,
- ) -> Tuple[str, MessageWrapper]:
-     """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
-
-     chunk_number = chunk["chunk_index"] + 1
-     start_page = chunk["start_page"]
-     end_page = chunk["end_page"]
-
-     subjob_spec = {
+     parent_job_id: str,
+     start_key: Dict[str, Any],
+ ) -> Dict[str, Any]:
+     job_spec = {
          key: value
          for key, value in job_spec_template.items()
          if key not in {"job_payload", "job_id", "tracing_options"}
      }
+     job_spec["job_payload"] = job_payload
+     job_spec["job_id"] = job_id

+     base_tracing_options = job_spec_template.get("tracing_options") or {}
+     tracing_options = dict(base_tracing_options)
+     tracing_options.setdefault("trace", True)
+     tracing_options["trace_id"] = str(current_trace_id)
+     tracing_options["ts_send"] = int(time.time() * 1000)
+     tracing_options["parent_job_id"] = parent_job_id
+     for key, value in start_key.items():
+         tracing_options[key] = value
+
+     job_spec["tracing_options"] = tracing_options
+     return job_spec
+
+
+ def _create_payload_dict(
+     job_spec_template: Dict[str, Any],
+     content: str,
+     source_id: str,
+     source_name: str,
+     document_type: str,
+ ) -> Dict[str, Any]:
      subjob_payload_template = job_spec_template.get("job_payload", {})
      subjob_payload = {
          key: value
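
The start_key argument of the new _create_subjob_dict is a generic way to attach positional tracing fields. A hypothetical call (job_spec_template, current_trace_id, and parent_job_id stand in for values already available in the handler) would look like:

    # Hypothetical call; every start_key entry is copied into tracing_options.
    subjob_spec = _create_subjob_dict(
        job_id="subjob-1",
        job_payload={"content": ["<base64 chunk>"]},
        job_spec_template=job_spec_template,
        current_trace_id=current_trace_id,
        parent_job_id=parent_job_id,
        start_key={"page_num": 1},  # first page for a PDF chunk; audio chunks pass their start offset under the same key
    )
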
@@ -180,27 +235,40 @@ def _prepare_chunk_submission(
          if key not in {"content", "source_id", "source_name"}
      }

-     chunk_bytes = chunk["bytes"]
-     subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+     subjob_payload["content"] = [content]

-     page_suffix = f"page_{start_page}" if start_page == end_page else f"pages_{start_page}-{end_page}"
-     subjob_payload["source_id"] = [f"{original_source_id}#{page_suffix}"]
-     subjob_payload["source_name"] = [f"{original_source_name}#{page_suffix}"]
+     subjob_payload["source_id"] = [source_id]
+     subjob_payload["source_name"] = [source_name]
+     subjob_payload["document_type"] = [document_type]
+     return subjob_payload
+
+
+ def _prepare_chunk_submission(
+     job_spec_template: Dict[str, Any],
+     chunk: Dict[str, Any],
+     *,
+     parent_uuid: uuid.UUID,
+     parent_job_id: str,
+     current_trace_id: int,
+     source_id: str,
+     source_name: str,
+     document_type: str,
+ ) -> Tuple[str, MessageWrapper]:
+     """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+     chunk_number = chunk["chunk_index"] + 1

      subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
      subjob_id = str(subjob_uuid)
-     subjob_spec["job_payload"] = subjob_payload
-     subjob_spec["job_id"] = subjob_id

-     base_tracing_options = job_spec_template.get("tracing_options") or {}
-     tracing_options = dict(base_tracing_options)
-     tracing_options.setdefault("trace", True)
-     tracing_options["trace_id"] = str(current_trace_id)
-     tracing_options["ts_send"] = int(time.time() * 1000)
-     tracing_options["parent_job_id"] = parent_job_id
-     tracing_options["page_num"] = start_page
+     subjob_payload_template = job_spec_template.get("job_payload", {})
+     chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
+     subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
+     start = chunk["start_page"] if "start_page" in chunk else chunk["start"]

-     subjob_spec["tracing_options"] = tracing_options
+     subjob_spec = _create_subjob_dict(
+         subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
+     )

      return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
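
Putting the refactored helpers together, a hypothetical chunk submission (the chunk contents and surrounding variables are invented for illustration) would look like:

    # Hypothetical inputs; real values come from the parsed job spec and split_pdf_to_chunks.
    chunk = {"chunk_index": 0, "start_page": 1, "end_page": 32, "bytes": b"%PDF-1.7 ..."}
    subjob_id, wrapper = _prepare_chunk_submission(
        job_spec_template,
        chunk,
        parent_uuid=uuid.UUID(parent_job_id),
        parent_job_id=parent_job_id,
        current_trace_id=current_trace_id,
        source_id="report.pdf#pages_1-32",
        source_name="report.pdf#pages_1-32",
        document_type="pdf",
    )
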

@@ -432,6 +500,76 @@ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, A
      return trace_dict, annotations_dict


+ def _normalize_chunk_records(
+     records: Optional[List[Any]],
+     descriptor: Dict[str, Any],
+     parent_metadata: Dict[str, Any],
+ ) -> List[Any]:
+     """Re-map chunk-local metadata to document-level context for aggregation."""
+
+     if not isinstance(records, list):
+         return []
+
+     total_pages = parent_metadata.get("total_pages")
+     original_source_id = parent_metadata.get("original_source_id")
+     original_source_name = parent_metadata.get("original_source_name")
+
+     start_page = descriptor.get("start_page")
+     page_offset = start_page - 1 if isinstance(start_page, int) and start_page > 0 else 0
+
+     normalized_entries: List[Any] = []
+
+     for entry in records:
+         if not isinstance(entry, dict):
+             normalized_entries.append(entry)
+             continue
+
+         normalized_entry = entry.copy()
+         original_metadata = entry.get("metadata")
+
+         if isinstance(original_metadata, dict):
+             normalized_metadata = original_metadata.copy()
+             normalized_entry["metadata"] = normalized_metadata
+
+             original_source_meta = original_metadata.get("source_metadata")
+             if isinstance(original_source_meta, dict):
+                 normalized_source_meta = original_source_meta.copy()
+                 normalized_metadata["source_metadata"] = normalized_source_meta
+
+                 if original_source_id:
+                     normalized_source_meta["source_id"] = original_source_id
+                 if original_source_name:
+                     normalized_source_meta["source_name"] = original_source_name
+
+             original_content_meta = original_metadata.get("content_metadata")
+             if isinstance(original_content_meta, dict):
+                 normalized_content_meta = original_content_meta.copy()
+                 normalized_metadata["content_metadata"] = normalized_content_meta
+
+                 page_number = normalized_content_meta.get("page_number")
+                 if isinstance(page_number, int) and page_number >= 0:
+                     normalized_content_meta["page_number"] = page_number + page_offset
+
+                 if isinstance(total_pages, int) and isinstance(normalized_content_meta.get("page_count"), int):
+                     # Ensure optional per-record page count reflects the full document
+                     normalized_content_meta["page_count"] = total_pages
+
+                 original_hierarchy = original_content_meta.get("hierarchy")
+                 if isinstance(original_hierarchy, dict):
+                     normalized_hierarchy = original_hierarchy.copy()
+                     normalized_content_meta["hierarchy"] = normalized_hierarchy
+
+                     hierarchy_page = normalized_hierarchy.get("page")
+                     if isinstance(hierarchy_page, int) and hierarchy_page >= 0:
+                         normalized_hierarchy["page"] = hierarchy_page + page_offset
+                     if isinstance(total_pages, int):
+                         normalized_hierarchy["page_count"] = total_pages
+
+         normalized_entries.append(normalized_entry)
+
+     return normalized_entries
+
+
  def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
      """
      Aggregate chunk-level traces into parent-level metrics.
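
A small, self-contained example of the remapping that _normalize_chunk_records performs (the sample record and descriptor are invented):

    # Invented sample: one record produced by the chunk covering pages 33-64.
    records = [{"metadata": {"content_metadata": {"page_number": 2}}}]
    descriptor = {"start_page": 33, "end_page": 64}
    parent_metadata = {"total_pages": 96, "original_source_id": "report.pdf"}
    out = _normalize_chunk_records(records, descriptor, parent_metadata)
    # page_offset is 32, so the chunk-local page_number 2 becomes 34.
    assert out[0]["metadata"]["content_metadata"]["page_number"] == 34
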
@@ -574,7 +712,8 @@ def _build_aggregated_response(
          if result is not None:
              # Add page data to aggregated result
              if "data" in result:
-                 aggregated_result["data"].extend(result["data"])
+                 normalized_records = _normalize_chunk_records(result.get("data"), descriptor, metadata)
+                 aggregated_result["data"].extend(normalized_records)
              chunk_entry = dict(descriptor)
              aggregated_result["metadata"]["chunks"].append(chunk_entry)

@@ -631,6 +770,51 @@ def _build_aggregated_response(
      return aggregated_result


+ # ---------------------------------------------------------------------------
+ # Bursty submission helpers (fairness without long-lived in-flight tasks)
+ # ---------------------------------------------------------------------------
+
+
+ def _get_submit_burst_params() -> Tuple[int, int, int]:
+     """
+     Returns (burst_size, pause_ms, jitter_ms) from environment with sane defaults.
+     - V2_SUBMIT_BURST_SIZE (default: 16)
+     - V2_SUBMIT_BURST_PAUSE_MS (default: 50)
+     - V2_SUBMIT_BURST_JITTER_MS (default: 15)
+     """
+     burst_size = int(os.getenv("V2_SUBMIT_BURST_SIZE", "16"))
+     pause_ms = int(os.getenv("V2_SUBMIT_BURST_PAUSE_MS", "50"))
+     jitter_ms = int(os.getenv("V2_SUBMIT_BURST_JITTER_MS", "15"))
+
+     return max(1, burst_size), max(0, pause_ms), max(0, jitter_ms)
+
+
+ async def _submit_subjobs_in_bursts(
+     items: List[Tuple[str, MessageWrapper]],
+     ingest_service: "INGEST_SERVICE_T",
+     *,
+     burst_size: int,
+     pause_ms: int,
+     jitter_ms: int,
+ ) -> None:
+     """
+     Submit subjobs in sequential bursts and await each burst to completion.
+     This avoids keeping a large number of pending tasks in the REST handler
+     and allows other concurrent requests to interleave enqueue work between bursts.
+     """
+     for offset in range(0, len(items), burst_size):
+         burst = items[offset : offset + burst_size]
+         tasks = [ingest_service.submit_job(wrapper, subjob_id) for (subjob_id, wrapper) in burst]
+         # Propagate any errors from this burst
+         await asyncio.gather(*tasks)
+
+         # Pause with jitter to yield to other request handlers before next burst
+         if offset + burst_size < len(items):
+             delay_ms = pause_ms + (random.randint(0, jitter_ms) if jitter_ms > 0 else 0)
+             if delay_ms > 0:
+                 await asyncio.sleep(delay_ms / 1000.0)
+
+
  # POST /v2/submit_job
  @router.post(
      "/submit_job",
@@ -648,6 +832,8 @@ async def submit_job_v2(
      request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
  ):
      span = trace.get_current_span()
+     source_id = None
+     document_type = None
      try:
          span.add_event("Submitting file for processing (V2)")

@@ -672,28 +858,45 @@ async def submit_job_v2(
          original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
          original_source_name = source_names[0] if source_names else "unknown_source.pdf"

+         # Track page count for all PDFs (used for both splitting logic and metadata)
+         pdf_page_count_cache = None
+         submission_items: List[Tuple[str, MessageWrapper]] = []
+         subjob_ids: List[str] = []
+         subjob_descriptors: List[Dict[str, Any]] = []
+         parent_metadata: Dict[str, Any] = {}
+         submission_items: List[Tuple[str, MessageWrapper]] = []
+         try:
+             parent_uuid = uuid.UUID(parent_job_id)
+         except ValueError:
+             logger.warning(
+                 "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                 parent_job_id,
+             )
+             parent_uuid = uuid.uuid4()
          # Check if this is a PDF that needs splitting
          if document_types and payloads and document_types[0].lower() == "pdf":
              # Decode the payload to check page count
              pdf_content = base64.b64decode(payloads[0])
              page_count = get_pdf_page_count(pdf_content)
+             pdf_page_count_cache = page_count # Cache for later use
+             qos_tier = get_qos_tier_for_page_count(page_count)
              pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
+             document_type = DocumentTypeEnum.PDF

              # Split if the document has more pages than our chunk size
              if page_count > pages_per_chunk:
                  logger.warning(
-                     "Splitting PDF %s into %s-page chunks (total pages: %s)",
+                     "Splitting PDF %s into %s-page chunks (total pages: %s) -> (qos_tier: %s)",
                      original_source_name,
                      pages_per_chunk,
                      page_count,
+                     qos_tier,
                  )
-
                  chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)

                  subjob_ids: List[str] = []
                  subjob_descriptors: List[Dict[str, Any]] = []
-                 submission_tasks = []
-
+                 submission_items: List[Tuple[str, MessageWrapper]] = []
                  try:
                      parent_uuid = uuid.UUID(parent_job_id)
                  except ValueError:
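
A side note on the parent UUID namespace used above: subjob ids derived with uuid.uuid5 are deterministic, which is why the fallback to a random uuid4 only applies when the parent id is not itself a valid UUID. A minimal illustration:

    import uuid

    parent_uuid = uuid.UUID("12345678-1234-5678-1234-567812345678")
    assert uuid.uuid5(parent_uuid, "chunk-1") == uuid.uuid5(parent_uuid, "chunk-1")
    # The same parent id and chunk index always reproduce the same subjob id.
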
@@ -704,16 +907,34 @@ async def submit_job_v2(
                      parent_uuid = uuid.uuid4()

                  for chunk in chunks:
+                     start = chunk["start_page"]
+                     end = chunk["end_page"]
+                     page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
+                     source_id = f"{original_source_id}#{page_suffix}"
+                     source_name = f"{original_source_name}#{page_suffix}"
                      subjob_id, subjob_wrapper = _prepare_chunk_submission(
                          job_spec_dict,
                          chunk,
+                         document_type=DocumentTypeEnum.PDF,
                          parent_uuid=parent_uuid,
                          parent_job_id=parent_job_id,
                          current_trace_id=current_trace_id,
-                         original_source_id=original_source_id,
-                         original_source_name=original_source_name,
+                         source_id=source_id,
+                         source_name=source_name,
                      )
-                     submission_tasks.append(ingest_service.submit_job(subjob_wrapper, subjob_id))
+
+                     # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
+                     try:
+                         sub_spec = json.loads(subjob_wrapper.payload)
+                         routing_opts = sub_spec.get("routing_options") or {}
+                         routing_opts["queue_hint"] = qos_tier
+                         sub_spec["routing_options"] = routing_opts
+                         subjob_wrapper = MessageWrapper(payload=json.dumps(sub_spec))
+                     except Exception:
+                         # Best-effort; if we cannot inject, fall back to default routing
+                         pass
+
+                     submission_items.append((subjob_id, subjob_wrapper))
                      subjob_ids.append(subjob_id)
                      subjob_descriptors.append(
                          {
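
For reference, a subjob spec after the queue_hint injection above is shaped roughly like the dict below; the field values are invented and the payload content is elided:

    # Illustrative shape only; produced by _prepare_chunk_submission plus the injection above.
    sub_spec = {
        "job_id": "d8f7c2f0-0000-5000-8000-000000000000",
        "job_payload": {
            "content": ["<base64 PDF chunk>"],
            "source_id": ["report.pdf#pages_1-32"],
            "source_name": ["report.pdf#pages_1-32"],
            "document_type": ["pdf"],
        },
        "tracing_options": {"trace": True, "parent_job_id": "<parent uuid>", "page_num": 1},
        "routing_options": {"queue_hint": "large"},  # tier chosen by get_qos_tier_for_page_count
    }
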
@@ -724,36 +945,113 @@ async def submit_job_v2(
                              "page_count": chunk.get("page_count"),
                          }
                      )
+                 parent_metadata.update(
+                     {
+                         "total_pages": page_count,
+                         "pages_per_chunk": pages_per_chunk,
+                         "original_source_id": original_source_id,
+                         "original_source_name": original_source_name,
+                         "document_type": document_types[0] if document_types else "pdf",
+                         "subjob_order": subjob_ids,
+                     }
+                 )
+         elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
+             document_type = document_types[0]
+             upload_path = f"./{Path(original_source_id).name}"
+             # dump the payload to a file, just came from client
+             with fsspec.open(upload_path, "wb") as f:
+                 f.write(base64.b64decode(payloads[0]))
+             dataloader = DataLoader(
+                 path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
+             )
+             document_type = DocumentTypeEnum.MP3
+
+             parent_uuid = uuid.UUID(parent_job_id)
+             for task in job_spec_dict["tasks"]:
+                 if "task_properties" in task and "document_type" in task["task_properties"]:
+                     task["task_properties"]["document_type"] = document_type
+             end = 0
+             for idx, (file_path, duration) in enumerate(dataloader.files_completed):
+                 start = end
+                 end = int(start + duration)
+                 chunk = {
+                     "bytes": file_path.encode("utf-8"),
+                     "chunk_index": idx,
+                     "start": start,
+                     "end": end,
+                 }
+
+                 subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                     job_spec_dict,
+                     chunk,
+                     parent_uuid=parent_uuid,
+                     parent_job_id=parent_job_id,
+                     current_trace_id=current_trace_id,
+                     source_id=file_path,
+                     source_name=upload_path,
+                     document_type=document_type,
+                 )

-             if submission_tasks:
-                 await asyncio.gather(*submission_tasks)
+                 submission_items.append((subjob_id, subjob_wrapper))
+                 subjob_ids.append(subjob_id)
+                 subjob_descriptors.append(
+                     {
+                         "job_id": subjob_id,
+                         "chunk_index": idx + 1,
+                         "start_page": chunk.get("start"),
+                         "end_page": chunk.get("end"),
+                         "page_count": chunk.get("page_count", 0),
+                     }
+                 )
+             logger.error(f"Removing uploaded file {upload_path}")
+             os.remove(upload_path)
+
+         if submission_items:
+             burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
+             await _submit_subjobs_in_bursts(
+                 submission_items,
+                 ingest_service,
+                 burst_size=burst_size,
+                 pause_ms=pause_ms,
+                 jitter_ms=jitter_ms,
+             )

-             parent_metadata: Dict[str, Any] = {
-                 "total_pages": page_count,
-                 "pages_per_chunk": pages_per_chunk,
+             parent_metadata.update(
+                 {
                      "original_source_id": original_source_id,
                      "original_source_name": original_source_name,
-                 "document_type": document_types[0] if document_types else "pdf",
+                     "document_type": document_type,
                      "subjob_order": subjob_ids,
                  }
+             )
+             # raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+             await ingest_service.set_parent_job_mapping(
+                 parent_job_id,
+                 subjob_ids,
+                 parent_metadata,
+                 subjob_descriptors=subjob_descriptors,
+             )

-             await ingest_service.set_parent_job_mapping(
-                 parent_job_id,
-                 subjob_ids,
-                 parent_metadata,
-                 subjob_descriptors=subjob_descriptors,
-             )
-
-             await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+             await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)

-             span.add_event(f"Split into {len(subjob_ids)} subjobs")
-             response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
-             return parent_job_id
+             span.add_event(f"Split into {len(subjob_ids)} subjobs")
+             response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+             return parent_job_id

          # For non-PDFs or cases where splitting is not required, submit as normal
          if "tracing_options" not in job_spec_dict:
              job_spec_dict["tracing_options"] = {"trace": True}
          job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
+         # If this was a PDF and we computed page_count, route the single job using the same QoS tier
+         try:
+             if (
+                 document_types
+                 and document_types[0].lower() == "pdf"
+                 and "queue_hint" not in (job_spec_dict.get("routing_options") or {})
+             ):
+                 job_spec_dict.setdefault("routing_options", {})["queue_hint"] = qos_tier
+         except Exception:
+             pass
          updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))

          span.add_event("Submitting as single job (no split needed)")
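
The audio branch above derives each chunk's start/end from cumulative durations reported by the DataLoader; a stripped-down sketch of that bookkeeping (the file paths and durations are made up, and the duration unit is whatever the DataLoader reports):

    # Made-up (path, duration) pairs standing in for dataloader.files_completed.
    files_completed = [("audio_chunks/part0.mp3", 12.4), ("audio_chunks/part1.mp3", 9.7)]
    end = 0
    for idx, (file_path, duration) in enumerate(files_completed):
        start = end
        end = int(start + duration)
        print(idx, file_path, start, end)  # 0 ... 0 12, then 1 ... 12 21
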
@@ -762,12 +1060,40 @@ async def submit_job_v2(
          await ingest_service.submit_job(updated_job_spec, parent_job_id)
          await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)

+         # If this was a PDF (even if not split), store page count metadata for tracking
+         if pdf_page_count_cache is not None:
+             try:
+                 # Use cached page count from earlier check to avoid re-decoding
+                 # Store minimal metadata for non-split PDFs (consistent with split PDFs)
+                 single_pdf_metadata: Dict[str, Any] = {
+                     "total_pages": pdf_page_count_cache,
+                     "pages_per_chunk": pdf_page_count_cache, # Single chunk = entire document
+                     "original_source_id": original_source_id,
+                     "original_source_name": original_source_name,
+                     "document_type": document_types[0],
+                     "subjob_order": [], # No subjobs for non-split PDFs
+                 }
+
+                 # Store as parent job metadata with empty subjob list for consistency
+                 await ingest_service.set_parent_job_mapping(
+                     parent_job_id,
+                     [], # Empty subjob list
+                     single_pdf_metadata,
+                     subjob_descriptors=[],
+                 )
+                 logger.debug(
+                     f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
+                 )
+             except Exception as metadata_err:
+                 # Don't fail the job if metadata storage fails
+                 logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")
+
          response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
          return parent_job_id

      except Exception as ex:
-         logger.exception(f"Error submitting job: {str(ex)}")
-         raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+         logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
+         raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")


  # GET /v2/fetch_job
@@ -898,6 +1224,32 @@ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):

          logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")

+         # Special case: Non-split PDFs have metadata but no subjobs
+         # Fetch the result directly and augment with page count metadata
+         if len(subjob_ids) == 0:
+             logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
+             try:
+                 job_response = await ingest_service.fetch_job(job_id)
+
+                 # Augment response with page count metadata
+                 if isinstance(job_response, dict):
+                     if "metadata" not in job_response:
+                         job_response["metadata"] = {}
+                     job_response["metadata"]["total_pages"] = metadata.get("total_pages")
+                     job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
+                     job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")
+
+                 # Update job state after successful fetch
+                 await _update_job_state_after_fetch(job_id, ingest_service)
+
+                 return _stream_json_response(job_response)
+             except (TimeoutError, RedisError, ConnectionError):
+                 logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
+                 raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
+             except Exception as e:
+                 logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
+                 raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
+
          # Build ordered descriptors for subjobs
          stored_descriptors = subjob_info.get("subjob_descriptors") or []
          descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
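
From a client's point of view, the zero-subjob path above behaves like any other fetch: 202 means the job is still processing, 200 returns the augmented result. A rough polling sketch (the HTTP client and exact URL layout are assumptions, not part of the package):

    import time
    import requests  # assumed client-side dependency

    def poll_fetch(base_url: str, job_id: str, interval_s: float = 2.0):
        # Illustrative client loop against the v2 fetch endpoint; path layout assumed.
        while True:
            resp = requests.get(f"{base_url}/v2/fetch_job/{job_id}")
            if resp.status_code == 202:
                time.sleep(interval_s)  # still processing, retry later
                continue
            resp.raise_for_status()  # surfaces 500s and other errors
            return resp.json()  # for non-split PDFs, metadata now includes total_pages
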
@@ -18,6 +18,18 @@ from nv_ingest_api.util.message_brokers.simple_message_broker.broker import Simp
  logger = logging.getLogger(__name__)


+ def _broker_server_target(host, port, max_queue_size):
+     """
+     Target function to be run in a separate process for the SimpleMessageBroker.
+     """
+     server = SimpleMessageBroker(host, port, max_queue_size)
+     try:
+         server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+     except Exception:
+         pass
+     server.serve_forever()
+
+
  def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
      """
      Starts a SimpleMessageBroker server in a separate process.
@@ -58,16 +70,11 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
          f"continuing to spawn a broker process (tests expect a Process to be returned)"
      )

-     def broker_server():
-         # Optionally, set socket options here for reuse (note: binding occurs in server __init__).
-         server = SimpleMessageBroker(server_host, server_port, max_queue_size)
-         try:
-             server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-         except Exception:
-             pass
-         server.serve_forever()
-
-     p = multiprocessing.Process(target=broker_server)
+     p = multiprocessing.Process(
+         target=_broker_server_target,
+         args=(server_host, server_port, max_queue_size),
+         daemon=True,
+     )
      # If we're launching from inside the pipeline subprocess, mark daemon so the
      # broker dies automatically when the subprocess exits.
      p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
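
Moving the broker target from a nested closure to module level matters once spawn-based multiprocessing is in play (see the start-method change further down): spawn pickles the target callable, and local functions cannot be pickled. A minimal illustration of that constraint, independent of SimpleMessageBroker:

    import pickle

    def module_level_target():
        pass  # picklable: importable by its qualified name

    def make_nested_target():
        def nested_target():
            pass  # not picklable: defined inside another function
        return nested_target

    pickle.dumps(module_level_target)      # fine
    # pickle.dumps(make_nested_target())   # raises "Can't pickle local object" under spawn
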
@@ -11,9 +11,10 @@ Strategy pattern for clean separation of execution concerns.
  """

  import atexit
- import os
  import logging
  import multiprocessing
+ import os
+ import sys
  import time
  from abc import ABC, abstractmethod

@@ -132,7 +133,10 @@ class SubprocessStrategy(ProcessExecutionStrategy):
          logger.info("Launching pipeline in Python subprocess using multiprocessing.")

          # Create subprocess using fork context
-         ctx = multiprocessing.get_context("fork")
+         start_method = "fork"
+         if sys.platform.lower() == "darwin":
+             start_method = "spawn"
+         ctx = multiprocessing.get_context(start_method)
          process = ctx.Process(
              target=run_pipeline_process,
              args=(
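
A condensed sketch of the start-method selection above; fork remains the choice elsewhere, while macOS gets spawn because forking a process that already uses threads or system frameworks is unreliable there:

    import multiprocessing
    import sys

    # Mirrors the change above: prefer fork, fall back to spawn on macOS.
    start_method = "spawn" if sys.platform.lower() == "darwin" else "fork"
    ctx = multiprocessing.get_context(start_method)
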