nv-ingest 2025.10.22.dev20251022__py3-none-any.whl → 2025.11.19.dev20251119__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/v2/README.md +44 -18
- nv_ingest/api/v2/ingest.py +409 -57
- nv_ingest/framework/orchestration/process/dependent_services.py +17 -10
- nv_ingest/framework/orchestration/process/strategies.py +6 -2
- nv_ingest/framework/orchestration/process/termination.py +49 -9
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +41 -8
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +33 -11
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +2 -2
- nv_ingest/pipeline/default_pipeline_impl.py +46 -21
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/METADATA +1 -2
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/RECORD +17 -16
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/top_level.txt +0 -0
nv_ingest/api/v2/ingest.py
CHANGED
@@ -12,6 +12,9 @@ import logging
 import os
 import time
 import uuid
+import random
+from pathlib import Path
+import fsspec
 
 from fastapi import APIRouter, Request, Response
 from fastapi import HTTPException
@@ -20,6 +23,8 @@ from redis import RedisError
 
 from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
 from nv_ingest_api.util.service_clients.client_base import FetchMode
+from nv_ingest_api.util.dataloader.dataloader import DataLoader
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
 
 # For PDF splitting
 import pypdfium2 as pdfium
@@ -44,6 +49,42 @@ router = APIRouter()
 
 DEFAULT_PDF_SPLIT_PAGE_COUNT = 32
 
+# Default QoS thresholds (pages). Tunable via environment variables:
+# QOS_MAX_PAGES_MICRO, QOS_MAX_PAGES_SMALL, QOS_MAX_PAGES_MEDIUM
+_QOS_DEFAULTS = {
+    "micro": 8,
+    "small": 64,
+    "medium": 256,
+}
+
+
+def get_qos_tier_for_page_count(page_count: int) -> str:
+    """
+    Select QoS tier for a document based on its total page count.
+    Tiers: 'micro', 'small', 'medium', 'large', 'default'
+    Thresholds can be tuned via environment variables:
+    - QOS_MAX_PAGES_MICRO (default: 4)
+    - QOS_MAX_PAGES_SMALL (default: 16)
+    - QOS_MAX_PAGES_MEDIUM (default: 64)
+    Anything above MEDIUM is 'large'. Non-positive page_count returns 'default'.
+    """
+    try:
+        micro_max = int(os.getenv("QOS_MAX_PAGES_MICRO", str(_QOS_DEFAULTS["micro"])))
+        small_max = int(os.getenv("QOS_MAX_PAGES_SMALL", str(_QOS_DEFAULTS["small"])))
+        medium_max = int(os.getenv("QOS_MAX_PAGES_MEDIUM", str(_QOS_DEFAULTS["medium"])))
+    except ValueError:
+        micro_max, small_max, medium_max = _QOS_DEFAULTS["micro"], _QOS_DEFAULTS["small"], _QOS_DEFAULTS["medium"]
+
+    if page_count <= 0:
+        return "default"
+    if page_count <= micro_max:
+        return "micro"
+    if page_count <= small_max:
+        return "small"
+    if page_count <= medium_max:
+        return "medium"
+    return "large"
+
 
 def get_pdf_split_page_count(client_override: Optional[int] = None) -> int:
     """
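The QoS tier is computed at request time from environment variables, so the cut-offs can be retuned per deployment without code changes. A minimal usage sketch, assuming the wheel and the API module's dependencies are importable in your environment (the module path follows the file path `nv_ingest/api/v2/ingest.py`; the override value below is made up):

```python
import os

# Override one threshold before calling; the function re-reads the env vars on every call.
os.environ["QOS_MAX_PAGES_MEDIUM"] = "128"

from nv_ingest.api.v2.ingest import get_qos_tier_for_page_count

print(get_qos_tier_for_page_count(3))    # 'micro'   (<= 8, the shipped default)
print(get_qos_tier_for_page_count(100))  # 'medium'  (> 64 but <= 128 after the override)
print(get_qos_tier_for_page_count(500))  # 'large'
print(get_qos_tier_for_page_count(0))    # 'default' (non-positive page counts)
```

Note that the docstring in the diff still quotes older defaults (4/16/64), while the shipped `_QOS_DEFAULTS` values are 8/64/256; the comments above follow the shipped values.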
@@ -151,28 +192,42 @@ def get_pdf_page_count(pdf_content: bytes) -> int:
         return 1  # Assume single page on error
 
 
-def
+def _create_subjob_dict(
+    job_id: str,
+    job_payload: Dict[str, Any],
     job_spec_template: Dict[str, Any],
-    chunk: Dict[str, Any],
-    *,
-    parent_uuid: uuid.UUID,
-    parent_job_id: str,
     current_trace_id: int,
-
-
-) ->
-
-
-    chunk_number = chunk["chunk_index"] + 1
-    start_page = chunk["start_page"]
-    end_page = chunk["end_page"]
-
-    subjob_spec = {
+    parent_job_id: str,
+    start_key: Dict[str, Any],
+) -> Dict[str, Any]:
+    job_spec = {
         key: value
         for key, value in job_spec_template.items()
         if key not in {"job_payload", "job_id", "tracing_options"}
     }
+    job_spec["job_payload"] = job_payload
+    job_spec["job_id"] = job_id
 
+    base_tracing_options = job_spec_template.get("tracing_options") or {}
+    tracing_options = dict(base_tracing_options)
+    tracing_options.setdefault("trace", True)
+    tracing_options["trace_id"] = str(current_trace_id)
+    tracing_options["ts_send"] = int(time.time() * 1000)
+    tracing_options["parent_job_id"] = parent_job_id
+    for key, value in start_key.items():
+        tracing_options[key] = value
+
+    job_spec["tracing_options"] = tracing_options
+    return job_spec
+
+
+def _create_payload_dict(
+    job_spec_template: Dict[str, Any],
+    content: str,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Dict[str, Any]:
     subjob_payload_template = job_spec_template.get("job_payload", {})
     subjob_payload = {
         key: value
@@ -180,27 +235,40 @@ _prepare_chunk_submission(
         if key not in {"content", "source_id", "source_name"}
     }
 
-
-    subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+    subjob_payload["content"] = [content]
 
-
-    subjob_payload["
-    subjob_payload["
+    subjob_payload["source_id"] = [source_id]
+    subjob_payload["source_name"] = [source_name]
+    subjob_payload["document_type"] = [document_type]
+    return subjob_payload
+
+
+def _prepare_chunk_submission(
+    job_spec_template: Dict[str, Any],
+    chunk: Dict[str, Any],
+    *,
+    parent_uuid: uuid.UUID,
+    parent_job_id: str,
+    current_trace_id: int,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Tuple[str, MessageWrapper]:
+    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+    chunk_number = chunk["chunk_index"] + 1
 
     subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
     subjob_id = str(subjob_uuid)
-    subjob_spec["job_payload"] = subjob_payload
-    subjob_spec["job_id"] = subjob_id
 
-
-
-
-
-    tracing_options["ts_send"] = int(time.time() * 1000)
-    tracing_options["parent_job_id"] = parent_job_id
-    tracing_options["page_num"] = start_page
+    subjob_payload_template = job_spec_template.get("job_payload", {})
+    chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
+    subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
+    start = chunk["start_page"] if "start_page" in chunk else chunk["start"]
 
-    subjob_spec
+    subjob_spec = _create_subjob_dict(
+        subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
+    )
 
     return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
 
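Taken together, chunk submission is now three small pieces: `_create_payload_dict` swaps in the chunk's content and source fields, `_create_subjob_dict` copies the remaining template keys and attaches per-chunk tracing options, and `_prepare_chunk_submission` wires them together. A rough sketch of the shape of a resulting subjob spec for the second chunk of a hypothetical `report.pdf` (the field names follow the helpers above; every concrete value is illustrative, not taken from the package):

```python
# Shape of a subjob spec produced by the helpers above; all values are made up.
subjob_spec = {
    # Every top-level key of the parent spec except job_payload/job_id/tracing_options
    # is copied through unchanged (e.g. "tasks").
    "tasks": ["<tasks copied verbatim from the parent job spec>"],
    "job_id": "<uuid5(parent_uuid, 'chunk-2')>",
    "job_payload": {
        # Payload keys other than content/source_id/source_name are kept,
        # then the chunk-specific values are substituted in.
        "content": ["<base64 of pages 33-64>"],
        "source_id": ["report.pdf#pages_33-64"],
        "source_name": ["report.pdf#pages_33-64"],
        "document_type": ["pdf"],
    },
    "tracing_options": {
        "trace": True,
        "trace_id": "<parent trace id>",
        "ts_send": 1700000000000,          # int(time.time() * 1000) at submission
        "parent_job_id": "<parent job id>",
        "page_num": 33,                    # chunk's starting page, carried via start_key
    },
}
```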
@@ -432,6 +500,76 @@ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, A
     return trace_dict, annotations_dict
 
 
+def _normalize_chunk_records(
+    records: Optional[List[Any]],
+    descriptor: Dict[str, Any],
+    parent_metadata: Dict[str, Any],
+) -> List[Any]:
+    """Re-map chunk-local metadata to document-level context for aggregation."""
+
+    if not isinstance(records, list):
+        return []
+
+    total_pages = parent_metadata.get("total_pages")
+    original_source_id = parent_metadata.get("original_source_id")
+    original_source_name = parent_metadata.get("original_source_name")
+
+    start_page = descriptor.get("start_page")
+    page_offset = start_page - 1 if isinstance(start_page, int) and start_page > 0 else 0
+
+    normalized_entries: List[Any] = []
+
+    for entry in records:
+        if not isinstance(entry, dict):
+            normalized_entries.append(entry)
+            continue
+
+        normalized_entry = entry.copy()
+        original_metadata = entry.get("metadata")
+
+        if isinstance(original_metadata, dict):
+            normalized_metadata = original_metadata.copy()
+            normalized_entry["metadata"] = normalized_metadata
+
+            original_source_meta = original_metadata.get("source_metadata")
+            if isinstance(original_source_meta, dict):
+                normalized_source_meta = original_source_meta.copy()
+                normalized_metadata["source_metadata"] = normalized_source_meta
+
+                if original_source_id:
+                    normalized_source_meta["source_id"] = original_source_id
+                if original_source_name:
+                    normalized_source_meta["source_name"] = original_source_name
+
+            original_content_meta = original_metadata.get("content_metadata")
+            if isinstance(original_content_meta, dict):
+                normalized_content_meta = original_content_meta.copy()
+                normalized_metadata["content_metadata"] = normalized_content_meta
+
+                page_number = normalized_content_meta.get("page_number")
+                if isinstance(page_number, int) and page_number >= 0:
+                    normalized_content_meta["page_number"] = page_number + page_offset
+
+                if isinstance(total_pages, int) and isinstance(normalized_content_meta.get("page_count"), int):
+                    # Ensure optional per-record page count reflects the full document
+                    normalized_content_meta["page_count"] = total_pages
+
+                original_hierarchy = original_content_meta.get("hierarchy")
+                if isinstance(original_hierarchy, dict):
+                    normalized_hierarchy = original_hierarchy.copy()
+                    normalized_content_meta["hierarchy"] = normalized_hierarchy
+
+                    hierarchy_page = normalized_hierarchy.get("page")
+                    if isinstance(hierarchy_page, int) and hierarchy_page >= 0:
+                        normalized_hierarchy["page"] = hierarchy_page + page_offset
+                    if isinstance(total_pages, int):
+                        normalized_hierarchy["page_count"] = total_pages
+
+        normalized_entries.append(normalized_entry)
+
+    return normalized_entries
+
+
 def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
     """
     Aggregate chunk-level traces into parent-level metrics.
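The normalization is mostly offset arithmetic: each chunk reports chunk-local page numbers, and the parent descriptor says where the chunk starts in the full document, so every page-level field is shifted by `start_page - 1` and the synthetic per-chunk source id/name are rewritten back to the originals. A tiny worked illustration (all values made up):

```python
# Illustration of the offset math in _normalize_chunk_records above.
descriptor = {"start_page": 33, "end_page": 64}            # chunk 2 of a 100-page PDF
parent_metadata = {
    "total_pages": 100,
    "original_source_id": "report.pdf",
    "original_source_name": "report.pdf",
}

page_offset = descriptor["start_page"] - 1                 # 32

# A record extracted from chunk-local page 2 is re-attributed to document page 34,
# its page_count/hierarchy page_count become 100, and its source fields are rewritten
# from "report.pdf#pages_33-64" back to "report.pdf".
chunk_local_page = 2
document_page = chunk_local_page + page_offset
print(document_page)                                       # 34
```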
@@ -574,7 +712,8 @@ def _build_aggregated_response(
         if result is not None:
             # Add page data to aggregated result
             if "data" in result:
-
+                normalized_records = _normalize_chunk_records(result.get("data"), descriptor, metadata)
+                aggregated_result["data"].extend(normalized_records)
             chunk_entry = dict(descriptor)
             aggregated_result["metadata"]["chunks"].append(chunk_entry)
 
@@ -631,6 +770,51 @@ def _build_aggregated_response(
     return aggregated_result
 
 
+# ---------------------------------------------------------------------------
+# Bursty submission helpers (fairness without long-lived in-flight tasks)
+# ---------------------------------------------------------------------------
+
+
+def _get_submit_burst_params() -> Tuple[int, int, int]:
+    """
+    Returns (burst_size, pause_ms, jitter_ms) from environment with sane defaults.
+    - V2_SUBMIT_BURST_SIZE (default: 16)
+    - V2_SUBMIT_BURST_PAUSE_MS (default: 25)
+    - V2_SUBMIT_BURST_JITTER_MS (default: 10)
+    """
+    burst_size = int(os.getenv("V2_SUBMIT_BURST_SIZE", "16"))
+    pause_ms = int(os.getenv("V2_SUBMIT_BURST_PAUSE_MS", "50"))
+    jitter_ms = int(os.getenv("V2_SUBMIT_BURST_JITTER_MS", "15"))
+
+    return max(1, burst_size), max(0, pause_ms), max(0, jitter_ms)
+
+
+async def _submit_subjobs_in_bursts(
+    items: List[Tuple[str, MessageWrapper]],
+    ingest_service: "INGEST_SERVICE_T",
+    *,
+    burst_size: int,
+    pause_ms: int,
+    jitter_ms: int,
+) -> None:
+    """
+    Submit subjobs in sequential bursts and await each burst to completion.
+    This avoids keeping a large number of pending tasks in the REST handler
+    and allows other concurrent requests to interleave enqueue work between bursts.
+    """
+    for offset in range(0, len(items), burst_size):
+        burst = items[offset : offset + burst_size]
+        tasks = [ingest_service.submit_job(wrapper, subjob_id) for (subjob_id, wrapper) in burst]
+        # Propagate any errors from this burst
+        await asyncio.gather(*tasks)
+
+        # Pause with jitter to yield to other request handlers before next burst
+        if offset + burst_size < len(items):
+            delay_ms = pause_ms + (random.randint(0, jitter_ms) if jitter_ms > 0 else 0)
+            if delay_ms > 0:
+                await asyncio.sleep(delay_ms / 1000.0)
+
+
 # POST /v2/submit_job
 @router.post(
     "/submit_job",
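The burst helpers bound how long a single large document can monopolize the enqueue path: subjobs go out in groups of `V2_SUBMIT_BURST_SIZE`, and between groups the handler sleeps for a pause plus jitter so other requests can interleave. Note the docstring quotes 25 ms / 10 ms while the code falls back to 50 ms / 15 ms; the back-of-envelope sketch below uses the code's fallbacks (the helper name and numbers here are illustrative only):

```python
import math


def burst_schedule(num_subjobs: int, burst_size: int = 16, pause_ms: int = 50, jitter_ms: int = 15):
    """Rough worst-case view of the bursty submission pattern under the code's fallbacks."""
    bursts = math.ceil(num_subjobs / burst_size)
    pauses = max(0, bursts - 1)                      # no pause after the final burst
    worst_case_delay_ms = pauses * (pause_ms + jitter_ms)
    return bursts, worst_case_delay_ms


print(burst_schedule(100))   # (7, 390): 7 bursts, at most ~0.39 s of deliberate pauses
```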
@@ -648,6 +832,8 @@ async def submit_job_v2(
     request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
 ):
     span = trace.get_current_span()
+    source_id = None
+    document_type = None
     try:
         span.add_event("Submitting file for processing (V2)")
 
@@ -672,28 +858,45 @@ async def submit_job_v2(
         original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
         original_source_name = source_names[0] if source_names else "unknown_source.pdf"
 
+        # Track page count for all PDFs (used for both splitting logic and metadata)
+        pdf_page_count_cache = None
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        subjob_ids: List[str] = []
+        subjob_descriptors: List[Dict[str, Any]] = []
+        parent_metadata: Dict[str, Any] = {}
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        try:
+            parent_uuid = uuid.UUID(parent_job_id)
+        except ValueError:
+            logger.warning(
+                "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                parent_job_id,
+            )
+            parent_uuid = uuid.uuid4()
         # Check if this is a PDF that needs splitting
         if document_types and payloads and document_types[0].lower() == "pdf":
             # Decode the payload to check page count
             pdf_content = base64.b64decode(payloads[0])
             page_count = get_pdf_page_count(pdf_content)
+            pdf_page_count_cache = page_count  # Cache for later use
+            qos_tier = get_qos_tier_for_page_count(page_count)
             pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
+            document_type = DocumentTypeEnum.PDF
 
             # Split if the document has more pages than our chunk size
             if page_count > pages_per_chunk:
                 logger.warning(
-                    "Splitting PDF %s into %s-page chunks (total pages: %s)",
+                    "Splitting PDF %s into %s-page chunks (total pages: %s) -> (qos_tier: %s)",
                     original_source_name,
                     pages_per_chunk,
                     page_count,
+                    qos_tier,
                 )
-
                 chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
 
                 subjob_ids: List[str] = []
                 subjob_descriptors: List[Dict[str, Any]] = []
-
-
+                submission_items: List[Tuple[str, MessageWrapper]] = []
                 try:
                     parent_uuid = uuid.UUID(parent_job_id)
                 except ValueError:
@@ -704,16 +907,34 @@ async def submit_job_v2(
                     parent_uuid = uuid.uuid4()
 
                 for chunk in chunks:
+                    start = chunk["start_page"]
+                    end = chunk["end_page"]
+                    page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
+                    source_id = f"{original_source_id}#{page_suffix}"
+                    source_name = f"{original_source_name}#{page_suffix}"
                     subjob_id, subjob_wrapper = _prepare_chunk_submission(
                         job_spec_dict,
                         chunk,
+                        document_type=DocumentTypeEnum.PDF,
                         parent_uuid=parent_uuid,
                         parent_job_id=parent_job_id,
                         current_trace_id=current_trace_id,
-
-
+                        source_id=source_id,
+                        source_name=source_name,
                     )
-
+
+                    # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
+                    try:
+                        sub_spec = json.loads(subjob_wrapper.payload)
+                        routing_opts = sub_spec.get("routing_options") or {}
+                        routing_opts["queue_hint"] = qos_tier
+                        sub_spec["routing_options"] = routing_opts
+                        subjob_wrapper = MessageWrapper(payload=json.dumps(sub_spec))
+                    except Exception:
+                        # Best-effort; if we cannot inject, fall back to default routing
+                        pass
+
+                    submission_items.append((subjob_id, subjob_wrapper))
                     subjob_ids.append(subjob_id)
                     subjob_descriptors.append(
                         {
@@ -724,36 +945,113 @@ async def submit_job_v2(
                             "page_count": chunk.get("page_count"),
                         }
                     )
+                parent_metadata.update(
+                    {
+                        "total_pages": page_count,
+                        "pages_per_chunk": pages_per_chunk,
+                        "original_source_id": original_source_id,
+                        "original_source_name": original_source_name,
+                        "document_type": document_types[0] if document_types else "pdf",
+                        "subjob_order": subjob_ids,
+                    }
+                )
+        elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
+            document_type = document_types[0]
+            upload_path = f"./{Path(original_source_id).name}"
+            # dump the payload to a file, just came from client
+            with fsspec.open(upload_path, "wb") as f:
+                f.write(base64.b64decode(payloads[0]))
+            dataloader = DataLoader(
+                path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
+            )
+            document_type = DocumentTypeEnum.MP3
+
+            parent_uuid = uuid.UUID(parent_job_id)
+            for task in job_spec_dict["tasks"]:
+                if "task_properties" in task and "document_type" in task["task_properties"]:
+                    task["task_properties"]["document_type"] = document_type
+            end = 0
+            for idx, (file_path, duration) in enumerate(dataloader.files_completed):
+                start = end
+                end = int(start + duration)
+                chunk = {
+                    "bytes": file_path.encode("utf-8"),
+                    "chunk_index": idx,
+                    "start": start,
+                    "end": end,
+                }
+
+                subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                    job_spec_dict,
+                    chunk,
+                    parent_uuid=parent_uuid,
+                    parent_job_id=parent_job_id,
+                    current_trace_id=current_trace_id,
+                    source_id=file_path,
+                    source_name=upload_path,
+                    document_type=document_type,
+                )
 
-
-
+                submission_items.append((subjob_id, subjob_wrapper))
+                subjob_ids.append(subjob_id)
+                subjob_descriptors.append(
+                    {
+                        "job_id": subjob_id,
+                        "chunk_index": idx + 1,
+                        "start_page": chunk.get("start"),
+                        "end_page": chunk.get("end"),
+                        "page_count": chunk.get("page_count", 0),
+                    }
+                )
+            logger.error(f"Removing uploaded file {upload_path}")
+            os.remove(upload_path)
+
+        if submission_items:
+            burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
+            await _submit_subjobs_in_bursts(
+                submission_items,
+                ingest_service,
+                burst_size=burst_size,
+                pause_ms=pause_ms,
+                jitter_ms=jitter_ms,
+            )
 
-
-
-                    "pages_per_chunk": pages_per_chunk,
+            parent_metadata.update(
+                {
                     "original_source_id": original_source_id,
                     "original_source_name": original_source_name,
-                    "document_type":
+                    "document_type": document_type,
                     "subjob_order": subjob_ids,
                 }
+            )
+            # raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+            await ingest_service.set_parent_job_mapping(
+                parent_job_id,
+                subjob_ids,
+                parent_metadata,
+                subjob_descriptors=subjob_descriptors,
+            )
 
-
-                    parent_job_id,
-                    subjob_ids,
-                    parent_metadata,
-                    subjob_descriptors=subjob_descriptors,
-                )
-
-                await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+            await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
 
-
-
-
+            span.add_event(f"Split into {len(subjob_ids)} subjobs")
+            response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+            return parent_job_id
 
         # For non-PDFs or cases where splitting is not required, submit as normal
         if "tracing_options" not in job_spec_dict:
            job_spec_dict["tracing_options"] = {"trace": True}
         job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
+        # If this was a PDF and we computed page_count, route the single job using the same QoS tier
+        try:
+            if (
+                document_types
+                and document_types[0].lower() == "pdf"
+                and "queue_hint" not in (job_spec_dict.get("routing_options") or {})
+            ):
+                job_spec_dict.setdefault("routing_options", {})["queue_hint"] = qos_tier
+        except Exception:
+            pass
         updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))
 
         span.add_event("Submitting as single job (no split needed)")
@@ -762,12 +1060,40 @@ async def submit_job_v2(
         await ingest_service.submit_job(updated_job_spec, parent_job_id)
         await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
 
+        # If this was a PDF (even if not split), store page count metadata for tracking
+        if pdf_page_count_cache is not None:
+            try:
+                # Use cached page count from earlier check to avoid re-decoding
+                # Store minimal metadata for non-split PDFs (consistent with split PDFs)
+                single_pdf_metadata: Dict[str, Any] = {
+                    "total_pages": pdf_page_count_cache,
+                    "pages_per_chunk": pdf_page_count_cache,  # Single chunk = entire document
+                    "original_source_id": original_source_id,
+                    "original_source_name": original_source_name,
+                    "document_type": document_types[0],
+                    "subjob_order": [],  # No subjobs for non-split PDFs
+                }
+
+                # Store as parent job metadata with empty subjob list for consistency
+                await ingest_service.set_parent_job_mapping(
+                    parent_job_id,
+                    [],  # Empty subjob list
+                    single_pdf_metadata,
+                    subjob_descriptors=[],
+                )
+                logger.debug(
+                    f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
+                )
+            except Exception as metadata_err:
+                # Don't fail the job if metadata storage fails
+                logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")
+
         response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
         return parent_job_id
 
     except Exception as ex:
-        logger.exception(f"Error submitting job: {str(ex)}")
-        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+        logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
+        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")
 
 
 # GET /v2/fetch_job
@@ -898,6 +1224,32 @@ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):
 
         logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")
 
+        # Special case: Non-split PDFs have metadata but no subjobs
+        # Fetch the result directly and augment with page count metadata
+        if len(subjob_ids) == 0:
+            logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
+            try:
+                job_response = await ingest_service.fetch_job(job_id)
+
+                # Augment response with page count metadata
+                if isinstance(job_response, dict):
+                    if "metadata" not in job_response:
+                        job_response["metadata"] = {}
+                    job_response["metadata"]["total_pages"] = metadata.get("total_pages")
+                    job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
+                    job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")
+
+                # Update job state after successful fetch
+                await _update_job_state_after_fetch(job_id, ingest_service)
+
+                return _stream_json_response(job_response)
+            except (TimeoutError, RedisError, ConnectionError):
+                logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
+                raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
+            except Exception as e:
+                logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
+                raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
+
         # Build ordered descriptors for subjobs
         stored_descriptors = subjob_info.get("subjob_descriptors") or []
         descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
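With this special case, a parent job whose subjob list is empty (a PDF that fit in one chunk) is fetched directly and its response is annotated with `total_pages` and the original source fields, while a 202 still means "retry later". A rough client-side polling sketch; the base URL, port, and exact route shape are assumptions based on the `# GET /v2/fetch_job` comment above, not taken from the package:

```python
import time

import requests  # any HTTP client works; requests is assumed here for brevity


def fetch_v2_result(job_id: str, base_url: str = "http://localhost:7670", timeout_s: float = 300.0) -> dict:
    """Poll the v2 fetch endpoint until the aggregated result is ready (HTTP 200)."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        resp = requests.get(f"{base_url}/v2/fetch_job/{job_id}", timeout=30)
        if resp.status_code == 200:
            body = resp.json()
            # Non-split PDFs now carry document-level metadata injected by the handler.
            print(body.get("metadata", {}).get("total_pages"))
            return body
        if resp.status_code == 202:
            time.sleep(2)  # job still processing; retry later
            continue
        resp.raise_for_status()
    raise TimeoutError(f"Job {job_id} not ready within {timeout_s}s")
```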
nv_ingest/framework/orchestration/process/dependent_services.py
CHANGED

@@ -18,6 +18,18 @@ from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
 logger = logging.getLogger(__name__)
 
 
+def _broker_server_target(host, port, max_queue_size):
+    """
+    Target function to be run in a separate process for the SimpleMessageBroker.
+    """
+    server = SimpleMessageBroker(host, port, max_queue_size)
+    try:
+        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    except Exception:
+        pass
+    server.serve_forever()
+
+
 def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
     """
     Starts a SimpleMessageBroker server in a separate process.
@@ -58,16 +70,11 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
             f"continuing to spawn a broker process (tests expect a Process to be returned)"
         )
 
-
-
-
-
-
-        except Exception:
-            pass
-        server.serve_forever()
-
-    p = multiprocessing.Process(target=broker_server)
+    p = multiprocessing.Process(
+        target=_broker_server_target,
+        args=(server_host, server_port, max_queue_size),
+        daemon=True,
+    )
     # If we're launching from inside the pipeline subprocess, mark daemon so the
     # broker dies automatically when the subprocess exits.
     p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
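The broker refactor replaces a closure defined inside `start_simple_message_broker` with the module-level `_broker_server_target`, passing host, port, and queue size as plain arguments. That matters because only picklable targets survive the `spawn` start method; a nested function works only under `fork`. A minimal sketch of the same pattern (names and port are illustrative, not from the package):

```python
import multiprocessing


def _worker(host: str, port: int) -> None:
    # Module-level target with plain arguments: picklable, so it works under
    # both the "fork" and "spawn" start methods.
    print(f"serving on {host}:{port}")


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    p = ctx.Process(target=_worker, args=("127.0.0.1", 7671), daemon=True)
    p.start()
    p.join()
```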
nv_ingest/framework/orchestration/process/strategies.py
CHANGED

@@ -11,9 +11,10 @@ Strategy pattern for clean separation of execution concerns.
 """
 
 import atexit
-import os
 import logging
 import multiprocessing
+import os
+import sys
 import time
 from abc import ABC, abstractmethod
 
@@ -132,7 +133,10 @@ class SubprocessStrategy(ProcessExecutionStrategy):
         logger.info("Launching pipeline in Python subprocess using multiprocessing.")
 
         # Create subprocess using fork context
-
+        start_method = "fork"
+        if sys.platform.lower() == "darwin":
+            start_method = "spawn"
+        ctx = multiprocessing.get_context(start_method)
         process = ctx.Process(
             target=run_pipeline_process,
             args=(