nv-ingest 2025.10.8.dev20251008__py3-none-any.whl → 2025.10.10.dev20251010__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic.

@@ -0,0 +1,816 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ # pylint: skip-file
+
+ import asyncio
+ from io import BytesIO
+ from typing import Any, Dict, List, Optional, Tuple
+ import base64
+ import json
+ import logging
+ import os
+ import time
+ import uuid
+
+ from fastapi import APIRouter, Request, Response
+ from fastapi import HTTPException
+ from fastapi.responses import StreamingResponse
+ from redis import RedisError
+
+ from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
+ from nv_ingest_api.util.service_clients.client_base import FetchMode
+
+ # For PDF splitting
+ import pypdfium2 as pdfium
+
+ # Reuse V1 state constants and helper functions
+ from ..v1.ingest import (
+     trace_id_to_uuid,
+     INGEST_SERVICE_T,
+     STATE_RETRIEVED_DESTRUCTIVE,
+     STATE_RETRIEVED_NON_DESTRUCTIVE,
+     STATE_RETRIEVED_CACHED,
+     STATE_FAILED,
+     STATE_SUBMITTED,
+     INTERMEDIATE_STATES,
+ )
+ from .. import traced_endpoint
+ from opentelemetry import trace
+
+ logger = logging.getLogger("uvicorn")
+
+ router = APIRouter()
+
+ DEFAULT_PDF_SPLIT_PAGE_COUNT = 32
+
+
+ def get_pdf_split_page_count() -> int:
+     """Resolve the configured page chunk size for PDF splitting."""
+
+     raw_value = os.environ.get("PDF_SPLIT_PAGE_COUNT")
+     if raw_value is None:
+         return DEFAULT_PDF_SPLIT_PAGE_COUNT
+
+     try:
+         parsed = int(raw_value)
+     except ValueError:
+         logger.warning(
+             "Invalid PDF_SPLIT_PAGE_COUNT '%s'; falling back to default %s", raw_value, DEFAULT_PDF_SPLIT_PAGE_COUNT
+         )
+         return DEFAULT_PDF_SPLIT_PAGE_COUNT
+
+     if parsed <= 0:
+         logger.warning("PDF_SPLIT_PAGE_COUNT must be >= 1; received %s. Using 1.", parsed)
+         return 1
+
+     return parsed
+
+
70
+ def split_pdf_to_chunks(pdf_content: bytes, pages_per_chunk: int) -> List[Dict[str, Any]]:
71
+ """
72
+ Split a PDF into multi-page chunks using pypdfium2.
73
+
74
+ Returns a list of dictionaries containing the chunk bytes and page range metadata.
75
+ Note: this currently buffers each chunk in-memory; consider streaming in future upgrades.
76
+ """
77
+
78
+ chunks: List[Dict[str, Any]] = []
79
+
80
+ if pages_per_chunk <= 0:
81
+ pages_per_chunk = 1
82
+
83
+ pdf = pdfium.PdfDocument(pdf_content)
84
+ total_pages = len(pdf)
85
+
86
+ try:
87
+ for chunk_index, start_zero in enumerate(range(0, total_pages, pages_per_chunk)):
88
+ end_zero = min(start_zero + pages_per_chunk, total_pages)
89
+ page_indices = list(range(start_zero, end_zero))
90
+
91
+ new_pdf = pdfium.PdfDocument.new()
92
+ try:
93
+ new_pdf.import_pages(pdf, page_indices)
94
+
95
+ buffer = BytesIO()
96
+ try:
97
+ new_pdf.save(buffer)
98
+ chunk_bytes = buffer.getvalue()
99
+ finally:
100
+ buffer.close()
101
+ finally:
102
+ new_pdf.close()
103
+
104
+ start_page = start_zero + 1
105
+ end_page = end_zero
106
+ chunk_info: Dict[str, Any] = {
107
+ "bytes": chunk_bytes,
108
+ "chunk_index": chunk_index,
109
+ "start_page": start_page,
110
+ "end_page": end_page,
111
+ "page_count": end_page - start_page + 1,
112
+ }
113
+ chunks.append(chunk_info)
114
+
115
+ finally:
116
+ pdf.close()
117
+
118
+ return chunks
119
+
120
+
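The chunk boundaries above come from simple arithmetic over zero-based page indices. As a quick orientation aid (an editorial sketch, not part of the diff or the package), the same page-range bookkeeping can be reproduced without pypdfium2:

# Illustrative sketch only: mirrors the page-range math in split_pdf_to_chunks.
def chunk_ranges(total_pages: int, pages_per_chunk: int = 32):  # 32 == DEFAULT_PDF_SPLIT_PAGE_COUNT
    """Yield 1-based page ranges the same way split_pdf_to_chunks does."""
    for chunk_index, start_zero in enumerate(range(0, total_pages, pages_per_chunk)):
        end_zero = min(start_zero + pages_per_chunk, total_pages)
        yield {"chunk_index": chunk_index, "start_page": start_zero + 1, "end_page": end_zero}

# A 70-page PDF with the default chunk size splits into pages 1-32, 33-64, and 65-70.
print(list(chunk_ranges(70)))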
+ def get_pdf_page_count(pdf_content: bytes) -> int:
+     """Get the number of pages in a PDF using pypdfium2."""
+     try:
+         pdf = pdfium.PdfDocument(pdf_content)
+         page_count = len(pdf)
+         pdf.close()
+         return page_count
+     except Exception as e:
+         logger.warning(f"Failed to get PDF page count: {e}")
+         return 1  # Assume single page on error
+
+
+ def _prepare_chunk_submission(
+     job_spec_template: Dict[str, Any],
+     chunk: Dict[str, Any],
+     *,
+     parent_uuid: uuid.UUID,
+     parent_job_id: str,
+     current_trace_id: int,
+     original_source_id: str,
+     original_source_name: str,
+ ) -> Tuple[str, MessageWrapper]:
+     """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+     chunk_number = chunk["chunk_index"] + 1
+     start_page = chunk["start_page"]
+     end_page = chunk["end_page"]
+
+     subjob_spec = {
+         key: value
+         for key, value in job_spec_template.items()
+         if key not in {"job_payload", "job_id", "tracing_options"}
+     }
+
+     subjob_payload_template = job_spec_template.get("job_payload", {})
+     subjob_payload = {
+         key: value
+         for key, value in subjob_payload_template.items()
+         if key not in {"content", "source_id", "source_name"}
+     }
+
+     chunk_bytes = chunk["bytes"]
+     subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+
+     page_suffix = f"page_{start_page}" if start_page == end_page else f"pages_{start_page}-{end_page}"
+     subjob_payload["source_id"] = [f"{original_source_id}#{page_suffix}"]
+     subjob_payload["source_name"] = [f"{original_source_name}#{page_suffix}"]
+
+     subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
+     subjob_id = str(subjob_uuid)
+     subjob_spec["job_payload"] = subjob_payload
+     subjob_spec["job_id"] = subjob_id
+
+     base_tracing_options = job_spec_template.get("tracing_options") or {}
+     tracing_options = dict(base_tracing_options)
+     tracing_options.setdefault("trace", True)
+     tracing_options["trace_id"] = str(current_trace_id)
+     tracing_options["ts_send"] = int(time.time() * 1000)
+     tracing_options["parent_job_id"] = parent_job_id
+     tracing_options["page_num"] = start_page
+
+     subjob_spec["tracing_options"] = tracing_options
+
+     return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
+
+
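Note that subjob identifiers are derived deterministically: uuid.uuid5 over the parent job's UUID and a 1-based "chunk-N" name, so the same parent and chunk number always map to the same subjob id. A self-contained sketch (the parent UUID below is invented for illustration; in the endpoint it comes from trace_id_to_uuid):

import uuid

# Hypothetical parent id, purely for demonstration.
parent_uuid = uuid.UUID("12345678-1234-5678-1234-567812345678")

# chunk_number is chunk["chunk_index"] + 1, i.e. 1-based, as in _prepare_chunk_submission.
subjob_ids = [str(uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")) for chunk_number in range(1, 4)]

# Re-deriving with the same inputs yields identical ids.
assert subjob_ids == [str(uuid.uuid5(parent_uuid, f"chunk-{n}")) for n in range(1, 4)]
print(subjob_ids)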
+ # ============================================================================
+ # Helper Functions for Fetch Job Aggregation
+ # ============================================================================
+
+
+ async def _gather_in_batches(coroutines: List, batch_size: int, return_exceptions: bool = False) -> List[Any]:
+     """
+     Execute coroutines in batches to respect concurrency limits.
+
+     Parameters
+     ----------
+     coroutines : List
+         List of coroutines to execute
+     batch_size : int
+         Maximum number of coroutines to execute concurrently
+     return_exceptions : bool
+         Whether to return exceptions as results (passed to asyncio.gather)
+
+     Returns
+     -------
+     List[Any]
+         Results from all coroutines in original order
+     """
+     results: List[Any] = []
+     for offset in range(0, len(coroutines), batch_size):
+         batch = coroutines[offset : offset + batch_size]
+         batch_results = await asyncio.gather(*batch, return_exceptions=return_exceptions)
+         results.extend(batch_results)
+     return results
+
+
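The helper simply slices the coroutine list and awaits asyncio.gather per slice, so results keep submission order while at most batch_size calls run concurrently. A toy usage sketch with dummy coroutines (not the ingest service):

import asyncio

async def _gather_in_batches(coroutines, batch_size, return_exceptions=False):
    # Same slicing strategy as the helper above.
    results = []
    for offset in range(0, len(coroutines), batch_size):
        batch = coroutines[offset : offset + batch_size]
        results.extend(await asyncio.gather(*batch, return_exceptions=return_exceptions))
    return results

async def main():
    async def fake_fetch(i):  # stand-in for ingest_service.fetch_job(...)
        await asyncio.sleep(0)
        return i

    # At most 4 coroutines in flight at once; output order matches input order.
    print(await _gather_in_batches([fake_fetch(i) for i in range(10)], batch_size=4))

asyncio.run(main())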
+ async def _update_job_state_after_fetch(job_id: str, ingest_service: INGEST_SERVICE_T) -> None:
+     """
+     Update job state after successful fetch based on configured fetch mode.
+
+     Parameters
+     ----------
+     job_id : str
+         The job identifier
+     ingest_service : IngestServiceMeta
+         The ingest service instance
+     """
+     try:
+         current_fetch_mode = await ingest_service.get_fetch_mode()
+         if current_fetch_mode == FetchMode.DESTRUCTIVE:
+             target_state = STATE_RETRIEVED_DESTRUCTIVE
+         elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
+             target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
+         else:
+             target_state = STATE_RETRIEVED_CACHED
+
+         await ingest_service.set_job_state(job_id, target_state)
+         logger.debug(f"Updated job {job_id} state to {target_state}")
+     except Exception as e:
+         logger.error(f"Failed to update job state for {job_id}: {e}")
+
+
+ def _stream_json_response(data: Dict[str, Any]) -> StreamingResponse:
+     """
+     Create a StreamingResponse for JSON data.
+
+     Parameters
+     ----------
+     data : Dict[str, Any]
+         The data to serialize and stream
+
+     Returns
+     -------
+     StreamingResponse
+         FastAPI streaming response with JSON content
+     """
+     json_bytes = json.dumps(data).encode("utf-8")
+     return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
+
+
+ async def _check_all_subjob_states(
+     ordered_descriptors: List[Dict[str, Any]], max_parallel_ops: int, ingest_service: INGEST_SERVICE_T
+ ) -> Tuple[List[Optional[str]], List[Dict[str, object]]]:
+     """
+     Check the state of all subjobs in parallel batches.
+
+     Parameters
+     ----------
+     ordered_descriptors : List[Dict[str, Any]]
+         List of subjob descriptors with job_id and chunk_index
+     max_parallel_ops : int
+         Maximum number of parallel operations
+     ingest_service : IngestServiceMeta
+         The ingest service instance
+
+     Returns
+     -------
+     Tuple[List[Optional[str]], List[Dict[str, object]]]
+         Tuple of (subjob_states, failed_subjobs_list)
+
+     Raises
+     ------
+     HTTPException
+         If any subjob is still processing (202)
+     """
+     # Gather all subjob states in parallel batches
+     state_coroutines = [ingest_service.get_job_state(descriptor.get("job_id")) for descriptor in ordered_descriptors]
+     subjob_states = await _gather_in_batches(state_coroutines, max_parallel_ops)
+
+     # Check for failures and pending work
+     failed_subjobs: List[Dict[str, object]] = []
+
+     for page_index, (descriptor, subjob_state) in enumerate(zip(ordered_descriptors, subjob_states), start=1):
+         subjob_id = descriptor.get("job_id")
+
+         if subjob_state == STATE_FAILED:
+             logger.warning(f"Subjob {subjob_id} failed")
+             failed_subjobs.append({"subjob_id": subjob_id, "chunk_index": page_index})
+         elif subjob_state in INTERMEDIATE_STATES:
+             raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")
+
+     return subjob_states, failed_subjobs
+
+
+ async def _fetch_all_subjob_results(
+     ordered_descriptors: List[Dict[str, Any]],
+     subjob_states: List[Optional[str]],
+     failed_subjobs: List[Dict[str, object]],
+     max_parallel_ops: int,
+     ingest_service: INGEST_SERVICE_T,
+ ) -> List[Optional[Dict[str, Any]]]:
+     """
+     Fetch results for all completed subjobs in parallel batches.
+
+     Parameters
+     ----------
+     ordered_descriptors : List[Dict[str, Any]]
+         List of subjob descriptors
+     subjob_states : List[Optional[str]]
+         States of all subjobs (from _check_all_subjob_states)
+     failed_subjobs : List[Dict[str, object]]
+         List to append failed fetch attempts to (modified in place)
+     max_parallel_ops : int
+         Maximum number of parallel operations
+     ingest_service : IngestServiceMeta
+         The ingest service instance
+
+     Returns
+     -------
+     List[Optional[Dict[str, Any]]]
+         Results for each subjob (None for failed ones)
+
+     Raises
+     ------
+     HTTPException
+         If any subjob is not ready yet (202)
+     """
+     # Initialize results array with None placeholders
+     subjob_results: List[Optional[Dict[str, Any]]] = [None] * len(ordered_descriptors)
+
+     # Build list of fetch tasks (only for non-failed subjobs)
+     fetch_coroutines = []
+     fetch_targets: List[Dict[str, Any]] = []
+
+     for list_index, (page_index, descriptor, subjob_state) in enumerate(
+         zip(range(1, len(ordered_descriptors) + 1), ordered_descriptors, subjob_states)
+     ):
+         subjob_id = descriptor.get("job_id")
+
+         # Skip failed subjobs (already recorded in failed_subjobs)
+         if subjob_state == STATE_FAILED:
+             continue
+
+         # Skip intermediate states (should have been caught earlier, but defensive)
+         if subjob_state in INTERMEDIATE_STATES:
+             continue
+
+         # Queue this subjob for fetching
+         fetch_coroutines.append(ingest_service.fetch_job(subjob_id))
+         fetch_targets.append(
+             {
+                 "list_index": list_index,
+                 "page_index": page_index,
+                 "subjob_id": subjob_id,
+             }
+         )
+
+     # Fetch all results in parallel batches
+     if fetch_coroutines:
+         fetch_results = await _gather_in_batches(fetch_coroutines, max_parallel_ops, return_exceptions=True)
+
+         # Process results and handle errors
+         for target, fetch_result in zip(fetch_targets, fetch_results):
+             subjob_id = target["subjob_id"]
+             page_index = target["page_index"]
+             list_index = target["list_index"]
+
+             if isinstance(fetch_result, TimeoutError):
+                 logger.debug(f"Subjob {subjob_id} not ready yet; deferring aggregation")
+                 raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")
+
+             if isinstance(fetch_result, Exception):
+                 logger.error(f"Failed to fetch subjob {subjob_id}: {fetch_result}")
+                 failed_subjobs.append(
+                     {
+                         "subjob_id": subjob_id,
+                         "chunk_index": page_index,
+                         "error": str(fetch_result),
+                     }
+                 )
+                 continue
+
+             subjob_results[list_index] = fetch_result
+
+     return subjob_results
+
+
+ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
+     """Return the trace and annotation dictionaries emitted by the sink stage."""
+
+     if not isinstance(result, dict):
+         return None, None
+
+     trace = result.get("trace")
+     annotations = result.get("annotations")
+
+     trace_dict = trace if isinstance(trace, dict) else None
+     annotations_dict = annotations if isinstance(annotations, dict) else None
+
+     return trace_dict, annotations_dict
+
+
+ def _build_aggregated_response(
+     parent_job_id: str,
+     subjob_results: List[Optional[Dict[str, Any]]],
+     failed_subjobs: List[Dict[str, object]],
+     ordered_descriptors: List[Dict[str, Any]],
+     metadata: Dict[str, Any],
+ ) -> Dict[str, Any]:
+     """
+     Build the aggregated response from subjob results.
+
+     Parameters
+     ----------
+     parent_job_id : str
+         The parent job identifier
+     subjob_results : List[Optional[Dict[str, Any]]]
+         Results from all subjobs (None for failed ones)
+     failed_subjobs : List[Dict[str, object]]
+         List of failed subjob information
+     ordered_descriptors : List[Dict[str, Any]]
+         Subjob descriptors in original order
+     metadata : Dict[str, Any]
+         Parent job metadata
+
+     Returns
+     -------
+     Dict[str, Any]
+         Aggregated response with combined data and metadata
+     """
+     any_failed = len(failed_subjobs) > 0
+     subjob_ids = [desc.get("job_id") for desc in ordered_descriptors]
+
+     aggregated_result = {
+         "data": [],
+         "status": "failed" if any_failed else "success",
+         "description": (
+             "One or more subjobs failed to complete" if any_failed else "Aggregated result composed from subjob outputs"
+         ),
+         "metadata": {
+             "parent_job_id": parent_job_id,
+             "total_pages": metadata.get("total_pages", len(subjob_ids)),
+             "pages_per_chunk": metadata.get("pages_per_chunk"),
+             "original_source_id": metadata.get("original_source_id"),
+             "original_source_name": metadata.get("original_source_name"),
+             "subjobs_failed": sum(1 for r in subjob_results if r is None),
+             "failed_subjobs": failed_subjobs,
+             "subjob_ids": subjob_ids,
+             "chunks": [],
+             "trace_segments": [],
+             "annotation_segments": [],
+         },
+     }
+
+     # Aggregate subjob data in page order
+     for page_num, (result, descriptor) in enumerate(zip(subjob_results, ordered_descriptors), 1):
+         if result is not None:
+             # Add page data to aggregated result
+             if "data" in result:
+                 aggregated_result["data"].extend(result["data"])
+             chunk_entry = dict(descriptor)
+             aggregated_result["metadata"]["chunks"].append(chunk_entry)
+
+             trace_data, annotation_data = _extract_ray_telemetry(result)
+             start_page = descriptor.get("start_page")
+             end_page = descriptor.get("end_page")
+
+             if trace_data:
+                 aggregated_result["metadata"]["trace_segments"].append(
+                     {
+                         "job_id": descriptor.get("job_id"),
+                         "chunk_index": descriptor.get("chunk_index"),
+                         "start_page": start_page,
+                         "end_page": end_page,
+                         "trace": trace_data,
+                     }
+                 )
+
+             if annotation_data:
+                 aggregated_result["metadata"]["annotation_segments"].append(
+                     {
+                         "job_id": descriptor.get("job_id"),
+                         "chunk_index": descriptor.get("chunk_index"),
+                         "start_page": start_page,
+                         "end_page": end_page,
+                         "annotations": annotation_data,
+                     }
+                 )
+
+         else:
+             # Note failed page
+             logger.warning(f"Page {page_num} failed or missing")
+
+     return aggregated_result
+
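For orientation, the aggregated document returned for a parent job has roughly the shape sketched below. Field names mirror the code above; the concrete values are invented, and pages_per_chunk comes back as None in practice because submit_job_v2's parent_metadata does not record it.

# Hypothetical aggregated response for a two-chunk parent job (values invented).
aggregated_example = {
    "data": [],  # concatenation of each subjob's "data" list, in page order
    "status": "success",  # "failed" if any subjob failed
    "description": "Aggregated result composed from subjob outputs",
    "metadata": {
        "parent_job_id": "<parent uuid>",
        "total_pages": 40,
        "pages_per_chunk": None,
        "original_source_id": "report.pdf",
        "original_source_name": "report.pdf",
        "subjobs_failed": 0,
        "failed_subjobs": [],
        "subjob_ids": ["<chunk-1 uuid>", "<chunk-2 uuid>"],
        "chunks": [
            {"job_id": "<chunk-1 uuid>", "chunk_index": 1, "start_page": 1, "end_page": 32, "page_count": 32},
            {"job_id": "<chunk-2 uuid>", "chunk_index": 2, "start_page": 33, "end_page": 40, "page_count": 8},
        ],
        "trace_segments": [],  # populated when a subjob result carries a "trace" dict
        "annotation_segments": [],  # populated when a subjob result carries an "annotations" dict
    },
}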
+ # POST /v2/submit_job
+ @router.post(
+     "/submit_job",
+     responses={
+         200: {"description": "Jobs were successfully submitted"},
+         500: {"description": "Error encountered while submitting jobs."},
+         503: {"description": "Service unavailable."},
+     },
+     tags=["Ingestion"],
+     summary="submit jobs to the core nv ingestion service for processing with PDF splitting",
+     operation_id="submit_job_v2",
+ )
+ @traced_endpoint("http-submit-job-v2")
+ async def submit_job_v2(
+     request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
+ ):
+     span = trace.get_current_span()
+     try:
+         span.add_event("Submitting file for processing (V2)")
+
+         current_trace_id = span.get_span_context().trace_id
+         parent_job_id = trace_id_to_uuid(current_trace_id)
+
+         # Parse job spec
+         job_spec_dict = json.loads(job_spec.payload)
+
+         # Extract document type and payload from the proper structure
+         job_payload = job_spec_dict.get("job_payload", {})
+         document_types = job_payload.get("document_type", [])
+         payloads = job_payload.get("content", [])
+
+         # Resolve original source metadata up front for logging / subjob naming
+         source_ids = job_payload.get("source_id", ["unknown_source.pdf"])
+         source_names = job_payload.get("source_name", ["unknown_source.pdf"])
+         original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
+         original_source_name = source_names[0] if source_names else "unknown_source.pdf"
+
+         # Check if this is a PDF that needs splitting
+         if document_types and payloads and document_types[0].lower() == "pdf":
+             # Decode the payload to check page count
+             pdf_content = base64.b64decode(payloads[0])
+             page_count = get_pdf_page_count(pdf_content)
+             pages_per_chunk = get_pdf_split_page_count()
+
+             # Split if the document has more pages than our chunk size
+             if page_count > pages_per_chunk:
+                 logger.warning(
+                     "[dev-reload-check] Splitting PDF %s into %s-page chunks (total pages: %s)",
+                     original_source_name,
+                     pages_per_chunk,
+                     page_count,
+                 )
+
+                 chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
+
+                 subjob_ids: List[str] = []
+                 subjob_descriptors: List[Dict[str, Any]] = []
+                 submission_tasks = []
+
+                 try:
+                     parent_uuid = uuid.UUID(parent_job_id)
+                 except ValueError:
+                     logger.warning(
+                         "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                         parent_job_id,
+                     )
+                     parent_uuid = uuid.uuid4()
+
+                 for chunk in chunks:
+                     subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                         job_spec_dict,
+                         chunk,
+                         parent_uuid=parent_uuid,
+                         parent_job_id=parent_job_id,
+                         current_trace_id=current_trace_id,
+                         original_source_id=original_source_id,
+                         original_source_name=original_source_name,
+                     )
+                     submission_tasks.append(ingest_service.submit_job(subjob_wrapper, subjob_id))
+                     subjob_ids.append(subjob_id)
+                     subjob_descriptors.append(
+                         {
+                             "job_id": subjob_id,
+                             "chunk_index": len(subjob_descriptors) + 1,
+                             "start_page": chunk.get("start_page"),
+                             "end_page": chunk.get("end_page"),
+                             "page_count": chunk.get("page_count"),
+                         }
+                     )
+
+                 if submission_tasks:
+                     await asyncio.gather(*submission_tasks)
+
+                 parent_metadata: Dict[str, Any] = {
+                     "total_pages": page_count,
+                     "original_source_id": original_source_id,
+                     "original_source_name": original_source_name,
+                     "document_type": document_types[0] if document_types else "pdf",
+                     "subjob_order": subjob_ids,
+                 }
+
+                 await ingest_service.set_parent_job_mapping(
+                     parent_job_id,
+                     subjob_ids,
+                     parent_metadata,
+                     subjob_descriptors=subjob_descriptors,
+                 )
+
+                 await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+
+                 span.add_event(f"Split into {len(subjob_ids)} subjobs")
+                 response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+                 return parent_job_id
+
+         # For non-PDFs or cases where splitting is not required, submit as normal
+         if "tracing_options" not in job_spec_dict:
+             job_spec_dict["tracing_options"] = {"trace": True}
+         job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
+         updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))
+
+         span.add_event("Submitting as single job (no split needed)")
+
+         # Submit the job to the pipeline task queue
+         await ingest_service.submit_job(updated_job_spec, parent_job_id)
+         await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+
+         response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+         return parent_job_id
+
+     except Exception as ex:
+         logger.exception(f"Error submitting job: {str(ex)}")
+         raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+
+
+ # GET /v2/fetch_job
+ @router.get(
+     "/fetch_job/{job_id}",
+     responses={
+         200: {"description": "Job result successfully retrieved."},
+         202: {"description": "Job is processing or result not yet available. Retry later."},
+         404: {"description": "Job ID not found or associated state has expired."},
+         410: {"description": "Job result existed but is now gone (expired or retrieved destructively/cached)."},
+         500: {"description": "Internal server error during fetch processing."},
+         503: {"description": "Job processing failed, or backend service temporarily unavailable preventing fetch."},
+     },
+     tags=["Ingestion"],
+     summary="Fetch the result of a previously submitted job by its job_id (V2 with aggregation)",
+     operation_id="fetch_job_v2",
+ )
+ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):
+     """
+     V2 fetch that handles parent job aggregation.
+     """
+     try:
+         # Check if this is a parent job with subjobs
+         subjob_info = await ingest_service.get_parent_job_info(job_id)
+
+         if subjob_info is None:
+             # Not a parent job, fetch identical to V1
+             current_state = await ingest_service.get_job_state(job_id)
+             logger.debug(f"Initial state check for job {job_id}: {current_state}")
+
+             if current_state is None:
+                 logger.warning(f"Job {job_id} not found or expired. Returning 404.")
+                 raise HTTPException(status_code=404, detail="Job ID not found or state has expired.")
+
+             if current_state == STATE_FAILED:
+                 logger.error(f"Job {job_id} failed. Returning 503.")
+                 raise HTTPException(status_code=503, detail="Job processing failed.")
+
+             if current_state == STATE_RETRIEVED_DESTRUCTIVE:
+                 logger.warning(f"Job {job_id} was destructively retrieved. Returning 410.")
+                 raise HTTPException(status_code=410, detail="Job result is gone (destructive read).")
+
+             if current_state in INTERMEDIATE_STATES or current_state in {
+                 STATE_RETRIEVED_NON_DESTRUCTIVE,
+                 STATE_RETRIEVED_CACHED,
+             }:
+                 logger.debug(f"Attempting fetch for job {job_id} in state {current_state}.")
+
+                 try:
+                     job_response = await ingest_service.fetch_job(job_id)
+                     logger.debug(f"Fetched result for job {job_id}.")
+
+                     try:
+                         current_fetch_mode = await ingest_service.get_fetch_mode()
+                         if current_fetch_mode == FetchMode.DESTRUCTIVE:
+                             target_state = STATE_RETRIEVED_DESTRUCTIVE
+                         elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
+                             target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
+                         elif current_fetch_mode == FetchMode.CACHE_BEFORE_DELETE:
+                             target_state = STATE_RETRIEVED_CACHED
+                         else:
+                             target_state = "RETRIEVED_UNKNOWN"
+
+                         if target_state != "RETRIEVED_UNKNOWN":
+                             await ingest_service.set_job_state(job_id, target_state)
+                             logger.debug(f"Updated job {job_id} state to {target_state}.")
+                     except Exception as state_err:
+                         logger.error(f"Failed to set job state for {job_id} after fetch: {state_err}")
+
+                     try:
+                         json_bytes = json.dumps(job_response).encode("utf-8")
+                         return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
+                     except TypeError as json_err:
+                         logger.exception(f"Serialization error for job {job_id}: {json_err}")
+                         raise HTTPException(
+                             status_code=500, detail="Internal server error: Failed to serialize result."
+                         )
+
+                 except (TimeoutError, RedisError, ConnectionError) as fetch_err:
+                     # Handle timeout/error cases same as V1
+                     fetch_err_type = type(fetch_err).__name__
+
+                     if isinstance(fetch_err, TimeoutError):
+                         logger.debug(
+                             f"Job {job_id} still processing (state: {current_state}), fetch attempt timed out cleanly."
+                         )
+                     else:
+                         logger.warning(
+                             f"Backend error ({fetch_err_type}) during fetch attempt for job {job_id} "
+                             f"(state: {current_state}): {fetch_err}"
+                         )
+
+                     if current_state == STATE_RETRIEVED_NON_DESTRUCTIVE:
+                         if isinstance(fetch_err, TimeoutError):
+                             raise HTTPException(status_code=410, detail="Job result is gone (TTL expired).")
+                         else:
+                             raise HTTPException(
+                                 status_code=503, detail="Backend service unavailable preventing access to job result."
+                             )
+                     elif current_state == STATE_RETRIEVED_CACHED:
+                         raise HTTPException(
+                             status_code=410, detail="Job result is gone (previously cached, fetch failed)."
+                         )
+                     elif current_state in INTERMEDIATE_STATES:
+                         if isinstance(fetch_err, TimeoutError):
+                             raise HTTPException(
+                                 status_code=202, detail=f"Job is processing (state: {current_state}). Retry later."
+                             )
+                         else:
+                             raise HTTPException(
+                                 status_code=503, detail="Backend service unavailable preventing fetch of job result."
+                             )
+                     else:
+                         logger.error(f"Unexpected state '{current_state}' for job {job_id} after fetch failure.")
+                         raise HTTPException(
+                             status_code=500, detail="Internal server error: Unexpected job state after fetch failure."
+                         )
+             else:
+                 logger.error(f"Unknown job state '{current_state}' for job {job_id}.")
+                 raise HTTPException(
+                     status_code=500, detail=f"Internal server error: Unknown job state '{current_state}'."
+                 )
+
+         else:
+             # This is a parent job - orchestrate aggregation using declarative helpers
+             subjob_ids = subjob_info.get("subjob_ids", [])
+             metadata = subjob_info.get("metadata", {})
+
+             logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")
+
+             # Build ordered descriptors for subjobs
+             stored_descriptors = subjob_info.get("subjob_descriptors") or []
+             descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
+
+             ordered_descriptors: List[Dict[str, Any]] = []
+             for idx, subjob_id in enumerate(subjob_ids, 1):
+                 descriptor = descriptor_lookup.get(subjob_id, {})
+                 ordered_descriptors.append(
+                     {
+                         "job_id": subjob_id,
+                         "chunk_index": descriptor.get("chunk_index", idx),
+                         "start_page": descriptor.get("start_page"),
+                         "end_page": descriptor.get("end_page"),
+                         "page_count": descriptor.get("page_count"),
+                     }
+                 )
+
+             # Calculate max parallel operations (stay within Redis connection pool)
+             max_parallel_ops = max(
+                 1, min(len(ordered_descriptors), getattr(ingest_service, "_concurrency_level", 10) // 2)
+             )
+
+             # Check all subjob states (raises 202 if any still processing)
+             subjob_states, failed_subjobs = await _check_all_subjob_states(
+                 ordered_descriptors, max_parallel_ops, ingest_service
+             )
+
+             # Fetch all subjob results (raises 202 if any not ready)
+             subjob_results = await _fetch_all_subjob_results(
+                 ordered_descriptors, subjob_states, failed_subjobs, max_parallel_ops, ingest_service
+             )
+
+             # Build aggregated response from all subjob results
+             aggregated_result = _build_aggregated_response(
+                 job_id, subjob_results, failed_subjobs, ordered_descriptors, metadata
+             )
+
+             # Update parent job state after successful aggregation
+             await _update_job_state_after_fetch(job_id, ingest_service)
+
+             # Return aggregated result as streaming response
+             return _stream_json_response(aggregated_result)
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.exception(f"Unexpected error in fetch_job_v2: {e}")
+         raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
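Taken together, the V2 flow is: POST a job spec to /v2/submit_job (large PDFs are transparently split into page-chunk subjobs), then poll GET /v2/fetch_job/{job_id} until it stops answering 202. The snippet below is an editorial sketch of that loop, not code from the package; the base URL, file name, and polling interval are assumptions, and any HTTP client would do in place of httpx.

import base64
import json
import time

import httpx  # assumed client library, not a dependency declared by this diff

BASE_URL = "http://localhost:7670"  # hypothetical nv-ingest REST address

with open("report.pdf", "rb") as fh:  # hypothetical input document
    content_b64 = base64.b64encode(fh.read()).decode("utf-8")

# Minimal job spec covering the fields submit_job_v2 reads from job_payload.
job_spec = {
    "job_payload": {
        "document_type": ["pdf"],
        "content": [content_b64],
        "source_id": ["report.pdf"],
        "source_name": ["report.pdf"],
    }
}

# The endpoint takes a MessageWrapper body, i.e. {"payload": "<job spec as a JSON string>"}.
submit = httpx.post(f"{BASE_URL}/v2/submit_job", json={"payload": json.dumps(job_spec)})
submit.raise_for_status()
job_id = submit.json()  # the endpoint returns the parent job id as a JSON string

# Poll until the parent job (and every page-chunk subjob) has finished.
while True:
    fetched = httpx.get(f"{BASE_URL}/v2/fetch_job/{job_id}")
    if fetched.status_code == 202:
        time.sleep(2)
        continue
    fetched.raise_for_status()
    result = fetched.json()
    # For split PDFs this is the aggregated parent document; otherwise the single job result.
    print(result.get("status"), len(result.get("data", [])))
    break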