nv-ingest 2025.10.4.dev20251004__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/process/dependent_services.py +17 -10
  8. nv_ingest/framework/orchestration/process/strategies.py +6 -2
  9. nv_ingest/framework/orchestration/process/termination.py +49 -9
  10. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
  11. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
  12. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  13. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +41 -8
  14. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +72 -6
  15. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  16. nv_ingest/pipeline/config/replica_resolver.py +12 -2
  17. nv_ingest/pipeline/default_libmode_pipeline_impl.py +32 -18
  18. nv_ingest/pipeline/default_pipeline_impl.py +75 -33
  19. {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +4 -2
  20. {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +23 -18
  21. {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
  22. {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
  23. {nv_ingest-2025.10.4.dev20251004.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
nv_ingest/api/v2/ingest.py
@@ -0,0 +1,1300 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # pylint: skip-file
5
+
6
+ import asyncio
7
+ from io import BytesIO
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ import base64
10
+ import json
11
+ import logging
12
+ import os
13
+ import time
14
+ import uuid
15
+ import random
16
+ from pathlib import Path
17
+ import fsspec
18
+
19
+ from fastapi import APIRouter, Request, Response
20
+ from fastapi import HTTPException
21
+ from fastapi.responses import StreamingResponse
22
+ from redis import RedisError
23
+
24
+ from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
25
+ from nv_ingest_api.util.service_clients.client_base import FetchMode
26
+ from nv_ingest_api.util.dataloader.dataloader import DataLoader
27
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
28
+
29
+ # For PDF splitting
30
+ import pypdfium2 as pdfium
31
+
32
+ # Reuse V1 state constants and helper functions
33
+ from ..v1.ingest import (
34
+ trace_id_to_uuid,
35
+ INGEST_SERVICE_T,
36
+ STATE_RETRIEVED_DESTRUCTIVE,
37
+ STATE_RETRIEVED_NON_DESTRUCTIVE,
38
+ STATE_RETRIEVED_CACHED,
39
+ STATE_FAILED,
40
+ STATE_SUBMITTED,
41
+ INTERMEDIATE_STATES,
42
+ )
43
+ from .. import traced_endpoint
44
+ from opentelemetry import trace
45
+
46
+ logger = logging.getLogger("uvicorn")
47
+
48
+ router = APIRouter()
49
+
50
+ DEFAULT_PDF_SPLIT_PAGE_COUNT = 32
51
+
52
+ # Default QoS thresholds (pages). Tunable via environment variables:
53
+ # QOS_MAX_PAGES_MICRO, QOS_MAX_PAGES_SMALL, QOS_MAX_PAGES_MEDIUM
54
+ _QOS_DEFAULTS = {
55
+ "micro": 8,
56
+ "small": 64,
57
+ "medium": 256,
58
+ }
59
+
60
+
61
+ def get_qos_tier_for_page_count(page_count: int) -> str:
62
+ """
63
+ Select QoS tier for a document based on its total page count.
64
+ Tiers: 'micro', 'small', 'medium', 'large', 'default'
65
+ Thresholds can be tuned via environment variables:
66
+ - QOS_MAX_PAGES_MICRO (default: 8)
67
+ - QOS_MAX_PAGES_SMALL (default: 64)
68
+ - QOS_MAX_PAGES_MEDIUM (default: 256)
69
+ Anything above MEDIUM is 'large'. Non-positive page_count returns 'default'.
70
+ """
71
+ try:
72
+ micro_max = int(os.getenv("QOS_MAX_PAGES_MICRO", str(_QOS_DEFAULTS["micro"])))
73
+ small_max = int(os.getenv("QOS_MAX_PAGES_SMALL", str(_QOS_DEFAULTS["small"])))
74
+ medium_max = int(os.getenv("QOS_MAX_PAGES_MEDIUM", str(_QOS_DEFAULTS["medium"])))
75
+ except ValueError:
76
+ micro_max, small_max, medium_max = _QOS_DEFAULTS["micro"], _QOS_DEFAULTS["small"], _QOS_DEFAULTS["medium"]
77
+
78
+ if page_count <= 0:
79
+ return "default"
80
+ if page_count <= micro_max:
81
+ return "micro"
82
+ if page_count <= small_max:
83
+ return "small"
84
+ if page_count <= medium_max:
85
+ return "medium"
86
+ return "large"
87
+
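# A minimal sketch of how the helper above maps page counts to tiers, assuming the coded
# defaults (micro <= 8, small <= 64, medium <= 256 pages) and no QOS_MAX_PAGES_* overrides;
# the _example_* helper name is illustrative only.
def _example_qos_tier_mapping() -> None:
    assert get_qos_tier_for_page_count(0) == "default"     # non-positive page count
    assert get_qos_tier_for_page_count(5) == "micro"
    assert get_qos_tier_for_page_count(64) == "small"
    assert get_qos_tier_for_page_count(65) == "medium"
    assert get_qos_tier_for_page_count(1000) == "large"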
88
+
89
+ def get_pdf_split_page_count(client_override: Optional[int] = None) -> int:
90
+ """
91
+ Resolve the page chunk size for PDF splitting with client override support.
92
+
93
+ Priority: client_override (clamped) > env var > default (32)
94
+ Enforces boundaries: min=1, max=128
95
+ """
96
+ MIN_PAGES = 1
97
+ MAX_PAGES = 128
98
+
99
+ # Client override takes precedence if provided
100
+ if client_override is not None:
101
+ clamped = max(MIN_PAGES, min(client_override, MAX_PAGES))
102
+ if clamped != client_override:
103
+ logger.warning(
104
+ "Client requested split_page_count=%s; clamped to %s (min=%s, max=%s)",
105
+ client_override,
106
+ clamped,
107
+ MIN_PAGES,
108
+ MAX_PAGES,
109
+ )
110
+ return clamped
111
+
112
+ # Fall back to environment variable
113
+ raw_value = os.environ.get("PDF_SPLIT_PAGE_COUNT")
114
+ if raw_value is None:
115
+ return DEFAULT_PDF_SPLIT_PAGE_COUNT
116
+
117
+ try:
118
+ parsed = int(raw_value)
119
+ except ValueError:
120
+ logger.warning(
121
+ "Invalid PDF_SPLIT_PAGE_COUNT '%s'; falling back to default %s", raw_value, DEFAULT_PDF_SPLIT_PAGE_COUNT
122
+ )
123
+ return DEFAULT_PDF_SPLIT_PAGE_COUNT
124
+
125
+ if parsed <= 0:
126
+ logger.warning("PDF_SPLIT_PAGE_COUNT must be >= 1; received %s. Using 1.", parsed)
127
+ return 1
128
+
129
+ return parsed
130
+
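# A small sketch of the resolution order described above: a client override wins but is
# clamped to [1, 128], then the PDF_SPLIT_PAGE_COUNT environment variable, then the module
# default of 32. The _example_* helper is illustrative only and mutates os.environ.
def _example_split_page_count_resolution() -> None:
    assert get_pdf_split_page_count(client_override=200) == 128  # clamped to MAX_PAGES
    assert get_pdf_split_page_count(client_override=0) == 1      # clamped to MIN_PAGES
    os.environ["PDF_SPLIT_PAGE_COUNT"] = "48"
    assert get_pdf_split_page_count() == 48                      # env var fallback
    del os.environ["PDF_SPLIT_PAGE_COUNT"]
    assert get_pdf_split_page_count() == DEFAULT_PDF_SPLIT_PAGE_COUNT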
131
+
132
+ def split_pdf_to_chunks(pdf_content: bytes, pages_per_chunk: int) -> List[Dict[str, Any]]:
133
+ """
134
+ Split a PDF into multi-page chunks using pypdfium2.
135
+
136
+ Returns a list of dictionaries containing the chunk bytes and page range metadata.
137
+ Note: this currently buffers each chunk in-memory; consider streaming in future upgrades.
138
+ """
139
+
140
+ chunks: List[Dict[str, Any]] = []
141
+
142
+ if pages_per_chunk <= 0:
143
+ pages_per_chunk = 1
144
+
145
+ pdf = pdfium.PdfDocument(pdf_content)
146
+ total_pages = len(pdf)
147
+
148
+ try:
149
+ for chunk_index, start_zero in enumerate(range(0, total_pages, pages_per_chunk)):
150
+ end_zero = min(start_zero + pages_per_chunk, total_pages)
151
+ page_indices = list(range(start_zero, end_zero))
152
+
153
+ new_pdf = pdfium.PdfDocument.new()
154
+ try:
155
+ new_pdf.import_pages(pdf, page_indices)
156
+
157
+ buffer = BytesIO()
158
+ try:
159
+ new_pdf.save(buffer)
160
+ chunk_bytes = buffer.getvalue()
161
+ finally:
162
+ buffer.close()
163
+ finally:
164
+ new_pdf.close()
165
+
166
+ start_page = start_zero + 1
167
+ end_page = end_zero
168
+ chunk_info: Dict[str, Any] = {
169
+ "bytes": chunk_bytes,
170
+ "chunk_index": chunk_index,
171
+ "start_page": start_page,
172
+ "end_page": end_page,
173
+ "page_count": end_page - start_page + 1,
174
+ }
175
+ chunks.append(chunk_info)
176
+
177
+ finally:
178
+ pdf.close()
179
+
180
+ return chunks
181
+
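# Illustrative usage of the splitter above; "document.pdf" is an assumed local file and not
# shipped with the package. Each chunk dict carries the serialized bytes plus the 1-based
# page range that later drives subjob naming and page re-mapping.
def _example_split_local_pdf(path: str = "document.pdf") -> None:
    with open(path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    for chunk in split_pdf_to_chunks(pdf_bytes, pages_per_chunk=get_pdf_split_page_count()):
        print(chunk["chunk_index"], chunk["start_page"], chunk["end_page"], chunk["page_count"])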
182
+
183
+ def get_pdf_page_count(pdf_content: bytes) -> int:
184
+ """Get the number of pages in a PDF using pypdfium2."""
185
+ try:
186
+ pdf = pdfium.PdfDocument(pdf_content)
187
+ page_count = len(pdf)
188
+ pdf.close()
189
+ return page_count
190
+ except Exception as e:
191
+ logger.warning(f"Failed to get PDF page count: {e}")
192
+ return 1 # Assume single page on error
193
+
194
+
195
+ def _create_subjob_dict(
196
+ job_id: str,
197
+ job_payload: Dict[str, Any],
198
+ job_spec_template: Dict[str, Any],
199
+ current_trace_id: int,
200
+ parent_job_id: str,
201
+ start_key: Dict[str, Any],
202
+ ) -> Dict[str, Any]:
203
+ job_spec = {
204
+ key: value
205
+ for key, value in job_spec_template.items()
206
+ if key not in {"job_payload", "job_id", "tracing_options"}
207
+ }
208
+ job_spec["job_payload"] = job_payload
209
+ job_spec["job_id"] = job_id
210
+
211
+ base_tracing_options = job_spec_template.get("tracing_options") or {}
212
+ tracing_options = dict(base_tracing_options)
213
+ tracing_options.setdefault("trace", True)
214
+ tracing_options["trace_id"] = str(current_trace_id)
215
+ tracing_options["ts_send"] = int(time.time() * 1000)
216
+ tracing_options["parent_job_id"] = parent_job_id
217
+ for key, value in start_key.items():
218
+ tracing_options[key] = value
219
+
220
+ job_spec["tracing_options"] = tracing_options
221
+ return job_spec
222
+
223
+
224
+ def _create_payload_dict(
225
+ job_spec_template: Dict[str, Any],
226
+ content: str,
227
+ source_id: str,
228
+ source_name: str,
229
+ document_type: str,
230
+ ) -> Dict[str, Any]:
231
+ subjob_payload_template = job_spec_template.get("job_payload", {})
232
+ subjob_payload = {
233
+ key: value
234
+ for key, value in subjob_payload_template.items()
235
+ if key not in {"content", "source_id", "source_name"}
236
+ }
237
+
238
+ subjob_payload["content"] = [content]
239
+
240
+ subjob_payload["source_id"] = [source_id]
241
+ subjob_payload["source_name"] = [source_name]
242
+ subjob_payload["document_type"] = [document_type]
243
+ return subjob_payload
244
+
245
+
246
+ def _prepare_chunk_submission(
247
+ job_spec_template: Dict[str, Any],
248
+ chunk: Dict[str, Any],
249
+ *,
250
+ parent_uuid: uuid.UUID,
251
+ parent_job_id: str,
252
+ current_trace_id: int,
253
+ source_id: str,
254
+ source_name: str,
255
+ document_type: str,
256
+ ) -> Tuple[str, MessageWrapper]:
257
+ """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
258
+
259
+ chunk_number = chunk["chunk_index"] + 1
260
+
261
+ subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
262
+ subjob_id = str(subjob_uuid)
263
+
264
+ subjob_payload_template = job_spec_template.get("job_payload", {})
265
+ chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
266
+ subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
267
+ start = chunk["start_page"] if "start_page" in chunk else chunk["start"]
268
+
269
+ subjob_spec = _create_subjob_dict(
270
+ subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
271
+ )
272
+
273
+ return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
274
+
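# Hypothetical sketch of turning one chunk from split_pdf_to_chunks() into a submittable
# MessageWrapper. The job_spec_template and chunk arguments are assumed to come from the
# caller (as in submit_job_v2 below); the trace id and source naming here are placeholders.
def _example_prepare_first_chunk(job_spec_template: Dict[str, Any], chunk: Dict[str, Any]) -> None:
    parent_job_id = str(uuid.uuid4())
    subjob_id, wrapper = _prepare_chunk_submission(
        job_spec_template,
        chunk,
        parent_uuid=uuid.UUID(parent_job_id),
        parent_job_id=parent_job_id,
        current_trace_id=0x1234,
        source_id="report.pdf#pages_1-32",
        source_name="report.pdf#pages_1-32",
        document_type=DocumentTypeEnum.PDF,
    )
    print(subjob_id, len(wrapper.payload))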
275
+
276
+ # ============================================================================
277
+ # Helper Functions for Fetch Job Aggregation
278
+ # ============================================================================
279
+
280
+
281
+ async def _gather_in_batches(coroutines: List, batch_size: int, return_exceptions: bool = False) -> List[Any]:
282
+ """
283
+ Execute coroutines in batches to respect concurrency limits.
284
+
285
+ Parameters
286
+ ----------
287
+ coroutines : List
288
+ List of coroutines to execute
289
+ batch_size : int
290
+ Maximum number of coroutines to execute concurrently
291
+ return_exceptions : bool
292
+ Whether to return exceptions as results (passed to asyncio.gather)
293
+
294
+ Returns
295
+ -------
296
+ List[Any]
297
+ Results from all coroutines in original order
298
+ """
299
+ results: List[Any] = []
300
+ for offset in range(0, len(coroutines), batch_size):
301
+ batch = coroutines[offset : offset + batch_size]
302
+ batch_results = await asyncio.gather(*batch, return_exceptions=return_exceptions)
303
+ results.extend(batch_results)
304
+ return results
305
+
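# Minimal sketch of the batching helper above: three coroutines run at most two at a time,
# and results come back in submission order.
async def _example_gather_in_batches() -> None:
    async def square(n: int) -> int:
        await asyncio.sleep(0)
        return n * n

    results = await _gather_in_batches([square(1), square(2), square(3)], batch_size=2)
    assert results == [1, 4, 9]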
306
+
307
+ async def _update_job_state_after_fetch(job_id: str, ingest_service: INGEST_SERVICE_T) -> None:
308
+ """
309
+ Update job state after successful fetch based on configured fetch mode.
310
+
311
+ Parameters
312
+ ----------
313
+ job_id : str
314
+ The job identifier
315
+ ingest_service : IngestServiceMeta
316
+ The ingest service instance
317
+ """
318
+ try:
319
+ current_fetch_mode = await ingest_service.get_fetch_mode()
320
+ if current_fetch_mode == FetchMode.DESTRUCTIVE:
321
+ target_state = STATE_RETRIEVED_DESTRUCTIVE
322
+ elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
323
+ target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
324
+ else:
325
+ target_state = STATE_RETRIEVED_CACHED
326
+
327
+ await ingest_service.set_job_state(job_id, target_state)
328
+ logger.debug(f"Updated job {job_id} state to {target_state}")
329
+ except Exception as e:
330
+ logger.error(f"Failed to update job state for {job_id}: {e}")
331
+
332
+
333
+ def _stream_json_response(data: Dict[str, Any]) -> StreamingResponse:
334
+ """
335
+ Create a StreamingResponse for JSON data.
336
+
337
+ Parameters
338
+ ----------
339
+ data : Dict[str, Any]
340
+ The data to serialize and stream
341
+
342
+ Returns
343
+ -------
344
+ StreamingResponse
345
+ FastAPI streaming response with JSON content
346
+ """
347
+ json_bytes = json.dumps(data).encode("utf-8")
348
+ return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
349
+
350
+
351
+ async def _check_all_subjob_states(
352
+ ordered_descriptors: List[Dict[str, Any]], max_parallel_ops: int, ingest_service: INGEST_SERVICE_T
353
+ ) -> Tuple[List[Optional[str]], List[Dict[str, object]]]:
354
+ """
355
+ Check the state of all subjobs in parallel batches.
356
+
357
+ Parameters
358
+ ----------
359
+ ordered_descriptors : List[Dict[str, Any]]
360
+ List of subjob descriptors with job_id and chunk_index
361
+ max_parallel_ops : int
362
+ Maximum number of parallel operations
363
+ ingest_service : IngestServiceMeta
364
+ The ingest service instance
365
+
366
+ Returns
367
+ -------
368
+ Tuple[List[Optional[str]], List[Dict[str, object]]]
369
+ Tuple of (subjob_states, failed_subjobs_list)
370
+
371
+ Raises
372
+ ------
373
+ HTTPException
374
+ If any subjob is still processing (202)
375
+ """
376
+ # Gather all subjob states in parallel batches
377
+ state_coroutines = [ingest_service.get_job_state(descriptor.get("job_id")) for descriptor in ordered_descriptors]
378
+ subjob_states = await _gather_in_batches(state_coroutines, max_parallel_ops)
379
+
380
+ # Check for failures and pending work
381
+ failed_subjobs: List[Dict[str, object]] = []
382
+
383
+ for page_index, (descriptor, subjob_state) in enumerate(zip(ordered_descriptors, subjob_states), start=1):
384
+ subjob_id = descriptor.get("job_id")
385
+
386
+ if subjob_state == STATE_FAILED:
387
+ logger.warning(f"Subjob {subjob_id} failed")
388
+ failed_subjobs.append({"subjob_id": subjob_id, "chunk_index": page_index})
389
+ elif subjob_state in INTERMEDIATE_STATES:
390
+ raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")
391
+
392
+ return subjob_states, failed_subjobs
393
+
394
+
395
+ async def _fetch_all_subjob_results(
396
+ ordered_descriptors: List[Dict[str, Any]],
397
+ subjob_states: List[Optional[str]],
398
+ failed_subjobs: List[Dict[str, object]],
399
+ max_parallel_ops: int,
400
+ ingest_service: INGEST_SERVICE_T,
401
+ ) -> List[Optional[Dict[str, Any]]]:
402
+ """
403
+ Fetch results for all completed subjobs in parallel batches.
404
+
405
+ Parameters
406
+ ----------
407
+ ordered_descriptors : List[Dict[str, Any]]
408
+ List of subjob descriptors
409
+ subjob_states : List[Optional[str]]
410
+ States of all subjobs (from _check_all_subjob_states)
411
+ failed_subjobs : List[Dict[str, object]]
412
+ List to append failed fetch attempts to (modified in place)
413
+ max_parallel_ops : int
414
+ Maximum number of parallel operations
415
+ ingest_service : IngestServiceMeta
416
+ The ingest service instance
417
+
418
+ Returns
419
+ -------
420
+ List[Optional[Dict[str, Any]]]
421
+ Results for each subjob (None for failed ones)
422
+
423
+ Raises
424
+ ------
425
+ HTTPException
426
+ If any subjob is not ready yet (202)
427
+ """
428
+ # Initialize results array with None placeholders
429
+ subjob_results: List[Optional[Dict[str, Any]]] = [None] * len(ordered_descriptors)
430
+
431
+ # Build list of fetch tasks (only for non-failed subjobs)
432
+ fetch_coroutines = []
433
+ fetch_targets: List[Dict[str, Any]] = []
434
+
435
+ for list_index, (page_index, descriptor, subjob_state) in enumerate(
436
+ zip(range(1, len(ordered_descriptors) + 1), ordered_descriptors, subjob_states)
437
+ ):
438
+ subjob_id = descriptor.get("job_id")
439
+
440
+ # Skip failed subjobs (already recorded in failed_subjobs)
441
+ if subjob_state == STATE_FAILED:
442
+ continue
443
+
444
+ # Skip intermediate states (should have been caught earlier, but defensive)
445
+ if subjob_state in INTERMEDIATE_STATES:
446
+ continue
447
+
448
+ # Queue this subjob for fetching
449
+ fetch_coroutines.append(ingest_service.fetch_job(subjob_id))
450
+ fetch_targets.append(
451
+ {
452
+ "list_index": list_index,
453
+ "page_index": page_index,
454
+ "subjob_id": subjob_id,
455
+ }
456
+ )
457
+
458
+ # Fetch all results in parallel batches
459
+ if fetch_coroutines:
460
+ fetch_results = await _gather_in_batches(fetch_coroutines, max_parallel_ops, return_exceptions=True)
461
+
462
+ # Process results and handle errors
463
+ for target, fetch_result in zip(fetch_targets, fetch_results):
464
+ subjob_id = target["subjob_id"]
465
+ page_index = target["page_index"]
466
+ list_index = target["list_index"]
467
+
468
+ if isinstance(fetch_result, TimeoutError):
469
+ logger.debug(f"Subjob {subjob_id} not ready yet; deferring aggregation")
470
+ raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")
471
+
472
+ if isinstance(fetch_result, Exception):
473
+ logger.error(f"Failed to fetch subjob {subjob_id}: {fetch_result}")
474
+ failed_subjobs.append(
475
+ {
476
+ "subjob_id": subjob_id,
477
+ "chunk_index": page_index,
478
+ "error": str(fetch_result),
479
+ }
480
+ )
481
+ continue
482
+
483
+ subjob_results[list_index] = fetch_result
484
+
485
+ return subjob_results
486
+
487
+
488
+ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
489
+ """Return the trace and annotation dictionaries emitted by the sink stage."""
490
+
491
+ if not isinstance(result, dict):
492
+ return None, None
493
+
494
+ trace = result.get("trace")
495
+ annotations = result.get("annotations")
496
+
497
+ trace_dict = trace if isinstance(trace, dict) else None
498
+ annotations_dict = annotations if isinstance(annotations, dict) else None
499
+
500
+ return trace_dict, annotations_dict
501
+
502
+
503
+ def _normalize_chunk_records(
504
+ records: Optional[List[Any]],
505
+ descriptor: Dict[str, Any],
506
+ parent_metadata: Dict[str, Any],
507
+ ) -> List[Any]:
508
+ """Re-map chunk-local metadata to document-level context for aggregation."""
509
+
510
+ if not isinstance(records, list):
511
+ return []
512
+
513
+ total_pages = parent_metadata.get("total_pages")
514
+ original_source_id = parent_metadata.get("original_source_id")
515
+ original_source_name = parent_metadata.get("original_source_name")
516
+
517
+ start_page = descriptor.get("start_page")
518
+ page_offset = start_page - 1 if isinstance(start_page, int) and start_page > 0 else 0
519
+
520
+ normalized_entries: List[Any] = []
521
+
522
+ for entry in records:
523
+ if not isinstance(entry, dict):
524
+ normalized_entries.append(entry)
525
+ continue
526
+
527
+ normalized_entry = entry.copy()
528
+ original_metadata = entry.get("metadata")
529
+
530
+ if isinstance(original_metadata, dict):
531
+ normalized_metadata = original_metadata.copy()
532
+ normalized_entry["metadata"] = normalized_metadata
533
+
534
+ original_source_meta = original_metadata.get("source_metadata")
535
+ if isinstance(original_source_meta, dict):
536
+ normalized_source_meta = original_source_meta.copy()
537
+ normalized_metadata["source_metadata"] = normalized_source_meta
538
+
539
+ if original_source_id:
540
+ normalized_source_meta["source_id"] = original_source_id
541
+ if original_source_name:
542
+ normalized_source_meta["source_name"] = original_source_name
543
+
544
+ original_content_meta = original_metadata.get("content_metadata")
545
+ if isinstance(original_content_meta, dict):
546
+ normalized_content_meta = original_content_meta.copy()
547
+ normalized_metadata["content_metadata"] = normalized_content_meta
548
+
549
+ page_number = normalized_content_meta.get("page_number")
550
+ if isinstance(page_number, int) and page_number >= 0:
551
+ normalized_content_meta["page_number"] = page_number + page_offset
552
+
553
+ if isinstance(total_pages, int) and isinstance(normalized_content_meta.get("page_count"), int):
554
+ # Ensure optional per-record page count reflects the full document
555
+ normalized_content_meta["page_count"] = total_pages
556
+
557
+ original_hierarchy = original_content_meta.get("hierarchy")
558
+ if isinstance(original_hierarchy, dict):
559
+ normalized_hierarchy = original_hierarchy.copy()
560
+ normalized_content_meta["hierarchy"] = normalized_hierarchy
561
+
562
+ hierarchy_page = normalized_hierarchy.get("page")
563
+ if isinstance(hierarchy_page, int) and hierarchy_page >= 0:
564
+ normalized_hierarchy["page"] = hierarchy_page + page_offset
565
+ if isinstance(total_pages, int):
566
+ normalized_hierarchy["page_count"] = total_pages
567
+
568
+ normalized_entries.append(normalized_entry)
569
+
570
+ return normalized_entries
571
+
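# Worked sketch of the re-mapping above: a record emitted by the chunk covering pages 33-64
# has its chunk-local page_number shifted back into document coordinates and its source
# identifiers restored to the original (pre-split) values. Inputs here are assumptions.
def _example_normalize_chunk_record() -> None:
    records = [
        {
            "metadata": {
                "content_metadata": {"page_number": 2},
                "source_metadata": {"source_id": "report.pdf#pages_33-64"},
            }
        }
    ]
    descriptor = {"start_page": 33, "end_page": 64}
    parent_metadata = {"total_pages": 100, "original_source_id": "report.pdf"}
    normalized = _normalize_chunk_records(records, descriptor, parent_metadata)
    assert normalized[0]["metadata"]["content_metadata"]["page_number"] == 34  # 2 + (33 - 1)
    assert normalized[0]["metadata"]["source_metadata"]["source_id"] == "report.pdf"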
572
+
573
+ def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
574
+ """
575
+ Aggregate chunk-level traces into parent-level metrics.
576
+
577
+ For each stage found in chunk traces:
578
+ - trace::entry::<stage> = min(all chunk entries) - earliest start
579
+ - trace::exit::<stage> = max(all chunk exits) - latest finish
580
+ - trace::resident_time::<stage> = sum(chunk durations) - total compute
581
+
582
+ Parameters
583
+ ----------
584
+ chunk_traces : Dict[str, Any]
585
+ Trace dict with chunk-prefixed keys (chunk_N::trace::entry::stage_name)
586
+
587
+ Returns
588
+ -------
589
+ Dict[str, Any]
590
+ Parent-level aggregated traces (trace::entry::stage_name, etc.)
591
+ """
592
+ # Group by stage: {stage_name: {chunk_idx: {entry: float, exit: float}}}
593
+ stage_data: Dict[str, Dict[int, Dict[str, Any]]] = {}
594
+
595
+ for key, value in chunk_traces.items():
596
+ if not key.startswith("chunk_"):
597
+ continue
598
+
599
+ parts = key.split("::")
600
+ if len(parts) < 4: # Minimum: chunk_N::trace::entry/exit::stage_name
601
+ continue
602
+
603
+ if parts[1] != "trace": # Ensure it's a trace key
604
+ continue
605
+
606
+ chunk_idx_str = parts[0].split("_")[1] # "chunk_1" -> "1"
607
+ try:
608
+ chunk_idx = int(chunk_idx_str)
609
+ except ValueError:
610
+ continue
611
+
612
+ event_type = parts[2] # "entry" or "exit"
613
+
614
+ # Stage name is everything after trace::entry:: or trace::exit::
615
+ # Handles both simple (pdf_extractor) and nested (pdf_extractor::pdf_extraction::pdfium_0)
616
+ stage_name = "::".join(parts[3:]) # Join remaining parts
617
+
618
+ if event_type not in ("entry", "exit"):
619
+ continue
620
+
621
+ if stage_name not in stage_data:
622
+ stage_data[stage_name] = {}
623
+ if chunk_idx not in stage_data[stage_name]:
624
+ stage_data[stage_name][chunk_idx] = {}
625
+
626
+ stage_data[stage_name][chunk_idx][event_type] = value
627
+
628
+ # Compute aggregated metrics
629
+ parent_traces: Dict[str, Any] = {}
630
+
631
+ for stage_name, chunks in stage_data.items():
632
+ entries = []
633
+ exits = []
634
+ durations = []
635
+
636
+ for chunk_data in chunks.values():
637
+ entry = chunk_data.get("entry")
638
+ exit_time = chunk_data.get("exit")
639
+
640
+ # Both entry and exit must exist for valid pair
641
+ if entry is not None and exit_time is not None:
642
+ entries.append(entry)
643
+ exits.append(exit_time)
644
+ durations.append(exit_time - entry)
645
+
646
+ # Only add parent traces if we have valid data
647
+ if entries and exits:
648
+ parent_traces[f"trace::entry::{stage_name}"] = min(entries)
649
+ parent_traces[f"trace::exit::{stage_name}"] = max(exits)
650
+ parent_traces[f"trace::resident_time::{stage_name}"] = sum(durations)
651
+
652
+ return parent_traces
653
+
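# Worked sketch of the aggregation above with two chunks running the same stage: the parent
# keeps the earliest entry, the latest exit, and the summed per-chunk durations. Timestamps
# are arbitrary example values.
def _example_aggregate_parent_traces() -> None:
    chunk_traces = {
        "chunk_1::trace::entry::pdf_extractor": 100.0,
        "chunk_1::trace::exit::pdf_extractor": 110.0,
        "chunk_2::trace::entry::pdf_extractor": 105.0,
        "chunk_2::trace::exit::pdf_extractor": 120.0,
    }
    parent = _aggregate_parent_traces(chunk_traces)
    assert parent["trace::entry::pdf_extractor"] == 100.0
    assert parent["trace::exit::pdf_extractor"] == 120.0
    assert parent["trace::resident_time::pdf_extractor"] == 25.0  # (110-100) + (120-105)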
654
+
655
+ def _build_aggregated_response(
656
+ parent_job_id: str,
657
+ subjob_results: List[Optional[Dict[str, Any]]],
658
+ failed_subjobs: List[Dict[str, object]],
659
+ ordered_descriptors: List[Dict[str, Any]],
660
+ metadata: Dict[str, Any],
661
+ ) -> Dict[str, Any]:
662
+ """
663
+ Build the aggregated response from subjob results.
664
+
665
+ Parameters
666
+ ----------
667
+ parent_job_id : str
668
+ The parent job identifier
669
+ subjob_results : List[Optional[Dict[str, Any]]]
670
+ Results from all subjobs (None for failed ones)
671
+ failed_subjobs : List[Dict[str, object]]
672
+ List of failed subjob information
673
+ ordered_descriptors : List[Dict[str, Any]]
674
+ Subjob descriptors in original order
675
+ metadata : Dict[str, Any]
676
+ Parent job metadata
677
+
678
+ Returns
679
+ -------
680
+ Dict[str, Any]
681
+ Aggregated response with combined data and metadata
682
+ """
683
+ any_failed = len(failed_subjobs) > 0
684
+ subjob_ids = [desc.get("job_id") for desc in ordered_descriptors]
685
+
686
+ aggregated_result = {
687
+ "data": [],
688
+ "status": "failed" if any_failed else "success",
689
+ "description": (
690
+ "One or more subjobs failed to complete" if any_failed else "Aggregated result composed from subjob outputs"
691
+ ),
692
+ # Top-level trace/annotations for V1 compatibility
693
+ "trace": {},
694
+ "annotations": {},
695
+ "metadata": {
696
+ "parent_job_id": parent_job_id,
697
+ "total_pages": metadata.get("total_pages", len(subjob_ids)),
698
+ "pages_per_chunk": metadata.get("pages_per_chunk"),
699
+ "original_source_id": metadata.get("original_source_id"),
700
+ "original_source_name": metadata.get("original_source_name"),
701
+ "subjobs_failed": sum(1 for r in subjob_results if r is None),
702
+ "failed_subjobs": failed_subjobs,
703
+ "subjob_ids": subjob_ids,
704
+ "chunks": [],
705
+ "trace_segments": [],
706
+ "annotation_segments": [],
707
+ },
708
+ }
709
+
710
+ # Aggregate subjob data in page order
711
+ for page_num, (result, descriptor) in enumerate(zip(subjob_results, ordered_descriptors), 1):
712
+ if result is not None:
713
+ # Add page data to aggregated result
714
+ if "data" in result:
715
+ normalized_records = _normalize_chunk_records(result.get("data"), descriptor, metadata)
716
+ aggregated_result["data"].extend(normalized_records)
717
+ chunk_entry = dict(descriptor)
718
+ aggregated_result["metadata"]["chunks"].append(chunk_entry)
719
+
720
+ trace_data, annotation_data = _extract_ray_telemetry(result)
721
+ start_page = descriptor.get("start_page")
722
+ end_page = descriptor.get("end_page")
723
+
724
+ if trace_data:
725
+ # Add to trace_segments (detailed, per-chunk view)
726
+ aggregated_result["metadata"]["trace_segments"].append(
727
+ {
728
+ "job_id": descriptor.get("job_id"),
729
+ "chunk_index": descriptor.get("chunk_index"),
730
+ "start_page": start_page,
731
+ "end_page": end_page,
732
+ "trace": trace_data,
733
+ }
734
+ )
735
+ # Chunk traces stay in metadata.trace_segments only (not in top-level)
736
+
737
+ if annotation_data:
738
+ # Add to annotation_segments (detailed, per-chunk view)
739
+ aggregated_result["metadata"]["annotation_segments"].append(
740
+ {
741
+ "job_id": descriptor.get("job_id"),
742
+ "chunk_index": descriptor.get("chunk_index"),
743
+ "start_page": start_page,
744
+ "end_page": end_page,
745
+ "annotations": annotation_data,
746
+ }
747
+ )
748
+ # Merge into top-level annotations (annotations have unique UUIDs, safe to merge)
749
+ aggregated_result["annotations"].update(annotation_data)
750
+ else:
751
+ # Note failed page
752
+ logger.warning(f"Page {page_num} failed or missing")
753
+
754
+ # Compute parent-level trace aggregations from trace_segments
755
+ trace_segments = aggregated_result["metadata"]["trace_segments"]
756
+ if trace_segments:
757
+ # Build a temporary chunk trace dict for aggregation
758
+ temp_chunk_traces = {}
759
+ for segment in trace_segments:
760
+ chunk_idx = segment.get("chunk_index")
761
+ chunk_trace = segment.get("trace", {})
762
+ for trace_key, trace_value in chunk_trace.items():
763
+ prefixed_key = f"chunk_{chunk_idx}::{trace_key}"
764
+ temp_chunk_traces[prefixed_key] = trace_value
765
+
766
+ # Aggregate and set as top-level trace (only parent traces, no chunk traces)
767
+ parent_level_traces = _aggregate_parent_traces(temp_chunk_traces)
768
+ aggregated_result["trace"] = parent_level_traces
769
+
770
+ return aggregated_result
771
+
772
+
773
+ # ---------------------------------------------------------------------------
774
+ # Bursty submission helpers (fairness without long-lived in-flight tasks)
775
+ # ---------------------------------------------------------------------------
776
+
777
+
778
+ def _get_submit_burst_params() -> Tuple[int, int, int]:
779
+ """
780
+ Returns (burst_size, pause_ms, jitter_ms) from environment with sane defaults.
781
+ - V2_SUBMIT_BURST_SIZE (default: 16)
782
+ - V2_SUBMIT_BURST_PAUSE_MS (default: 50)
783
+ - V2_SUBMIT_BURST_JITTER_MS (default: 15)
784
+ """
785
+ burst_size = int(os.getenv("V2_SUBMIT_BURST_SIZE", "16"))
786
+ pause_ms = int(os.getenv("V2_SUBMIT_BURST_PAUSE_MS", "50"))
787
+ jitter_ms = int(os.getenv("V2_SUBMIT_BURST_JITTER_MS", "15"))
788
+
789
+ return max(1, burst_size), max(0, pause_ms), max(0, jitter_ms)
790
+
791
+
792
+ async def _submit_subjobs_in_bursts(
793
+ items: List[Tuple[str, MessageWrapper]],
794
+ ingest_service: "INGEST_SERVICE_T",
795
+ *,
796
+ burst_size: int,
797
+ pause_ms: int,
798
+ jitter_ms: int,
799
+ ) -> None:
800
+ """
801
+ Submit subjobs in sequential bursts and await each burst to completion.
802
+ This avoids keeping a large number of pending tasks in the REST handler
803
+ and allows other concurrent requests to interleave enqueue work between bursts.
804
+ """
805
+ for offset in range(0, len(items), burst_size):
806
+ burst = items[offset : offset + burst_size]
807
+ tasks = [ingest_service.submit_job(wrapper, subjob_id) for (subjob_id, wrapper) in burst]
808
+ # Propagate any errors from this burst
809
+ await asyncio.gather(*tasks)
810
+
811
+ # Pause with jitter to yield to other request handlers before next burst
812
+ if offset + burst_size < len(items):
813
+ delay_ms = pause_ms + (random.randint(0, jitter_ms) if jitter_ms > 0 else 0)
814
+ if delay_ms > 0:
815
+ await asyncio.sleep(delay_ms / 1000.0)
816
+
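# Illustrative pacing note for the helpers above, assuming the coded defaults
# (V2_SUBMIT_BURST_SIZE=16, V2_SUBMIT_BURST_PAUSE_MS=50, V2_SUBMIT_BURST_JITTER_MS=15):
# 100 subjobs would be enqueued as 7 bursts of up to 16 submissions, with a 50-65 ms pause
# between bursts so other request handlers can interleave their own enqueue work.
async def _example_submit_in_bursts(
    items: List[Tuple[str, MessageWrapper]], ingest_service: INGEST_SERVICE_T
) -> None:
    burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
    await _submit_subjobs_in_bursts(
        items, ingest_service, burst_size=burst_size, pause_ms=pause_ms, jitter_ms=jitter_ms
    )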
817
+
818
+ # POST /v2/submit_job
819
+ @router.post(
820
+ "/submit_job",
821
+ responses={
822
+ 200: {"description": "Jobs were successfully submitted"},
823
+ 500: {"description": "Error encountered while submitting jobs."},
824
+ 503: {"description": "Service unavailable."},
825
+ },
826
+ tags=["Ingestion"],
827
+ summary="submit jobs to the core nv ingestion service for processing with PDF splitting",
828
+ operation_id="submit_job_v2",
829
+ )
830
+ @traced_endpoint("http-submit-job-v2")
831
+ async def submit_job_v2(
832
+ request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
833
+ ):
834
+ span = trace.get_current_span()
835
+ source_id = None
836
+ document_type = None
837
+ try:
838
+ span.add_event("Submitting file for processing (V2)")
839
+
840
+ current_trace_id = span.get_span_context().trace_id
841
+ parent_job_id = trace_id_to_uuid(current_trace_id)
842
+
843
+ # Parse job spec
844
+ job_spec_dict = json.loads(job_spec.payload)
845
+
846
+ # Extract PDF configuration if provided by client
847
+ pdf_config = job_spec_dict.get("pdf_config", {})
848
+ client_split_page_count = pdf_config.get("split_page_count") if pdf_config else None
849
+
850
+ # Extract document type and payload from the proper structure
851
+ job_payload = job_spec_dict.get("job_payload", {})
852
+ document_types = job_payload.get("document_type", [])
853
+ payloads = job_payload.get("content", [])
854
+
855
+ # Resolve original source metadata up front for logging / subjob naming
856
+ source_ids = job_payload.get("source_id", ["unknown_source.pdf"])
857
+ source_names = job_payload.get("source_name", ["unknown_source.pdf"])
858
+ original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
859
+ original_source_name = source_names[0] if source_names else "unknown_source.pdf"
860
+
861
+ # Track page count for all PDFs (used for both splitting logic and metadata)
862
+ pdf_page_count_cache = None
863
+ submission_items: List[Tuple[str, MessageWrapper]] = []
864
+ subjob_ids: List[str] = []
865
+ subjob_descriptors: List[Dict[str, Any]] = []
866
+ parent_metadata: Dict[str, Any] = {}
868
+ try:
869
+ parent_uuid = uuid.UUID(parent_job_id)
870
+ except ValueError:
871
+ logger.warning(
872
+ "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
873
+ parent_job_id,
874
+ )
875
+ parent_uuid = uuid.uuid4()
876
+ # Check if this is a PDF that needs splitting
877
+ if document_types and payloads and document_types[0].lower() == "pdf":
878
+ # Decode the payload to check page count
879
+ pdf_content = base64.b64decode(payloads[0])
880
+ page_count = get_pdf_page_count(pdf_content)
881
+ pdf_page_count_cache = page_count # Cache for later use
882
+ qos_tier = get_qos_tier_for_page_count(page_count)
883
+ pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
884
+ document_type = DocumentTypeEnum.PDF
885
+
886
+ # Split if the document has more pages than our chunk size
887
+ if page_count > pages_per_chunk:
888
+ logger.warning(
889
+ "Splitting PDF %s into %s-page chunks (total pages: %s) -> (qos_tier: %s)",
890
+ original_source_name,
891
+ pages_per_chunk,
892
+ page_count,
893
+ qos_tier,
894
+ )
895
+ chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
896
+
909
+ for chunk in chunks:
910
+ start = chunk["start_page"]
911
+ end = chunk["end_page"]
912
+ page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
913
+ source_id = f"{original_source_id}#{page_suffix}"
914
+ source_name = f"{original_source_name}#{page_suffix}"
915
+ subjob_id, subjob_wrapper = _prepare_chunk_submission(
916
+ job_spec_dict,
917
+ chunk,
918
+ document_type=DocumentTypeEnum.PDF,
919
+ parent_uuid=parent_uuid,
920
+ parent_job_id=parent_job_id,
921
+ current_trace_id=current_trace_id,
922
+ source_id=source_id,
923
+ source_name=source_name,
924
+ )
925
+
926
+ # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
927
+ try:
928
+ sub_spec = json.loads(subjob_wrapper.payload)
929
+ routing_opts = sub_spec.get("routing_options") or {}
930
+ routing_opts["queue_hint"] = qos_tier
931
+ sub_spec["routing_options"] = routing_opts
932
+ subjob_wrapper = MessageWrapper(payload=json.dumps(sub_spec))
933
+ except Exception:
934
+ # Best-effort; if we cannot inject, fall back to default routing
935
+ pass
936
+
937
+ submission_items.append((subjob_id, subjob_wrapper))
938
+ subjob_ids.append(subjob_id)
939
+ subjob_descriptors.append(
940
+ {
941
+ "job_id": subjob_id,
942
+ "chunk_index": len(subjob_descriptors) + 1,
943
+ "start_page": chunk.get("start_page"),
944
+ "end_page": chunk.get("end_page"),
945
+ "page_count": chunk.get("page_count"),
946
+ }
947
+ )
948
+ parent_metadata.update(
949
+ {
950
+ "total_pages": page_count,
951
+ "pages_per_chunk": pages_per_chunk,
952
+ "original_source_id": original_source_id,
953
+ "original_source_name": original_source_name,
954
+ "document_type": document_types[0] if document_types else "pdf",
955
+ "subjob_order": subjob_ids,
956
+ }
957
+ )
958
+ elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
959
+ document_type = document_types[0]
960
+ upload_path = f"./{Path(original_source_id).name}"
961
+ # dump the payload to a file, just came from client
962
+ with fsspec.open(upload_path, "wb") as f:
963
+ f.write(base64.b64decode(payloads[0]))
964
+ dataloader = DataLoader(
965
+ path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
966
+ )
967
+ document_type = DocumentTypeEnum.MP3
968
+
970
+ for task in job_spec_dict["tasks"]:
971
+ if "task_properties" in task and "document_type" in task["task_properties"]:
972
+ task["task_properties"]["document_type"] = document_type
973
+ end = 0
974
+ for idx, (file_path, duration) in enumerate(dataloader.files_completed):
975
+ start = end
976
+ end = int(start + duration)
977
+ chunk = {
978
+ "bytes": file_path.encode("utf-8"),
979
+ "chunk_index": idx,
980
+ "start": start,
981
+ "end": end,
982
+ }
983
+
984
+ subjob_id, subjob_wrapper = _prepare_chunk_submission(
985
+ job_spec_dict,
986
+ chunk,
987
+ parent_uuid=parent_uuid,
988
+ parent_job_id=parent_job_id,
989
+ current_trace_id=current_trace_id,
990
+ source_id=file_path,
991
+ source_name=upload_path,
992
+ document_type=document_type,
993
+ )
994
+
995
+ submission_items.append((subjob_id, subjob_wrapper))
996
+ subjob_ids.append(subjob_id)
997
+ subjob_descriptors.append(
998
+ {
999
+ "job_id": subjob_id,
1000
+ "chunk_index": idx + 1,
1001
+ "start_page": chunk.get("start"),
1002
+ "end_page": chunk.get("end"),
1003
+ "page_count": chunk.get("page_count", 0),
1004
+ }
1005
+ )
1006
+ logger.debug(f"Removing uploaded file {upload_path}")
1007
+ os.remove(upload_path)
1008
+
1009
+ if submission_items:
1010
+ burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
1011
+ await _submit_subjobs_in_bursts(
1012
+ submission_items,
1013
+ ingest_service,
1014
+ burst_size=burst_size,
1015
+ pause_ms=pause_ms,
1016
+ jitter_ms=jitter_ms,
1017
+ )
1018
+
1019
+ parent_metadata.update(
1020
+ {
1021
+ "original_source_id": original_source_id,
1022
+ "original_source_name": original_source_name,
1023
+ "document_type": document_type,
1024
+ "subjob_order": subjob_ids,
1025
+ }
1026
+ )
1027
+ # Persist the parent -> subjob mapping so fetch_job_v2 can aggregate chunk results later
1028
+ await ingest_service.set_parent_job_mapping(
1029
+ parent_job_id,
1030
+ subjob_ids,
1031
+ parent_metadata,
1032
+ subjob_descriptors=subjob_descriptors,
1033
+ )
1034
+
1035
+ await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
1036
+
1037
+ span.add_event(f"Split into {len(subjob_ids)} subjobs")
1038
+ response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
1039
+ return parent_job_id
1040
+
1041
+ # For non-PDFs or cases where splitting is not required, submit as normal
1042
+ if "tracing_options" not in job_spec_dict:
1043
+ job_spec_dict["tracing_options"] = {"trace": True}
1044
+ job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
1045
+ # If this was a PDF and we computed page_count, route the single job using the same QoS tier
1046
+ try:
1047
+ if (
1048
+ document_types
1049
+ and document_types[0].lower() == "pdf"
1050
+ and "queue_hint" not in (job_spec_dict.get("routing_options") or {})
1051
+ ):
1052
+ job_spec_dict.setdefault("routing_options", {})["queue_hint"] = qos_tier
1053
+ except Exception:
1054
+ pass
1055
+ updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))
1056
+
1057
+ span.add_event("Submitting as single job (no split needed)")
1058
+
1059
+ # Submit the job to the pipeline task queue
1060
+ await ingest_service.submit_job(updated_job_spec, parent_job_id)
1061
+ await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
1062
+
1063
+ # If this was a PDF (even if not split), store page count metadata for tracking
1064
+ if pdf_page_count_cache is not None:
1065
+ try:
1066
+ # Use cached page count from earlier check to avoid re-decoding
1067
+ # Store minimal metadata for non-split PDFs (consistent with split PDFs)
1068
+ single_pdf_metadata: Dict[str, Any] = {
1069
+ "total_pages": pdf_page_count_cache,
1070
+ "pages_per_chunk": pdf_page_count_cache, # Single chunk = entire document
1071
+ "original_source_id": original_source_id,
1072
+ "original_source_name": original_source_name,
1073
+ "document_type": document_types[0],
1074
+ "subjob_order": [], # No subjobs for non-split PDFs
1075
+ }
1076
+
1077
+ # Store as parent job metadata with empty subjob list for consistency
1078
+ await ingest_service.set_parent_job_mapping(
1079
+ parent_job_id,
1080
+ [], # Empty subjob list
1081
+ single_pdf_metadata,
1082
+ subjob_descriptors=[],
1083
+ )
1084
+ logger.debug(
1085
+ f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
1086
+ )
1087
+ except Exception as metadata_err:
1088
+ # Don't fail the job if metadata storage fails
1089
+ logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")
1090
+
1091
+ response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
1092
+ return parent_job_id
1093
+
1094
+ except Exception as ex:
1095
+ logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
1096
+ raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")
1097
+
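# Hedged client-side sketch (not a documented client API): like V1, the V2 endpoint accepts
# a MessageWrapper body, i.e. a JSON object whose "payload" field is the job spec serialized
# as a string; "pdf_config.split_page_count" is the optional per-request chunk size consumed
# above. The host/port, the empty task list, and the requests dependency are assumptions.
def _example_submit_pdf_via_http(pdf_path: str = "report.pdf") -> str:
    import requests  # assumed available in the client environment

    with open(pdf_path, "rb") as pdf_file:
        content_b64 = base64.b64encode(pdf_file.read()).decode("utf-8")

    job_spec = {
        "job_payload": {
            "content": [content_b64],
            "source_id": [pdf_path],
            "source_name": [pdf_path],
            "document_type": ["pdf"],
        },
        "tasks": [],
        "pdf_config": {"split_page_count": 16},
    }
    resp = requests.post(
        "http://localhost:7670/v2/submit_job",
        json={"payload": json.dumps(job_spec)},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()  # parent job id to poll via fetch_job_v2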
1098
+
1099
+ # GET /v2/fetch_job
1100
+ @router.get(
1101
+ "/fetch_job/{job_id}",
1102
+ responses={
1103
+ 200: {"description": "Job result successfully retrieved."},
1104
+ 202: {"description": "Job is processing or result not yet available. Retry later."},
1105
+ 404: {"description": "Job ID not found or associated state has expired."},
1106
+ 410: {"description": "Job result existed but is now gone (expired or retrieved destructively/cached)."},
1107
+ 500: {"description": "Internal server error during fetch processing."},
1108
+ 503: {"description": "Job processing failed, or backend service temporarily unavailable preventing fetch."},
1109
+ },
1110
+ tags=["Ingestion"],
1111
+ summary="Fetch the result of a previously submitted job by its job_id (V2 with aggregation)",
1112
+ operation_id="fetch_job_v2",
1113
+ )
1114
+ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):
1115
+ """
1116
+ V2 fetch that handles parent job aggregation.
1117
+ """
1118
+ try:
1119
+ # Check if this is a parent job with subjobs
1120
+ subjob_info = await ingest_service.get_parent_job_info(job_id)
1121
+
1122
+ if subjob_info is None:
1123
+ # Not a parent job, fetch identical to V1
1124
+ current_state = await ingest_service.get_job_state(job_id)
1125
+ logger.debug(f"Initial state check for job {job_id}: {current_state}")
1126
+
1127
+ if current_state is None:
1128
+ logger.warning(f"Job {job_id} not found or expired. Returning 404.")
1129
+ raise HTTPException(status_code=404, detail="Job ID not found or state has expired.")
1130
+
1131
+ if current_state == STATE_FAILED:
1132
+ logger.error(f"Job {job_id} failed. Returning 503.")
1133
+ raise HTTPException(status_code=503, detail="Job processing failed.")
1134
+
1135
+ if current_state == STATE_RETRIEVED_DESTRUCTIVE:
1136
+ logger.warning(f"Job {job_id} was destructively retrieved. Returning 410.")
1137
+ raise HTTPException(status_code=410, detail="Job result is gone (destructive read).")
1138
+
1139
+ if current_state in INTERMEDIATE_STATES or current_state in {
1140
+ STATE_RETRIEVED_NON_DESTRUCTIVE,
1141
+ STATE_RETRIEVED_CACHED,
1142
+ }:
1143
+ logger.debug(f"Attempting fetch for job {job_id} in state {current_state}.")
1144
+
1145
+ try:
1146
+ job_response = await ingest_service.fetch_job(job_id)
1147
+ logger.debug(f"Fetched result for job {job_id}.")
1148
+
1149
+ try:
1150
+ current_fetch_mode = await ingest_service.get_fetch_mode()
1151
+ if current_fetch_mode == FetchMode.DESTRUCTIVE:
1152
+ target_state = STATE_RETRIEVED_DESTRUCTIVE
1153
+ elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
1154
+ target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
1155
+ elif current_fetch_mode == FetchMode.CACHE_BEFORE_DELETE:
1156
+ target_state = STATE_RETRIEVED_CACHED
1157
+ else:
1158
+ target_state = "RETRIEVED_UNKNOWN"
1159
+
1160
+ if target_state != "RETRIEVED_UNKNOWN":
1161
+ await ingest_service.set_job_state(job_id, target_state)
1162
+ logger.debug(f"Updated job {job_id} state to {target_state}.")
1163
+ except Exception as state_err:
1164
+ logger.error(f"Failed to set job state for {job_id} after fetch: {state_err}")
1165
+
1166
+ try:
1167
+ json_bytes = json.dumps(job_response).encode("utf-8")
1168
+ return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
1169
+ except TypeError as json_err:
1170
+ logger.exception(f"Serialization error for job {job_id}: {json_err}")
1171
+ raise HTTPException(
1172
+ status_code=500, detail="Internal server error: Failed to serialize result."
1173
+ )
1174
+
1175
+ except (TimeoutError, RedisError, ConnectionError) as fetch_err:
1176
+ # Handle timeout/error cases same as V1
1177
+ fetch_err_type = type(fetch_err).__name__
1178
+
1179
+ if isinstance(fetch_err, TimeoutError):
1180
+ logger.debug(
1181
+ f"Job {job_id} still processing (state: {current_state}), fetch attempt timed out cleanly."
1182
+ )
1183
+ else:
1184
+ logger.warning(
1185
+ f"Backend error ({fetch_err_type}) during fetch attempt for job {job_id} "
1186
+ f"(state: {current_state}): {fetch_err}"
1187
+ )
1188
+
1189
+ if current_state == STATE_RETRIEVED_NON_DESTRUCTIVE:
1190
+ if isinstance(fetch_err, TimeoutError):
1191
+ raise HTTPException(status_code=410, detail="Job result is gone (TTL expired).")
1192
+ else:
1193
+ raise HTTPException(
1194
+ status_code=503, detail="Backend service unavailable preventing access to job result."
1195
+ )
1196
+ elif current_state == STATE_RETRIEVED_CACHED:
1197
+ raise HTTPException(
1198
+ status_code=410, detail="Job result is gone (previously cached, fetch failed)."
1199
+ )
1200
+ elif current_state in INTERMEDIATE_STATES:
1201
+ if isinstance(fetch_err, TimeoutError):
1202
+ raise HTTPException(
1203
+ status_code=202, detail=f"Job is processing (state: {current_state}). Retry later."
1204
+ )
1205
+ else:
1206
+ raise HTTPException(
1207
+ status_code=503, detail="Backend service unavailable preventing fetch of job result."
1208
+ )
1209
+ else:
1210
+ logger.error(f"Unexpected state '{current_state}' for job {job_id} after fetch failure.")
1211
+ raise HTTPException(
1212
+ status_code=500, detail="Internal server error: Unexpected job state after fetch failure."
1213
+ )
1214
+ else:
1215
+ logger.error(f"Unknown job state '{current_state}' for job {job_id}.")
1216
+ raise HTTPException(
1217
+ status_code=500, detail=f"Internal server error: Unknown job state '{current_state}'."
1218
+ )
1219
+
1220
+ else:
1221
+ # This is a parent job - orchestrate aggregation using declarative helpers
1222
+ subjob_ids = subjob_info.get("subjob_ids", [])
1223
+ metadata = subjob_info.get("metadata", {})
1224
+
1225
+ logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")
1226
+
1227
+ # Special case: Non-split PDFs have metadata but no subjobs
1228
+ # Fetch the result directly and augment with page count metadata
1229
+ if len(subjob_ids) == 0:
1230
+ logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
1231
+ try:
1232
+ job_response = await ingest_service.fetch_job(job_id)
1233
+
1234
+ # Augment response with page count metadata
1235
+ if isinstance(job_response, dict):
1236
+ if "metadata" not in job_response:
1237
+ job_response["metadata"] = {}
1238
+ job_response["metadata"]["total_pages"] = metadata.get("total_pages")
1239
+ job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
1240
+ job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")
1241
+
1242
+ # Update job state after successful fetch
1243
+ await _update_job_state_after_fetch(job_id, ingest_service)
1244
+
1245
+ return _stream_json_response(job_response)
1246
+ except (TimeoutError, RedisError, ConnectionError):
1247
+ logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
1248
+ raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
1249
+ except Exception as e:
1250
+ logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
1251
+ raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
1252
+
1253
+ # Build ordered descriptors for subjobs
1254
+ stored_descriptors = subjob_info.get("subjob_descriptors") or []
1255
+ descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
1256
+
1257
+ ordered_descriptors: List[Dict[str, Any]] = []
1258
+ for idx, subjob_id in enumerate(subjob_ids, 1):
1259
+ descriptor = descriptor_lookup.get(subjob_id, {})
1260
+ ordered_descriptors.append(
1261
+ {
1262
+ "job_id": subjob_id,
1263
+ "chunk_index": descriptor.get("chunk_index", idx),
1264
+ "start_page": descriptor.get("start_page"),
1265
+ "end_page": descriptor.get("end_page"),
1266
+ "page_count": descriptor.get("page_count"),
1267
+ }
1268
+ )
1269
+
1270
+ # Calculate max parallel operations (stay within Redis connection pool)
1271
+ max_parallel_ops = max(
1272
+ 1, min(len(ordered_descriptors), getattr(ingest_service, "_concurrency_level", 10) // 2)
1273
+ )
1274
+
1275
+ # Check all subjob states (raises 202 if any still processing)
1276
+ subjob_states, failed_subjobs = await _check_all_subjob_states(
1277
+ ordered_descriptors, max_parallel_ops, ingest_service
1278
+ )
1279
+
1280
+ # Fetch all subjob results (raises 202 if any not ready)
1281
+ subjob_results = await _fetch_all_subjob_results(
1282
+ ordered_descriptors, subjob_states, failed_subjobs, max_parallel_ops, ingest_service
1283
+ )
1284
+
1285
+ # Build aggregated response from all subjob results
1286
+ aggregated_result = _build_aggregated_response(
1287
+ job_id, subjob_results, failed_subjobs, ordered_descriptors, metadata
1288
+ )
1289
+
1290
+ # Update parent job state after successful aggregation
1291
+ await _update_job_state_after_fetch(job_id, ingest_service)
1292
+
1293
+ # Return aggregated result as streaming response
1294
+ return _stream_json_response(aggregated_result)
1295
+
1296
+ except HTTPException:
1297
+ raise
1298
+ except Exception as e:
1299
+ logger.exception(f"Unexpected error in fetch_job_v2: {e}")
1300
+ raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
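

# Hedged client-side polling sketch matching the response codes declared on fetch_job_v2:
# 200 carries the (possibly aggregated) JSON result, 202 means the job or one of its subjobs
# is still processing, anything else is terminal. The host/port, retry cadence, and requests
# dependency are assumptions.
def _example_poll_fetch_job(job_id: str, interval_s: float = 2.0, max_attempts: int = 300) -> Dict[str, Any]:
    import requests  # assumed available in the client environment

    for _ in range(max_attempts):
        resp = requests.get(f"http://localhost:7670/v2/fetch_job/{job_id}", timeout=60)
        if resp.status_code == 200:
            return resp.json()
        if resp.status_code == 202:
            time.sleep(interval_s)
            continue
        resp.raise_for_status()
    raise TimeoutError(f"Job {job_id} did not complete after {max_attempts} polls")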