nv-ingest 2025.8.16.dev20250816__py3-none-any.whl → 2025.11.21.dev20251121__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +43 -14
- nv_ingest/framework/orchestration/process/execution.py +92 -94
- nv_ingest/framework/orchestration/process/lifecycle.py +98 -6
- nv_ingest/framework/orchestration/process/strategies.py +41 -5
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +9 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +2 -3
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +46 -9
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +5 -1
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +5 -1
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +4 -3
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/config/loaders.py +33 -2
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +514 -0
- nv_ingest/pipeline/default_pipeline_impl.py +111 -88
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/METADATA +4 -3
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/RECORD +38 -31
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/top_level.txt +0 -0
nv_ingest/api/v2/ingest.py (new file)
@@ -0,0 +1,1300 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# pylint: skip-file

import asyncio
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple
import base64
import json
import logging
import os
import time
import uuid
import random
from pathlib import Path
import fsspec

from fastapi import APIRouter, Request, Response
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from redis import RedisError

from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
from nv_ingest_api.util.service_clients.client_base import FetchMode
from nv_ingest_api.util.dataloader.dataloader import DataLoader
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum

# For PDF splitting
import pypdfium2 as pdfium

# Reuse V1 state constants and helper functions
from ..v1.ingest import (
    trace_id_to_uuid,
    INGEST_SERVICE_T,
    STATE_RETRIEVED_DESTRUCTIVE,
    STATE_RETRIEVED_NON_DESTRUCTIVE,
    STATE_RETRIEVED_CACHED,
    STATE_FAILED,
    STATE_SUBMITTED,
    INTERMEDIATE_STATES,
)
from .. import traced_endpoint
from opentelemetry import trace

logger = logging.getLogger("uvicorn")

router = APIRouter()

DEFAULT_PDF_SPLIT_PAGE_COUNT = 32

# Default QoS thresholds (pages). Tunable via environment variables:
# QOS_MAX_PAGES_MICRO, QOS_MAX_PAGES_SMALL, QOS_MAX_PAGES_MEDIUM
_QOS_DEFAULTS = {
    "micro": 8,
    "small": 64,
    "medium": 256,
}


def get_qos_tier_for_page_count(page_count: int) -> str:
    """
    Select QoS tier for a document based on its total page count.
    Tiers: 'micro', 'small', 'medium', 'large', 'default'
    Thresholds can be tuned via environment variables:
    - QOS_MAX_PAGES_MICRO (default: 8)
    - QOS_MAX_PAGES_SMALL (default: 64)
    - QOS_MAX_PAGES_MEDIUM (default: 256)
    Anything above MEDIUM is 'large'. Non-positive page_count returns 'default'.
    """
    try:
        micro_max = int(os.getenv("QOS_MAX_PAGES_MICRO", str(_QOS_DEFAULTS["micro"])))
        small_max = int(os.getenv("QOS_MAX_PAGES_SMALL", str(_QOS_DEFAULTS["small"])))
        medium_max = int(os.getenv("QOS_MAX_PAGES_MEDIUM", str(_QOS_DEFAULTS["medium"])))
    except ValueError:
        micro_max, small_max, medium_max = _QOS_DEFAULTS["micro"], _QOS_DEFAULTS["small"], _QOS_DEFAULTS["medium"]

    if page_count <= 0:
        return "default"
    if page_count <= micro_max:
        return "micro"
    if page_count <= small_max:
        return "small"
    if page_count <= medium_max:
        return "medium"
    return "large"


def get_pdf_split_page_count(client_override: Optional[int] = None) -> int:
    """
    Resolve the page chunk size for PDF splitting with client override support.

    Priority: client_override (clamped) > env var > default (32)
    Enforces boundaries: min=1, max=128
    """
    MIN_PAGES = 1
    MAX_PAGES = 128

    # Client override takes precedence if provided
    if client_override is not None:
        clamped = max(MIN_PAGES, min(client_override, MAX_PAGES))
        if clamped != client_override:
            logger.warning(
                "Client requested split_page_count=%s; clamped to %s (min=%s, max=%s)",
                client_override,
                clamped,
                MIN_PAGES,
                MAX_PAGES,
            )
        return clamped

    # Fall back to environment variable
    raw_value = os.environ.get("PDF_SPLIT_PAGE_COUNT")
    if raw_value is None:
        return DEFAULT_PDF_SPLIT_PAGE_COUNT

    try:
        parsed = int(raw_value)
    except ValueError:
        logger.warning(
            "Invalid PDF_SPLIT_PAGE_COUNT '%s'; falling back to default %s", raw_value, DEFAULT_PDF_SPLIT_PAGE_COUNT
        )
        return DEFAULT_PDF_SPLIT_PAGE_COUNT

    if parsed <= 0:
        logger.warning("PDF_SPLIT_PAGE_COUNT must be >= 1; received %s. Using 1.", parsed)
        return 1

    return parsed
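
A minimal usage sketch of the two helpers above (hypothetical values; assumes the default thresholds and no environment or client overrides):

# Illustrative only -- not part of the module above.
pages = 180
tier = get_qos_tier_for_page_count(pages)   # "medium" (65-256 pages under the default thresholds)
chunk_size = get_pdf_split_page_count()     # 32 unless PDF_SPLIT_PAGE_COUNT or a client override applies
needs_split = pages > chunk_size            # True -> the document goes through split_pdf_to_chunks() below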


def split_pdf_to_chunks(pdf_content: bytes, pages_per_chunk: int) -> List[Dict[str, Any]]:
    """
    Split a PDF into multi-page chunks using pypdfium2.

    Returns a list of dictionaries containing the chunk bytes and page range metadata.
    Note: this currently buffers each chunk in-memory; consider streaming in future upgrades.
    """

    chunks: List[Dict[str, Any]] = []

    if pages_per_chunk <= 0:
        pages_per_chunk = 1

    pdf = pdfium.PdfDocument(pdf_content)
    total_pages = len(pdf)

    try:
        for chunk_index, start_zero in enumerate(range(0, total_pages, pages_per_chunk)):
            end_zero = min(start_zero + pages_per_chunk, total_pages)
            page_indices = list(range(start_zero, end_zero))

            new_pdf = pdfium.PdfDocument.new()
            try:
                new_pdf.import_pages(pdf, page_indices)

                buffer = BytesIO()
                try:
                    new_pdf.save(buffer)
                    chunk_bytes = buffer.getvalue()
                finally:
                    buffer.close()
            finally:
                new_pdf.close()

            start_page = start_zero + 1
            end_page = end_zero
            chunk_info: Dict[str, Any] = {
                "bytes": chunk_bytes,
                "chunk_index": chunk_index,
                "start_page": start_page,
                "end_page": end_page,
                "page_count": end_page - start_page + 1,
            }
            chunks.append(chunk_info)

    finally:
        pdf.close()

    return chunks


def get_pdf_page_count(pdf_content: bytes) -> int:
    """Get the number of pages in a PDF using pypdfium2."""
    try:
        pdf = pdfium.PdfDocument(pdf_content)
        page_count = len(pdf)
        pdf.close()
        return page_count
    except Exception as e:
        logger.warning(f"Failed to get PDF page count: {e}")
        return 1  # Assume single page on error
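
As a worked example of the splitting above (a hypothetical 70-page PDF with the default 32-page chunk size):

# Illustrative only -- split_pdf_to_chunks(pdf_bytes, 32) on a 70-page document yields
# three chunks with 1-based, inclusive page ranges:
#   chunk_index 0 -> pages 1-32   (page_count=32)
#   chunk_index 1 -> pages 33-64  (page_count=32)
#   chunk_index 2 -> pages 65-70  (page_count=6)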


def _create_subjob_dict(
    job_id: str,
    job_payload: Dict[str, Any],
    job_spec_template: Dict[str, Any],
    current_trace_id: int,
    parent_job_id: str,
    start_key: Dict[str, Any],
) -> Dict[str, Any]:
    job_spec = {
        key: value
        for key, value in job_spec_template.items()
        if key not in {"job_payload", "job_id", "tracing_options"}
    }
    job_spec["job_payload"] = job_payload
    job_spec["job_id"] = job_id

    base_tracing_options = job_spec_template.get("tracing_options") or {}
    tracing_options = dict(base_tracing_options)
    tracing_options.setdefault("trace", True)
    tracing_options["trace_id"] = str(current_trace_id)
    tracing_options["ts_send"] = int(time.time() * 1000)
    tracing_options["parent_job_id"] = parent_job_id
    for key, value in start_key.items():
        tracing_options[key] = value

    job_spec["tracing_options"] = tracing_options
    return job_spec


def _create_payload_dict(
    job_spec_template: Dict[str, Any],
    content: str,
    source_id: str,
    source_name: str,
    document_type: str,
) -> Dict[str, Any]:
    subjob_payload_template = job_spec_template.get("job_payload", {})
    subjob_payload = {
        key: value
        for key, value in subjob_payload_template.items()
        if key not in {"content", "source_id", "source_name"}
    }

    subjob_payload["content"] = [content]

    subjob_payload["source_id"] = [source_id]
    subjob_payload["source_name"] = [source_name]
    subjob_payload["document_type"] = [document_type]
    return subjob_payload


def _prepare_chunk_submission(
    job_spec_template: Dict[str, Any],
    chunk: Dict[str, Any],
    *,
    parent_uuid: uuid.UUID,
    parent_job_id: str,
    current_trace_id: int,
    source_id: str,
    source_name: str,
    document_type: str,
) -> Tuple[str, MessageWrapper]:
    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""

    chunk_number = chunk["chunk_index"] + 1

    subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
    subjob_id = str(subjob_uuid)

    subjob_payload_template = job_spec_template.get("job_payload", {})
    chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
    subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
    start = chunk["start_page"] if "start_page" in chunk else chunk["start"]

    subjob_spec = _create_subjob_dict(
        subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
    )

    return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))


# ============================================================================
# Helper Functions for Fetch Job Aggregation
# ============================================================================


async def _gather_in_batches(coroutines: List, batch_size: int, return_exceptions: bool = False) -> List[Any]:
    """
    Execute coroutines in batches to respect concurrency limits.

    Parameters
    ----------
    coroutines : List
        List of coroutines to execute
    batch_size : int
        Maximum number of coroutines to execute concurrently
    return_exceptions : bool
        Whether to return exceptions as results (passed to asyncio.gather)

    Returns
    -------
    List[Any]
        Results from all coroutines in original order
    """
    results: List[Any] = []
    for offset in range(0, len(coroutines), batch_size):
        batch = coroutines[offset : offset + batch_size]
        batch_results = await asyncio.gather(*batch, return_exceptions=return_exceptions)
        results.extend(batch_results)
    return results


async def _update_job_state_after_fetch(job_id: str, ingest_service: INGEST_SERVICE_T) -> None:
    """
    Update job state after successful fetch based on configured fetch mode.

    Parameters
    ----------
    job_id : str
        The job identifier
    ingest_service : IngestServiceMeta
        The ingest service instance
    """
    try:
        current_fetch_mode = await ingest_service.get_fetch_mode()
        if current_fetch_mode == FetchMode.DESTRUCTIVE:
            target_state = STATE_RETRIEVED_DESTRUCTIVE
        elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
            target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
        else:
            target_state = STATE_RETRIEVED_CACHED

        await ingest_service.set_job_state(job_id, target_state)
        logger.debug(f"Updated job {job_id} state to {target_state}")
    except Exception as e:
        logger.error(f"Failed to update job state for {job_id}: {e}")


def _stream_json_response(data: Dict[str, Any]) -> StreamingResponse:
    """
    Create a StreamingResponse for JSON data.

    Parameters
    ----------
    data : Dict[str, Any]
        The data to serialize and stream

    Returns
    -------
    StreamingResponse
        FastAPI streaming response with JSON content
    """
    json_bytes = json.dumps(data).encode("utf-8")
    return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)


async def _check_all_subjob_states(
    ordered_descriptors: List[Dict[str, Any]], max_parallel_ops: int, ingest_service: INGEST_SERVICE_T
) -> Tuple[List[Optional[str]], List[Dict[str, object]]]:
    """
    Check the state of all subjobs in parallel batches.

    Parameters
    ----------
    ordered_descriptors : List[Dict[str, Any]]
        List of subjob descriptors with job_id and chunk_index
    max_parallel_ops : int
        Maximum number of parallel operations
    ingest_service : IngestServiceMeta
        The ingest service instance

    Returns
    -------
    Tuple[List[Optional[str]], List[Dict[str, object]]]
        Tuple of (subjob_states, failed_subjobs_list)

    Raises
    ------
    HTTPException
        If any subjob is still processing (202)
    """
    # Gather all subjob states in parallel batches
    state_coroutines = [ingest_service.get_job_state(descriptor.get("job_id")) for descriptor in ordered_descriptors]
    subjob_states = await _gather_in_batches(state_coroutines, max_parallel_ops)

    # Check for failures and pending work
    failed_subjobs: List[Dict[str, object]] = []

    for page_index, (descriptor, subjob_state) in enumerate(zip(ordered_descriptors, subjob_states), start=1):
        subjob_id = descriptor.get("job_id")

        if subjob_state == STATE_FAILED:
            logger.warning(f"Subjob {subjob_id} failed")
            failed_subjobs.append({"subjob_id": subjob_id, "chunk_index": page_index})
        elif subjob_state in INTERMEDIATE_STATES:
            raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")

    return subjob_states, failed_subjobs


async def _fetch_all_subjob_results(
    ordered_descriptors: List[Dict[str, Any]],
    subjob_states: List[Optional[str]],
    failed_subjobs: List[Dict[str, object]],
    max_parallel_ops: int,
    ingest_service: INGEST_SERVICE_T,
) -> List[Optional[Dict[str, Any]]]:
    """
    Fetch results for all completed subjobs in parallel batches.

    Parameters
    ----------
    ordered_descriptors : List[Dict[str, Any]]
        List of subjob descriptors
    subjob_states : List[Optional[str]]
        States of all subjobs (from _check_all_subjob_states)
    failed_subjobs : List[Dict[str, object]]
        List to append failed fetch attempts to (modified in place)
    max_parallel_ops : int
        Maximum number of parallel operations
    ingest_service : IngestServiceMeta
        The ingest service instance

    Returns
    -------
    List[Optional[Dict[str, Any]]]
        Results for each subjob (None for failed ones)

    Raises
    ------
    HTTPException
        If any subjob is not ready yet (202)
    """
    # Initialize results array with None placeholders
    subjob_results: List[Optional[Dict[str, Any]]] = [None] * len(ordered_descriptors)

    # Build list of fetch tasks (only for non-failed subjobs)
    fetch_coroutines = []
    fetch_targets: List[Dict[str, Any]] = []

    for list_index, (page_index, descriptor, subjob_state) in enumerate(
        zip(range(1, len(ordered_descriptors) + 1), ordered_descriptors, subjob_states)
    ):
        subjob_id = descriptor.get("job_id")

        # Skip failed subjobs (already recorded in failed_subjobs)
        if subjob_state == STATE_FAILED:
            continue

        # Skip intermediate states (should have been caught earlier, but defensive)
        if subjob_state in INTERMEDIATE_STATES:
            continue

        # Queue this subjob for fetching
        fetch_coroutines.append(ingest_service.fetch_job(subjob_id))
        fetch_targets.append(
            {
                "list_index": list_index,
                "page_index": page_index,
                "subjob_id": subjob_id,
            }
        )

    # Fetch all results in parallel batches
    if fetch_coroutines:
        fetch_results = await _gather_in_batches(fetch_coroutines, max_parallel_ops, return_exceptions=True)

        # Process results and handle errors
        for target, fetch_result in zip(fetch_targets, fetch_results):
            subjob_id = target["subjob_id"]
            page_index = target["page_index"]
            list_index = target["list_index"]

            if isinstance(fetch_result, TimeoutError):
                logger.debug(f"Subjob {subjob_id} not ready yet; deferring aggregation")
                raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")

            if isinstance(fetch_result, Exception):
                logger.error(f"Failed to fetch subjob {subjob_id}: {fetch_result}")
                failed_subjobs.append(
                    {
                        "subjob_id": subjob_id,
                        "chunk_index": page_index,
                        "error": str(fetch_result),
                    }
                )
                continue

            subjob_results[list_index] = fetch_result

    return subjob_results


def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
    """Return the trace and annotation dictionaries emitted by the sink stage."""

    if not isinstance(result, dict):
        return None, None

    trace = result.get("trace")
    annotations = result.get("annotations")

    trace_dict = trace if isinstance(trace, dict) else None
    annotations_dict = annotations if isinstance(annotations, dict) else None

    return trace_dict, annotations_dict


def _normalize_chunk_records(
    records: Optional[List[Any]],
    descriptor: Dict[str, Any],
    parent_metadata: Dict[str, Any],
) -> List[Any]:
    """Re-map chunk-local metadata to document-level context for aggregation."""

    if not isinstance(records, list):
        return []

    total_pages = parent_metadata.get("total_pages")
    original_source_id = parent_metadata.get("original_source_id")
    original_source_name = parent_metadata.get("original_source_name")

    start_page = descriptor.get("start_page")
    page_offset = start_page - 1 if isinstance(start_page, int) and start_page > 0 else 0

    normalized_entries: List[Any] = []

    for entry in records:
        if not isinstance(entry, dict):
            normalized_entries.append(entry)
            continue

        normalized_entry = entry.copy()
        original_metadata = entry.get("metadata")

        if isinstance(original_metadata, dict):
            normalized_metadata = original_metadata.copy()
            normalized_entry["metadata"] = normalized_metadata

            original_source_meta = original_metadata.get("source_metadata")
            if isinstance(original_source_meta, dict):
                normalized_source_meta = original_source_meta.copy()
                normalized_metadata["source_metadata"] = normalized_source_meta

                if original_source_id:
                    normalized_source_meta["source_id"] = original_source_id
                if original_source_name:
                    normalized_source_meta["source_name"] = original_source_name

            original_content_meta = original_metadata.get("content_metadata")
            if isinstance(original_content_meta, dict):
                normalized_content_meta = original_content_meta.copy()
                normalized_metadata["content_metadata"] = normalized_content_meta

                page_number = normalized_content_meta.get("page_number")
                if isinstance(page_number, int) and page_number >= 0:
                    normalized_content_meta["page_number"] = page_number + page_offset

                if isinstance(total_pages, int) and isinstance(normalized_content_meta.get("page_count"), int):
                    # Ensure optional per-record page count reflects the full document
                    normalized_content_meta["page_count"] = total_pages

                original_hierarchy = original_content_meta.get("hierarchy")
                if isinstance(original_hierarchy, dict):
                    normalized_hierarchy = original_hierarchy.copy()
                    normalized_content_meta["hierarchy"] = normalized_hierarchy

                    hierarchy_page = normalized_hierarchy.get("page")
                    if isinstance(hierarchy_page, int) and hierarchy_page >= 0:
                        normalized_hierarchy["page"] = hierarchy_page + page_offset
                    if isinstance(total_pages, int):
                        normalized_hierarchy["page_count"] = total_pages

        normalized_entries.append(normalized_entry)

    return normalized_entries


def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
    """
    Aggregate chunk-level traces into parent-level metrics.

    For each stage found in chunk traces:
    - trace::entry::<stage> = min(all chunk entries) - earliest start
    - trace::exit::<stage> = max(all chunk exits) - latest finish
    - trace::resident_time::<stage> = sum(chunk durations) - total compute

    Parameters
    ----------
    chunk_traces : Dict[str, Any]
        Trace dict with chunk-prefixed keys (chunk_N::trace::entry::stage_name)

    Returns
    -------
    Dict[str, Any]
        Parent-level aggregated traces (trace::entry::stage_name, etc.)
    """
    # Group by stage: {stage_name: {chunk_idx: {entry: float, exit: float}}}
    stage_data: Dict[str, Dict[int, Dict[str, Any]]] = {}

    for key, value in chunk_traces.items():
        if not key.startswith("chunk_"):
            continue

        parts = key.split("::")
        if len(parts) < 4:  # Minimum: chunk_N::trace::entry/exit::stage_name
            continue

        if parts[1] != "trace":  # Ensure it's a trace key
            continue

        chunk_idx_str = parts[0].split("_")[1]  # "chunk_1" -> "1"
        try:
            chunk_idx = int(chunk_idx_str)
        except ValueError:
            continue

        event_type = parts[2]  # "entry" or "exit"

        # Stage name is everything after trace::entry:: or trace::exit::
        # Handles both simple (pdf_extractor) and nested (pdf_extractor::pdf_extraction::pdfium_0)
        stage_name = "::".join(parts[3:])  # Join remaining parts

        if event_type not in ("entry", "exit"):
            continue

        if stage_name not in stage_data:
            stage_data[stage_name] = {}
        if chunk_idx not in stage_data[stage_name]:
            stage_data[stage_name][chunk_idx] = {}

        stage_data[stage_name][chunk_idx][event_type] = value

    # Compute aggregated metrics
    parent_traces: Dict[str, Any] = {}

    for stage_name, chunks in stage_data.items():
        entries = []
        exits = []
        durations = []

        for chunk_data in chunks.values():
            entry = chunk_data.get("entry")
            exit_time = chunk_data.get("exit")

            # Both entry and exit must exist for valid pair
            if entry is not None and exit_time is not None:
                entries.append(entry)
                exits.append(exit_time)
                durations.append(exit_time - entry)

        # Only add parent traces if we have valid data
        if entries and exits:
            parent_traces[f"trace::entry::{stage_name}"] = min(entries)
            parent_traces[f"trace::exit::{stage_name}"] = max(exits)
            parent_traces[f"trace::resident_time::{stage_name}"] = sum(durations)

    return parent_traces
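
As a worked example of the aggregation rule above (hypothetical stage name and timestamps):

# Illustrative only -- chunk-prefixed input and the resulting parent-level metrics.
chunk_traces = {
    "chunk_1::trace::entry::pdf_extractor": 100.0,
    "chunk_1::trace::exit::pdf_extractor": 105.0,
    "chunk_2::trace::entry::pdf_extractor": 102.0,
    "chunk_2::trace::exit::pdf_extractor": 110.0,
}
_aggregate_parent_traces(chunk_traces)
# {"trace::entry::pdf_extractor": 100.0,            # earliest entry across chunks
#  "trace::exit::pdf_extractor": 110.0,             # latest exit across chunks
#  "trace::resident_time::pdf_extractor": 13.0}     # (105-100) + (110-102)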


def _build_aggregated_response(
    parent_job_id: str,
    subjob_results: List[Optional[Dict[str, Any]]],
    failed_subjobs: List[Dict[str, object]],
    ordered_descriptors: List[Dict[str, Any]],
    metadata: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Build the aggregated response from subjob results.

    Parameters
    ----------
    parent_job_id : str
        The parent job identifier
    subjob_results : List[Optional[Dict[str, Any]]]
        Results from all subjobs (None for failed ones)
    failed_subjobs : List[Dict[str, object]]
        List of failed subjob information
    ordered_descriptors : List[Dict[str, Any]]
        Subjob descriptors in original order
    metadata : Dict[str, Any]
        Parent job metadata

    Returns
    -------
    Dict[str, Any]
        Aggregated response with combined data and metadata
    """
    any_failed = len(failed_subjobs) > 0
    subjob_ids = [desc.get("job_id") for desc in ordered_descriptors]

    aggregated_result = {
        "data": [],
        "status": "failed" if any_failed else "success",
        "description": (
            "One or more subjobs failed to complete" if any_failed else "Aggregated result composed from subjob outputs"
        ),
        # Top-level trace/annotations for V1 compatibility
        "trace": {},
        "annotations": {},
        "metadata": {
            "parent_job_id": parent_job_id,
            "total_pages": metadata.get("total_pages", len(subjob_ids)),
            "pages_per_chunk": metadata.get("pages_per_chunk"),
            "original_source_id": metadata.get("original_source_id"),
            "original_source_name": metadata.get("original_source_name"),
            "subjobs_failed": sum(1 for r in subjob_results if r is None),
            "failed_subjobs": failed_subjobs,
            "subjob_ids": subjob_ids,
            "chunks": [],
            "trace_segments": [],
            "annotation_segments": [],
        },
    }

    # Aggregate subjob data in page order
    for page_num, (result, descriptor) in enumerate(zip(subjob_results, ordered_descriptors), 1):
        if result is not None:
            # Add page data to aggregated result
            if "data" in result:
                normalized_records = _normalize_chunk_records(result.get("data"), descriptor, metadata)
                aggregated_result["data"].extend(normalized_records)
            chunk_entry = dict(descriptor)
            aggregated_result["metadata"]["chunks"].append(chunk_entry)

            trace_data, annotation_data = _extract_ray_telemetry(result)
            start_page = descriptor.get("start_page")
            end_page = descriptor.get("end_page")

            if trace_data:
                # Add to trace_segments (detailed, per-chunk view)
                aggregated_result["metadata"]["trace_segments"].append(
                    {
                        "job_id": descriptor.get("job_id"),
                        "chunk_index": descriptor.get("chunk_index"),
                        "start_page": start_page,
                        "end_page": end_page,
                        "trace": trace_data,
                    }
                )
                # Chunk traces stay in metadata.trace_segments only (not in top-level)

            if annotation_data:
                # Add to annotation_segments (detailed, per-chunk view)
                aggregated_result["metadata"]["annotation_segments"].append(
                    {
                        "job_id": descriptor.get("job_id"),
                        "chunk_index": descriptor.get("chunk_index"),
                        "start_page": start_page,
                        "end_page": end_page,
                        "annotations": annotation_data,
                    }
                )
                # Merge into top-level annotations (annotations have unique UUIDs, safe to merge)
                aggregated_result["annotations"].update(annotation_data)
        else:
            # Note failed page
            logger.warning(f"Page {page_num} failed or missing")

    # Compute parent-level trace aggregations from trace_segments
    trace_segments = aggregated_result["metadata"]["trace_segments"]
    if trace_segments:
        # Build a temporary chunk trace dict for aggregation
        temp_chunk_traces = {}
        for segment in trace_segments:
            chunk_idx = segment.get("chunk_index")
            chunk_trace = segment.get("trace", {})
            for trace_key, trace_value in chunk_trace.items():
                prefixed_key = f"chunk_{chunk_idx}::{trace_key}"
                temp_chunk_traces[prefixed_key] = trace_value

        # Aggregate and set as top-level trace (only parent traces, no chunk traces)
        parent_level_traces = _aggregate_parent_traces(temp_chunk_traces)
        aggregated_result["trace"] = parent_level_traces

    return aggregated_result


# ---------------------------------------------------------------------------
# Bursty submission helpers (fairness without long-lived in-flight tasks)
# ---------------------------------------------------------------------------


def _get_submit_burst_params() -> Tuple[int, int, int]:
    """
    Returns (burst_size, pause_ms, jitter_ms) from environment with sane defaults.
    - V2_SUBMIT_BURST_SIZE (default: 16)
    - V2_SUBMIT_BURST_PAUSE_MS (default: 50)
    - V2_SUBMIT_BURST_JITTER_MS (default: 15)
    """
    burst_size = int(os.getenv("V2_SUBMIT_BURST_SIZE", "16"))
    pause_ms = int(os.getenv("V2_SUBMIT_BURST_PAUSE_MS", "50"))
    jitter_ms = int(os.getenv("V2_SUBMIT_BURST_JITTER_MS", "15"))

    return max(1, burst_size), max(0, pause_ms), max(0, jitter_ms)


async def _submit_subjobs_in_bursts(
    items: List[Tuple[str, MessageWrapper]],
    ingest_service: "INGEST_SERVICE_T",
    *,
    burst_size: int,
    pause_ms: int,
    jitter_ms: int,
) -> None:
    """
    Submit subjobs in sequential bursts and await each burst to completion.
    This avoids keeping a large number of pending tasks in the REST handler
    and allows other concurrent requests to interleave enqueue work between bursts.
    """
    for offset in range(0, len(items), burst_size):
        burst = items[offset : offset + burst_size]
        tasks = [ingest_service.submit_job(wrapper, subjob_id) for (subjob_id, wrapper) in burst]
        # Propagate any errors from this burst
        await asyncio.gather(*tasks)

        # Pause with jitter to yield to other request handlers before next burst
        if offset + burst_size < len(items):
            delay_ms = pause_ms + (random.randint(0, jitter_ms) if jitter_ms > 0 else 0)
            if delay_ms > 0:
                await asyncio.sleep(delay_ms / 1000.0)


# POST /v2/submit_job
@router.post(
    "/submit_job",
    responses={
        200: {"description": "Jobs were successfully submitted"},
        500: {"description": "Error encountered while submitting jobs."},
        503: {"description": "Service unavailable."},
    },
    tags=["Ingestion"],
    summary="submit jobs to the core nv ingestion service for processing with PDF splitting",
    operation_id="submit_job_v2",
)
@traced_endpoint("http-submit-job-v2")
async def submit_job_v2(
    request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
):
    span = trace.get_current_span()
    source_id = None
    document_type = None
    try:
        span.add_event("Submitting file for processing (V2)")

        current_trace_id = span.get_span_context().trace_id
        parent_job_id = trace_id_to_uuid(current_trace_id)

        # Parse job spec
        job_spec_dict = json.loads(job_spec.payload)

        # Extract PDF configuration if provided by client
        pdf_config = job_spec_dict.get("pdf_config", {})
        client_split_page_count = pdf_config.get("split_page_count") if pdf_config else None

        # Extract document type and payload from the proper structure
        job_payload = job_spec_dict.get("job_payload", {})
        document_types = job_payload.get("document_type", [])
        payloads = job_payload.get("content", [])

        # Resolve original source metadata up front for logging / subjob naming
        source_ids = job_payload.get("source_id", ["unknown_source.pdf"])
        source_names = job_payload.get("source_name", ["unknown_source.pdf"])
        original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
        original_source_name = source_names[0] if source_names else "unknown_source.pdf"

        # Track page count for all PDFs (used for both splitting logic and metadata)
        pdf_page_count_cache = None
        submission_items: List[Tuple[str, MessageWrapper]] = []
        subjob_ids: List[str] = []
        subjob_descriptors: List[Dict[str, Any]] = []
        parent_metadata: Dict[str, Any] = {}
        submission_items: List[Tuple[str, MessageWrapper]] = []
        try:
            parent_uuid = uuid.UUID(parent_job_id)
        except ValueError:
            logger.warning(
                "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
                parent_job_id,
            )
            parent_uuid = uuid.uuid4()
        # Check if this is a PDF that needs splitting
        if document_types and payloads and document_types[0].lower() == "pdf":
            # Decode the payload to check page count
            pdf_content = base64.b64decode(payloads[0])
            page_count = get_pdf_page_count(pdf_content)
            pdf_page_count_cache = page_count  # Cache for later use
            qos_tier = get_qos_tier_for_page_count(page_count)
            pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
            document_type = DocumentTypeEnum.PDF

            # Split if the document has more pages than our chunk size
            if page_count > pages_per_chunk:
                logger.warning(
                    "Splitting PDF %s into %s-page chunks (total pages: %s) -> (qos_tier: %s)",
                    original_source_name,
                    pages_per_chunk,
                    page_count,
                    qos_tier,
                )
                chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)

                subjob_ids: List[str] = []
                subjob_descriptors: List[Dict[str, Any]] = []
                submission_items: List[Tuple[str, MessageWrapper]] = []
                try:
                    parent_uuid = uuid.UUID(parent_job_id)
                except ValueError:
                    logger.warning(
                        "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
                        parent_job_id,
                    )
                    parent_uuid = uuid.uuid4()

                for chunk in chunks:
                    start = chunk["start_page"]
                    end = chunk["end_page"]
                    page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
                    source_id = f"{original_source_id}#{page_suffix}"
                    source_name = f"{original_source_name}#{page_suffix}"
                    subjob_id, subjob_wrapper = _prepare_chunk_submission(
                        job_spec_dict,
                        chunk,
                        document_type=DocumentTypeEnum.PDF,
                        parent_uuid=parent_uuid,
                        parent_job_id=parent_job_id,
                        current_trace_id=current_trace_id,
                        source_id=source_id,
                        source_name=source_name,
                    )

                    # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
                    try:
                        sub_spec = json.loads(subjob_wrapper.payload)
                        routing_opts = sub_spec.get("routing_options") or {}
                        routing_opts["queue_hint"] = qos_tier
                        sub_spec["routing_options"] = routing_opts
                        subjob_wrapper = MessageWrapper(payload=json.dumps(sub_spec))
                    except Exception:
                        # Best-effort; if we cannot inject, fall back to default routing
                        pass

                    submission_items.append((subjob_id, subjob_wrapper))
                    subjob_ids.append(subjob_id)
                    subjob_descriptors.append(
                        {
                            "job_id": subjob_id,
                            "chunk_index": len(subjob_descriptors) + 1,
                            "start_page": chunk.get("start_page"),
                            "end_page": chunk.get("end_page"),
                            "page_count": chunk.get("page_count"),
                        }
                    )
                parent_metadata.update(
                    {
                        "total_pages": page_count,
                        "pages_per_chunk": pages_per_chunk,
                        "original_source_id": original_source_id,
                        "original_source_name": original_source_name,
                        "document_type": document_types[0] if document_types else "pdf",
                        "subjob_order": subjob_ids,
                    }
                )
        elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
            document_type = document_types[0]
            upload_path = f"./{Path(original_source_id).name}"
            # dump the payload to a file, just came from client
            with fsspec.open(upload_path, "wb") as f:
                f.write(base64.b64decode(payloads[0]))
            dataloader = DataLoader(
                path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
            )
            document_type = DocumentTypeEnum.MP3

            parent_uuid = uuid.UUID(parent_job_id)
            for task in job_spec_dict["tasks"]:
                if "task_properties" in task and "document_type" in task["task_properties"]:
                    task["task_properties"]["document_type"] = document_type
            end = 0
            for idx, (file_path, duration) in enumerate(dataloader.files_completed):
                start = end
                end = int(start + duration)
                chunk = {
                    "bytes": file_path.encode("utf-8"),
                    "chunk_index": idx,
                    "start": start,
                    "end": end,
                }

                subjob_id, subjob_wrapper = _prepare_chunk_submission(
                    job_spec_dict,
                    chunk,
                    parent_uuid=parent_uuid,
                    parent_job_id=parent_job_id,
                    current_trace_id=current_trace_id,
                    source_id=file_path,
                    source_name=upload_path,
                    document_type=document_type,
                )

                submission_items.append((subjob_id, subjob_wrapper))
                subjob_ids.append(subjob_id)
                subjob_descriptors.append(
                    {
                        "job_id": subjob_id,
                        "chunk_index": idx + 1,
                        "start_page": chunk.get("start"),
                        "end_page": chunk.get("end"),
                        "page_count": chunk.get("page_count", 0),
                    }
                )
            logger.error(f"Removing uploaded file {upload_path}")
            os.remove(upload_path)

        if submission_items:
            burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
            await _submit_subjobs_in_bursts(
                submission_items,
                ingest_service,
                burst_size=burst_size,
                pause_ms=pause_ms,
                jitter_ms=jitter_ms,
            )

            parent_metadata.update(
                {
                    "original_source_id": original_source_id,
                    "original_source_name": original_source_name,
                    "document_type": document_type,
                    "subjob_order": subjob_ids,
                }
            )
            # raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
            await ingest_service.set_parent_job_mapping(
                parent_job_id,
                subjob_ids,
                parent_metadata,
                subjob_descriptors=subjob_descriptors,
            )

            await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)

            span.add_event(f"Split into {len(subjob_ids)} subjobs")
            response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
            return parent_job_id

        # For non-PDFs or cases where splitting is not required, submit as normal
        if "tracing_options" not in job_spec_dict:
            job_spec_dict["tracing_options"] = {"trace": True}
        job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
        # If this was a PDF and we computed page_count, route the single job using the same QoS tier
        try:
            if (
                document_types
                and document_types[0].lower() == "pdf"
                and "queue_hint" not in (job_spec_dict.get("routing_options") or {})
            ):
                job_spec_dict.setdefault("routing_options", {})["queue_hint"] = qos_tier
        except Exception:
            pass
        updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))

        span.add_event("Submitting as single job (no split needed)")

        # Submit the job to the pipeline task queue
        await ingest_service.submit_job(updated_job_spec, parent_job_id)
        await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)

        # If this was a PDF (even if not split), store page count metadata for tracking
        if pdf_page_count_cache is not None:
            try:
                # Use cached page count from earlier check to avoid re-decoding
                # Store minimal metadata for non-split PDFs (consistent with split PDFs)
                single_pdf_metadata: Dict[str, Any] = {
                    "total_pages": pdf_page_count_cache,
                    "pages_per_chunk": pdf_page_count_cache,  # Single chunk = entire document
                    "original_source_id": original_source_id,
                    "original_source_name": original_source_name,
                    "document_type": document_types[0],
                    "subjob_order": [],  # No subjobs for non-split PDFs
                }

                # Store as parent job metadata with empty subjob list for consistency
                await ingest_service.set_parent_job_mapping(
                    parent_job_id,
                    [],  # Empty subjob list
                    single_pdf_metadata,
                    subjob_descriptors=[],
                )
                logger.debug(
                    f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
                )
            except Exception as metadata_err:
                # Don't fail the job if metadata storage fails
                logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")

        response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
        return parent_job_id

    except Exception as ex:
        logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")


# GET /v2/fetch_job
@router.get(
    "/fetch_job/{job_id}",
    responses={
        200: {"description": "Job result successfully retrieved."},
        202: {"description": "Job is processing or result not yet available. Retry later."},
        404: {"description": "Job ID not found or associated state has expired."},
        410: {"description": "Job result existed but is now gone (expired or retrieved destructively/cached)."},
        500: {"description": "Internal server error during fetch processing."},
        503: {"description": "Job processing failed, or backend service temporarily unavailable preventing fetch."},
    },
    tags=["Ingestion"],
    summary="Fetch the result of a previously submitted job by its job_id (V2 with aggregation)",
    operation_id="fetch_job_v2",
)
async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):
    """
    V2 fetch that handles parent job aggregation.
    """
    try:
        # Check if this is a parent job with subjobs
        subjob_info = await ingest_service.get_parent_job_info(job_id)

        if subjob_info is None:
            # Not a parent job, fetch identical to V1
            current_state = await ingest_service.get_job_state(job_id)
            logger.debug(f"Initial state check for job {job_id}: {current_state}")

            if current_state is None:
                logger.warning(f"Job {job_id} not found or expired. Returning 404.")
                raise HTTPException(status_code=404, detail="Job ID not found or state has expired.")

            if current_state == STATE_FAILED:
                logger.error(f"Job {job_id} failed. Returning 503.")
                raise HTTPException(status_code=503, detail="Job processing failed.")

            if current_state == STATE_RETRIEVED_DESTRUCTIVE:
                logger.warning(f"Job {job_id} was destructively retrieved. Returning 410.")
                raise HTTPException(status_code=410, detail="Job result is gone (destructive read).")

            if current_state in INTERMEDIATE_STATES or current_state in {
                STATE_RETRIEVED_NON_DESTRUCTIVE,
                STATE_RETRIEVED_CACHED,
            }:
                logger.debug(f"Attempting fetch for job {job_id} in state {current_state}.")

                try:
                    job_response = await ingest_service.fetch_job(job_id)
                    logger.debug(f"Fetched result for job {job_id}.")

                    try:
                        current_fetch_mode = await ingest_service.get_fetch_mode()
                        if current_fetch_mode == FetchMode.DESTRUCTIVE:
                            target_state = STATE_RETRIEVED_DESTRUCTIVE
                        elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
                            target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
                        elif current_fetch_mode == FetchMode.CACHE_BEFORE_DELETE:
                            target_state = STATE_RETRIEVED_CACHED
                        else:
                            target_state = "RETRIEVED_UNKNOWN"

                        if target_state != "RETRIEVED_UNKNOWN":
                            await ingest_service.set_job_state(job_id, target_state)
                            logger.debug(f"Updated job {job_id} state to {target_state}.")
                    except Exception as state_err:
                        logger.error(f"Failed to set job state for {job_id} after fetch: {state_err}")

                    try:
                        json_bytes = json.dumps(job_response).encode("utf-8")
                        return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
                    except TypeError as json_err:
                        logger.exception(f"Serialization error for job {job_id}: {json_err}")
                        raise HTTPException(
                            status_code=500, detail="Internal server error: Failed to serialize result."
                        )

                except (TimeoutError, RedisError, ConnectionError) as fetch_err:
                    # Handle timeout/error cases same as V1
                    fetch_err_type = type(fetch_err).__name__

                    if isinstance(fetch_err, TimeoutError):
                        logger.debug(
                            f"Job {job_id} still processing (state: {current_state}), fetch attempt timed out cleanly."
                        )
                    else:
                        logger.warning(
                            f"Backend error ({fetch_err_type}) during fetch attempt for job {job_id} "
                            f"(state: {current_state}): {fetch_err}"
                        )

                    if current_state == STATE_RETRIEVED_NON_DESTRUCTIVE:
                        if isinstance(fetch_err, TimeoutError):
                            raise HTTPException(status_code=410, detail="Job result is gone (TTL expired).")
                        else:
                            raise HTTPException(
                                status_code=503, detail="Backend service unavailable preventing access to job result."
                            )
                    elif current_state == STATE_RETRIEVED_CACHED:
                        raise HTTPException(
                            status_code=410, detail="Job result is gone (previously cached, fetch failed)."
                        )
                    elif current_state in INTERMEDIATE_STATES:
                        if isinstance(fetch_err, TimeoutError):
                            raise HTTPException(
                                status_code=202, detail=f"Job is processing (state: {current_state}). Retry later."
                            )
                        else:
                            raise HTTPException(
                                status_code=503, detail="Backend service unavailable preventing fetch of job result."
                            )
                    else:
                        logger.error(f"Unexpected state '{current_state}' for job {job_id} after fetch failure.")
                        raise HTTPException(
                            status_code=500, detail="Internal server error: Unexpected job state after fetch failure."
                        )
            else:
                logger.error(f"Unknown job state '{current_state}' for job {job_id}.")
                raise HTTPException(
                    status_code=500, detail=f"Internal server error: Unknown job state '{current_state}'."
                )

        else:
            # This is a parent job - orchestrate aggregation using declarative helpers
            subjob_ids = subjob_info.get("subjob_ids", [])
            metadata = subjob_info.get("metadata", {})

            logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")

            # Special case: Non-split PDFs have metadata but no subjobs
            # Fetch the result directly and augment with page count metadata
            if len(subjob_ids) == 0:
                logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
                try:
                    job_response = await ingest_service.fetch_job(job_id)

                    # Augment response with page count metadata
                    if isinstance(job_response, dict):
                        if "metadata" not in job_response:
                            job_response["metadata"] = {}
                        job_response["metadata"]["total_pages"] = metadata.get("total_pages")
                        job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
                        job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")

                    # Update job state after successful fetch
                    await _update_job_state_after_fetch(job_id, ingest_service)

                    return _stream_json_response(job_response)
                except (TimeoutError, RedisError, ConnectionError):
                    logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
                    raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
                except Exception as e:
                    logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
                    raise HTTPException(status_code=500, detail="Internal server error during job fetch.")

            # Build ordered descriptors for subjobs
            stored_descriptors = subjob_info.get("subjob_descriptors") or []
            descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}

            ordered_descriptors: List[Dict[str, Any]] = []
            for idx, subjob_id in enumerate(subjob_ids, 1):
                descriptor = descriptor_lookup.get(subjob_id, {})
                ordered_descriptors.append(
                    {
                        "job_id": subjob_id,
                        "chunk_index": descriptor.get("chunk_index", idx),
                        "start_page": descriptor.get("start_page"),
                        "end_page": descriptor.get("end_page"),
                        "page_count": descriptor.get("page_count"),
                    }
                )

            # Calculate max parallel operations (stay within Redis connection pool)
            max_parallel_ops = max(
                1, min(len(ordered_descriptors), getattr(ingest_service, "_concurrency_level", 10) // 2)
            )

            # Check all subjob states (raises 202 if any still processing)
            subjob_states, failed_subjobs = await _check_all_subjob_states(
                ordered_descriptors, max_parallel_ops, ingest_service
            )

            # Fetch all subjob results (raises 202 if any not ready)
            subjob_results = await _fetch_all_subjob_results(
                ordered_descriptors, subjob_states, failed_subjobs, max_parallel_ops, ingest_service
            )

            # Build aggregated response from all subjob results
            aggregated_result = _build_aggregated_response(
                job_id, subjob_results, failed_subjobs, ordered_descriptors, metadata
            )

            # Update parent job state after successful aggregation
            await _update_job_state_after_fetch(job_id, ingest_service)

            # Return aggregated result as streaming response
            return _stream_json_response(aggregated_result)

    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Unexpected error in fetch_job_v2: {e}")
        raise HTTPException(status_code=500, detail="Internal server error during job fetch.")