nv-ingest 2025.10.9.dev20251009__py3-none-any.whl → 2025.10.11.dev20251011__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +104 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +816 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +192 -10
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.11.dev20251011.dist-info}/METADATA +1 -1
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.11.dev20251011.dist-info}/RECORD +12 -8
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.11.dev20251011.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.11.dev20251011.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.10.9.dev20251009.dist-info → nv_ingest-2025.10.11.dev20251011.dist-info}/top_level.txt +0 -0
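The bulk of the change is the new V2 ingest router, nv_ingest/api/v2/ingest.py, reproduced below. It splits large PDFs into fixed-size page chunks before submission (chunk size from the PDF_SPLIT_PAGE_COUNT environment variable, default 32) and derives one deterministic subjob per chunk from the parent job's UUID. A small illustrative sketch of that chunk and subjob-id arithmetic, mirroring the module below (the 100-page document and the random parent UUID here are hypothetical):

# Illustrative sketch only; mirrors split_pdf_to_chunks() and _prepare_chunk_submission() below.
import uuid

PAGES_PER_CHUNK = 32          # default resolved by get_pdf_split_page_count()
total_pages = 100             # hypothetical document size
parent_uuid = uuid.uuid4()    # in the service this is derived from the request trace id

for chunk_index, start_zero in enumerate(range(0, total_pages, PAGES_PER_CHUNK)):
    end_zero = min(start_zero + PAGES_PER_CHUNK, total_pages)
    start_page, end_page = start_zero + 1, end_zero                  # 1-based, inclusive range
    subjob_id = uuid.uuid5(parent_uuid, f"chunk-{chunk_index + 1}")  # deterministic per parent
    print(f"chunk {chunk_index + 1}: pages {start_page}-{end_page} -> {subjob_id}")

# A 100-page PDF yields four subjobs covering pages 1-32, 33-64, 65-96, and 97-100.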
nv_ingest/api/v2/ingest.py (new file)
@@ -0,0 +1,816 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# pylint: skip-file
+
+import asyncio
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+import base64
+import json
+import logging
+import os
+import time
+import uuid
+
+from fastapi import APIRouter, Request, Response
+from fastapi import HTTPException
+from fastapi.responses import StreamingResponse
+from redis import RedisError
+
+from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
+from nv_ingest_api.util.service_clients.client_base import FetchMode
+
+# For PDF splitting
+import pypdfium2 as pdfium
+
+# Reuse V1 state constants and helper functions
+from ..v1.ingest import (
+    trace_id_to_uuid,
+    INGEST_SERVICE_T,
+    STATE_RETRIEVED_DESTRUCTIVE,
+    STATE_RETRIEVED_NON_DESTRUCTIVE,
+    STATE_RETRIEVED_CACHED,
+    STATE_FAILED,
+    STATE_SUBMITTED,
+    INTERMEDIATE_STATES,
+)
+from .. import traced_endpoint
+from opentelemetry import trace
+
+logger = logging.getLogger("uvicorn")
+
+router = APIRouter()
+
+DEFAULT_PDF_SPLIT_PAGE_COUNT = 32
+
+
+def get_pdf_split_page_count() -> int:
+    """Resolve the configured page chunk size for PDF splitting."""
+
+    raw_value = os.environ.get("PDF_SPLIT_PAGE_COUNT")
+    if raw_value is None:
+        return DEFAULT_PDF_SPLIT_PAGE_COUNT
+
+    try:
+        parsed = int(raw_value)
+    except ValueError:
+        logger.warning(
+            "Invalid PDF_SPLIT_PAGE_COUNT '%s'; falling back to default %s", raw_value, DEFAULT_PDF_SPLIT_PAGE_COUNT
+        )
+        return DEFAULT_PDF_SPLIT_PAGE_COUNT
+
+    if parsed <= 0:
+        logger.warning("PDF_SPLIT_PAGE_COUNT must be >= 1; received %s. Using 1.", parsed)
+        return 1
+
+    return parsed
+
+
+def split_pdf_to_chunks(pdf_content: bytes, pages_per_chunk: int) -> List[Dict[str, Any]]:
+    """
+    Split a PDF into multi-page chunks using pypdfium2.
+
+    Returns a list of dictionaries containing the chunk bytes and page range metadata.
+    Note: this currently buffers each chunk in-memory; consider streaming in future upgrades.
+    """
+
+    chunks: List[Dict[str, Any]] = []
+
+    if pages_per_chunk <= 0:
+        pages_per_chunk = 1
+
+    pdf = pdfium.PdfDocument(pdf_content)
+    total_pages = len(pdf)
+
+    try:
+        for chunk_index, start_zero in enumerate(range(0, total_pages, pages_per_chunk)):
+            end_zero = min(start_zero + pages_per_chunk, total_pages)
+            page_indices = list(range(start_zero, end_zero))
+
+            new_pdf = pdfium.PdfDocument.new()
+            try:
+                new_pdf.import_pages(pdf, page_indices)
+
+                buffer = BytesIO()
+                try:
+                    new_pdf.save(buffer)
+                    chunk_bytes = buffer.getvalue()
+                finally:
+                    buffer.close()
+            finally:
+                new_pdf.close()
+
+            start_page = start_zero + 1
+            end_page = end_zero
+            chunk_info: Dict[str, Any] = {
+                "bytes": chunk_bytes,
+                "chunk_index": chunk_index,
+                "start_page": start_page,
+                "end_page": end_page,
+                "page_count": end_page - start_page + 1,
+            }
+            chunks.append(chunk_info)
+
+    finally:
+        pdf.close()
+
+    return chunks
+
+
+def get_pdf_page_count(pdf_content: bytes) -> int:
+    """Get the number of pages in a PDF using pypdfium2."""
+    try:
+        pdf = pdfium.PdfDocument(pdf_content)
+        page_count = len(pdf)
+        pdf.close()
+        return page_count
+    except Exception as e:
+        logger.warning(f"Failed to get PDF page count: {e}")
+        return 1  # Assume single page on error
+
+
+def _prepare_chunk_submission(
+    job_spec_template: Dict[str, Any],
+    chunk: Dict[str, Any],
+    *,
+    parent_uuid: uuid.UUID,
+    parent_job_id: str,
+    current_trace_id: int,
+    original_source_id: str,
+    original_source_name: str,
+) -> Tuple[str, MessageWrapper]:
+    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+    chunk_number = chunk["chunk_index"] + 1
+    start_page = chunk["start_page"]
+    end_page = chunk["end_page"]
+
+    subjob_spec = {
+        key: value
+        for key, value in job_spec_template.items()
+        if key not in {"job_payload", "job_id", "tracing_options"}
+    }
+
+    subjob_payload_template = job_spec_template.get("job_payload", {})
+    subjob_payload = {
+        key: value
+        for key, value in subjob_payload_template.items()
+        if key not in {"content", "source_id", "source_name"}
+    }
+
+    chunk_bytes = chunk["bytes"]
+    subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+
+    page_suffix = f"page_{start_page}" if start_page == end_page else f"pages_{start_page}-{end_page}"
+    subjob_payload["source_id"] = [f"{original_source_id}#{page_suffix}"]
+    subjob_payload["source_name"] = [f"{original_source_name}#{page_suffix}"]
+
+    subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
+    subjob_id = str(subjob_uuid)
+    subjob_spec["job_payload"] = subjob_payload
+    subjob_spec["job_id"] = subjob_id
+
+    base_tracing_options = job_spec_template.get("tracing_options") or {}
+    tracing_options = dict(base_tracing_options)
+    tracing_options.setdefault("trace", True)
+    tracing_options["trace_id"] = str(current_trace_id)
+    tracing_options["ts_send"] = int(time.time() * 1000)
+    tracing_options["parent_job_id"] = parent_job_id
+    tracing_options["page_num"] = start_page
+
+    subjob_spec["tracing_options"] = tracing_options
+
+    return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
+
+
+# ============================================================================
+# Helper Functions for Fetch Job Aggregation
+# ============================================================================
+
+
+async def _gather_in_batches(coroutines: List, batch_size: int, return_exceptions: bool = False) -> List[Any]:
+    """
+    Execute coroutines in batches to respect concurrency limits.
+
+    Parameters
+    ----------
+    coroutines : List
+        List of coroutines to execute
+    batch_size : int
+        Maximum number of coroutines to execute concurrently
+    return_exceptions : bool
+        Whether to return exceptions as results (passed to asyncio.gather)
+
+    Returns
+    -------
+    List[Any]
+        Results from all coroutines in original order
+    """
+    results: List[Any] = []
+    for offset in range(0, len(coroutines), batch_size):
+        batch = coroutines[offset : offset + batch_size]
+        batch_results = await asyncio.gather(*batch, return_exceptions=return_exceptions)
+        results.extend(batch_results)
+    return results
+
+
+async def _update_job_state_after_fetch(job_id: str, ingest_service: INGEST_SERVICE_T) -> None:
+    """
+    Update job state after successful fetch based on configured fetch mode.
+
+    Parameters
+    ----------
+    job_id : str
+        The job identifier
+    ingest_service : IngestServiceMeta
+        The ingest service instance
+    """
+    try:
+        current_fetch_mode = await ingest_service.get_fetch_mode()
+        if current_fetch_mode == FetchMode.DESTRUCTIVE:
+            target_state = STATE_RETRIEVED_DESTRUCTIVE
+        elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
+            target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
+        else:
+            target_state = STATE_RETRIEVED_CACHED
+
+        await ingest_service.set_job_state(job_id, target_state)
+        logger.debug(f"Updated job {job_id} state to {target_state}")
+    except Exception as e:
+        logger.error(f"Failed to update job state for {job_id}: {e}")
+
+
+def _stream_json_response(data: Dict[str, Any]) -> StreamingResponse:
+    """
+    Create a StreamingResponse for JSON data.
+
+    Parameters
+    ----------
+    data : Dict[str, Any]
+        The data to serialize and stream
+
+    Returns
+    -------
+    StreamingResponse
+        FastAPI streaming response with JSON content
+    """
+    json_bytes = json.dumps(data).encode("utf-8")
+    return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
+
+
+async def _check_all_subjob_states(
+    ordered_descriptors: List[Dict[str, Any]], max_parallel_ops: int, ingest_service: INGEST_SERVICE_T
+) -> Tuple[List[Optional[str]], List[Dict[str, object]]]:
+    """
+    Check the state of all subjobs in parallel batches.
+
+    Parameters
+    ----------
+    ordered_descriptors : List[Dict[str, Any]]
+        List of subjob descriptors with job_id and chunk_index
+    max_parallel_ops : int
+        Maximum number of parallel operations
+    ingest_service : IngestServiceMeta
+        The ingest service instance
+
+    Returns
+    -------
+    Tuple[List[Optional[str]], List[Dict[str, object]]]
+        Tuple of (subjob_states, failed_subjobs_list)
+
+    Raises
+    ------
+    HTTPException
+        If any subjob is still processing (202)
+    """
+    # Gather all subjob states in parallel batches
+    state_coroutines = [ingest_service.get_job_state(descriptor.get("job_id")) for descriptor in ordered_descriptors]
+    subjob_states = await _gather_in_batches(state_coroutines, max_parallel_ops)
+
+    # Check for failures and pending work
+    failed_subjobs: List[Dict[str, object]] = []
+
+    for page_index, (descriptor, subjob_state) in enumerate(zip(ordered_descriptors, subjob_states), start=1):
+        subjob_id = descriptor.get("job_id")
+
+        if subjob_state == STATE_FAILED:
+            logger.warning(f"Subjob {subjob_id} failed")
+            failed_subjobs.append({"subjob_id": subjob_id, "chunk_index": page_index})
+        elif subjob_state in INTERMEDIATE_STATES:
+            raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")
+
+    return subjob_states, failed_subjobs
+
+
+async def _fetch_all_subjob_results(
+    ordered_descriptors: List[Dict[str, Any]],
+    subjob_states: List[Optional[str]],
+    failed_subjobs: List[Dict[str, object]],
+    max_parallel_ops: int,
+    ingest_service: INGEST_SERVICE_T,
+) -> List[Optional[Dict[str, Any]]]:
+    """
+    Fetch results for all completed subjobs in parallel batches.
+
+    Parameters
+    ----------
+    ordered_descriptors : List[Dict[str, Any]]
+        List of subjob descriptors
+    subjob_states : List[Optional[str]]
+        States of all subjobs (from _check_all_subjob_states)
+    failed_subjobs : List[Dict[str, object]]
+        List to append failed fetch attempts to (modified in place)
+    max_parallel_ops : int
+        Maximum number of parallel operations
+    ingest_service : IngestServiceMeta
+        The ingest service instance
+
+    Returns
+    -------
+    List[Optional[Dict[str, Any]]]
+        Results for each subjob (None for failed ones)
+
+    Raises
+    ------
+    HTTPException
+        If any subjob is not ready yet (202)
+    """
+    # Initialize results array with None placeholders
+    subjob_results: List[Optional[Dict[str, Any]]] = [None] * len(ordered_descriptors)
+
+    # Build list of fetch tasks (only for non-failed subjobs)
+    fetch_coroutines = []
+    fetch_targets: List[Dict[str, Any]] = []
+
+    for list_index, (page_index, descriptor, subjob_state) in enumerate(
+        zip(range(1, len(ordered_descriptors) + 1), ordered_descriptors, subjob_states)
+    ):
+        subjob_id = descriptor.get("job_id")
+
+        # Skip failed subjobs (already recorded in failed_subjobs)
+        if subjob_state == STATE_FAILED:
+            continue
+
+        # Skip intermediate states (should have been caught earlier, but defensive)
+        if subjob_state in INTERMEDIATE_STATES:
+            continue
+
+        # Queue this subjob for fetching
+        fetch_coroutines.append(ingest_service.fetch_job(subjob_id))
+        fetch_targets.append(
+            {
+                "list_index": list_index,
+                "page_index": page_index,
+                "subjob_id": subjob_id,
+            }
+        )
+
+    # Fetch all results in parallel batches
+    if fetch_coroutines:
+        fetch_results = await _gather_in_batches(fetch_coroutines, max_parallel_ops, return_exceptions=True)
+
+        # Process results and handle errors
+        for target, fetch_result in zip(fetch_targets, fetch_results):
+            subjob_id = target["subjob_id"]
+            page_index = target["page_index"]
+            list_index = target["list_index"]
+
+            if isinstance(fetch_result, TimeoutError):
+                logger.debug(f"Subjob {subjob_id} not ready yet; deferring aggregation")
+                raise HTTPException(status_code=202, detail="Parent job still processing. Some pages not complete.")
+
+            if isinstance(fetch_result, Exception):
+                logger.error(f"Failed to fetch subjob {subjob_id}: {fetch_result}")
+                failed_subjobs.append(
+                    {
+                        "subjob_id": subjob_id,
+                        "chunk_index": page_index,
+                        "error": str(fetch_result),
+                    }
+                )
+                continue
+
+            subjob_results[list_index] = fetch_result
+
+    return subjob_results
+
+
+def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
+    """Return the trace and annotation dictionaries emitted by the sink stage."""
+
+    if not isinstance(result, dict):
+        return None, None
+
+    trace = result.get("trace")
+    annotations = result.get("annotations")
+
+    trace_dict = trace if isinstance(trace, dict) else None
+    annotations_dict = annotations if isinstance(annotations, dict) else None
+
+    return trace_dict, annotations_dict
+
+
+def _build_aggregated_response(
+    parent_job_id: str,
+    subjob_results: List[Optional[Dict[str, Any]]],
+    failed_subjobs: List[Dict[str, object]],
+    ordered_descriptors: List[Dict[str, Any]],
+    metadata: Dict[str, Any],
+) -> Dict[str, Any]:
+    """
+    Build the aggregated response from subjob results.
+
+    Parameters
+    ----------
+    parent_job_id : str
+        The parent job identifier
+    subjob_results : List[Optional[Dict[str, Any]]]
+        Results from all subjobs (None for failed ones)
+    failed_subjobs : List[Dict[str, object]]
+        List of failed subjob information
+    ordered_descriptors : List[Dict[str, Any]]
+        Subjob descriptors in original order
+    metadata : Dict[str, Any]
+        Parent job metadata
+
+    Returns
+    -------
+    Dict[str, Any]
+        Aggregated response with combined data and metadata
+    """
+    any_failed = len(failed_subjobs) > 0
+    subjob_ids = [desc.get("job_id") for desc in ordered_descriptors]
+
+    aggregated_result = {
+        "data": [],
+        "status": "failed" if any_failed else "success",
+        "description": (
+            "One or more subjobs failed to complete" if any_failed else "Aggregated result composed from subjob outputs"
+        ),
+        "metadata": {
+            "parent_job_id": parent_job_id,
+            "total_pages": metadata.get("total_pages", len(subjob_ids)),
+            "pages_per_chunk": metadata.get("pages_per_chunk"),
+            "original_source_id": metadata.get("original_source_id"),
+            "original_source_name": metadata.get("original_source_name"),
+            "subjobs_failed": sum(1 for r in subjob_results if r is None),
+            "failed_subjobs": failed_subjobs,
+            "subjob_ids": subjob_ids,
+            "chunks": [],
+            "trace_segments": [],
+            "annotation_segments": [],
+        },
+    }
+
+    # Aggregate subjob data in page order
+    for page_num, (result, descriptor) in enumerate(zip(subjob_results, ordered_descriptors), 1):
+        if result is not None:
+            # Add page data to aggregated result
+            if "data" in result:
+                aggregated_result["data"].extend(result["data"])
+            chunk_entry = dict(descriptor)
+            aggregated_result["metadata"]["chunks"].append(chunk_entry)
+
+            trace_data, annotation_data = _extract_ray_telemetry(result)
+            start_page = descriptor.get("start_page")
+            end_page = descriptor.get("end_page")
+
+            if trace_data:
+                aggregated_result["metadata"]["trace_segments"].append(
+                    {
+                        "job_id": descriptor.get("job_id"),
+                        "chunk_index": descriptor.get("chunk_index"),
+                        "start_page": start_page,
+                        "end_page": end_page,
+                        "trace": trace_data,
+                    }
+                )
+
+            if annotation_data:
+                aggregated_result["metadata"]["annotation_segments"].append(
+                    {
+                        "job_id": descriptor.get("job_id"),
+                        "chunk_index": descriptor.get("chunk_index"),
+                        "start_page": start_page,
+                        "end_page": end_page,
+                        "annotations": annotation_data,
+                    }
+                )
+        else:
+            # Note failed page
+            logger.warning(f"Page {page_num} failed or missing")
+
+    return aggregated_result
+
+
+# POST /v2/submit_job
+@router.post(
+    "/submit_job",
+    responses={
+        200: {"description": "Jobs were successfully submitted"},
+        500: {"description": "Error encountered while submitting jobs."},
+        503: {"description": "Service unavailable."},
+    },
+    tags=["Ingestion"],
+    summary="submit jobs to the core nv ingestion service for processing with PDF splitting",
+    operation_id="submit_job_v2",
+)
+@traced_endpoint("http-submit-job-v2")
+async def submit_job_v2(
+    request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
+):
+    span = trace.get_current_span()
+    try:
+        span.add_event("Submitting file for processing (V2)")
+
+        current_trace_id = span.get_span_context().trace_id
+        parent_job_id = trace_id_to_uuid(current_trace_id)
+
+        # Parse job spec
+        job_spec_dict = json.loads(job_spec.payload)
+
+        # Extract document type and payload from the proper structure
+        job_payload = job_spec_dict.get("job_payload", {})
+        document_types = job_payload.get("document_type", [])
+        payloads = job_payload.get("content", [])
+
+        # Resolve original source metadata up front for logging / subjob naming
+        source_ids = job_payload.get("source_id", ["unknown_source.pdf"])
+        source_names = job_payload.get("source_name", ["unknown_source.pdf"])
+        original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
+        original_source_name = source_names[0] if source_names else "unknown_source.pdf"
+
+        # Check if this is a PDF that needs splitting
+        if document_types and payloads and document_types[0].lower() == "pdf":
+            # Decode the payload to check page count
+            pdf_content = base64.b64decode(payloads[0])
+            page_count = get_pdf_page_count(pdf_content)
+            pages_per_chunk = get_pdf_split_page_count()
+
+            # Split if the document has more pages than our chunk size
+            if page_count > pages_per_chunk:
+                logger.warning(
+                    "[dev-reload-check] Splitting PDF %s into %s-page chunks (total pages: %s)",
+                    original_source_name,
+                    pages_per_chunk,
+                    page_count,
+                )
+
+                chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
+
+                subjob_ids: List[str] = []
+                subjob_descriptors: List[Dict[str, Any]] = []
+                submission_tasks = []
+
+                try:
+                    parent_uuid = uuid.UUID(parent_job_id)
+                except ValueError:
+                    logger.warning(
+                        "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                        parent_job_id,
+                    )
+                    parent_uuid = uuid.uuid4()
+
+                for chunk in chunks:
+                    subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                        job_spec_dict,
+                        chunk,
+                        parent_uuid=parent_uuid,
+                        parent_job_id=parent_job_id,
+                        current_trace_id=current_trace_id,
+                        original_source_id=original_source_id,
+                        original_source_name=original_source_name,
+                    )
+                    submission_tasks.append(ingest_service.submit_job(subjob_wrapper, subjob_id))
+                    subjob_ids.append(subjob_id)
+                    subjob_descriptors.append(
+                        {
+                            "job_id": subjob_id,
+                            "chunk_index": len(subjob_descriptors) + 1,
+                            "start_page": chunk.get("start_page"),
+                            "end_page": chunk.get("end_page"),
+                            "page_count": chunk.get("page_count"),
+                        }
+                    )
+
+                if submission_tasks:
+                    await asyncio.gather(*submission_tasks)
+
+                parent_metadata: Dict[str, Any] = {
+                    "total_pages": page_count,
+                    "original_source_id": original_source_id,
+                    "original_source_name": original_source_name,
+                    "document_type": document_types[0] if document_types else "pdf",
+                    "subjob_order": subjob_ids,
+                }
+
+                await ingest_service.set_parent_job_mapping(
+                    parent_job_id,
+                    subjob_ids,
+                    parent_metadata,
+                    subjob_descriptors=subjob_descriptors,
+                )
+
+                await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+
+                span.add_event(f"Split into {len(subjob_ids)} subjobs")
+                response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+                return parent_job_id
+
+        # For non-PDFs or cases where splitting is not required, submit as normal
+        if "tracing_options" not in job_spec_dict:
+            job_spec_dict["tracing_options"] = {"trace": True}
+        job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
+        updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))
+
+        span.add_event("Submitting as single job (no split needed)")
+
+        # Submit the job to the pipeline task queue
+        await ingest_service.submit_job(updated_job_spec, parent_job_id)
+        await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+
+        response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+        return parent_job_id
+
+    except Exception as ex:
+        logger.exception(f"Error submitting job: {str(ex)}")
+        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+
+
+# GET /v2/fetch_job
+@router.get(
+    "/fetch_job/{job_id}",
+    responses={
+        200: {"description": "Job result successfully retrieved."},
+        202: {"description": "Job is processing or result not yet available. Retry later."},
+        404: {"description": "Job ID not found or associated state has expired."},
+        410: {"description": "Job result existed but is now gone (expired or retrieved destructively/cached)."},
+        500: {"description": "Internal server error during fetch processing."},
+        503: {"description": "Job processing failed, or backend service temporarily unavailable preventing fetch."},
+    },
+    tags=["Ingestion"],
+    summary="Fetch the result of a previously submitted job by its job_id (V2 with aggregation)",
+    operation_id="fetch_job_v2",
+)
+async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):
+    """
+    V2 fetch that handles parent job aggregation.
+    """
+    try:
+        # Check if this is a parent job with subjobs
+        subjob_info = await ingest_service.get_parent_job_info(job_id)
+
+        if subjob_info is None:
+            # Not a parent job, fetch identical to V1
+            current_state = await ingest_service.get_job_state(job_id)
+            logger.debug(f"Initial state check for job {job_id}: {current_state}")
+
+            if current_state is None:
+                logger.warning(f"Job {job_id} not found or expired. Returning 404.")
+                raise HTTPException(status_code=404, detail="Job ID not found or state has expired.")
+
+            if current_state == STATE_FAILED:
+                logger.error(f"Job {job_id} failed. Returning 503.")
+                raise HTTPException(status_code=503, detail="Job processing failed.")
+
+            if current_state == STATE_RETRIEVED_DESTRUCTIVE:
+                logger.warning(f"Job {job_id} was destructively retrieved. Returning 410.")
+                raise HTTPException(status_code=410, detail="Job result is gone (destructive read).")
+
+            if current_state in INTERMEDIATE_STATES or current_state in {
+                STATE_RETRIEVED_NON_DESTRUCTIVE,
+                STATE_RETRIEVED_CACHED,
+            }:
+                logger.debug(f"Attempting fetch for job {job_id} in state {current_state}.")
+
+                try:
+                    job_response = await ingest_service.fetch_job(job_id)
+                    logger.debug(f"Fetched result for job {job_id}.")
+
+                    try:
+                        current_fetch_mode = await ingest_service.get_fetch_mode()
+                        if current_fetch_mode == FetchMode.DESTRUCTIVE:
+                            target_state = STATE_RETRIEVED_DESTRUCTIVE
+                        elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
+                            target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
+                        elif current_fetch_mode == FetchMode.CACHE_BEFORE_DELETE:
+                            target_state = STATE_RETRIEVED_CACHED
+                        else:
+                            target_state = "RETRIEVED_UNKNOWN"
+
+                        if target_state != "RETRIEVED_UNKNOWN":
+                            await ingest_service.set_job_state(job_id, target_state)
+                            logger.debug(f"Updated job {job_id} state to {target_state}.")
+                    except Exception as state_err:
+                        logger.error(f"Failed to set job state for {job_id} after fetch: {state_err}")
+
+                    try:
+                        json_bytes = json.dumps(job_response).encode("utf-8")
+                        return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
+                    except TypeError as json_err:
+                        logger.exception(f"Serialization error for job {job_id}: {json_err}")
+                        raise HTTPException(
+                            status_code=500, detail="Internal server error: Failed to serialize result."
+                        )
+
+                except (TimeoutError, RedisError, ConnectionError) as fetch_err:
+                    # Handle timeout/error cases same as V1
+                    fetch_err_type = type(fetch_err).__name__
+
+                    if isinstance(fetch_err, TimeoutError):
+                        logger.debug(
+                            f"Job {job_id} still processing (state: {current_state}), fetch attempt timed out cleanly."
+                        )
+                    else:
+                        logger.warning(
+                            f"Backend error ({fetch_err_type}) during fetch attempt for job {job_id} "
+                            f"(state: {current_state}): {fetch_err}"
+                        )
+
+                    if current_state == STATE_RETRIEVED_NON_DESTRUCTIVE:
+                        if isinstance(fetch_err, TimeoutError):
+                            raise HTTPException(status_code=410, detail="Job result is gone (TTL expired).")
+                        else:
+                            raise HTTPException(
+                                status_code=503, detail="Backend service unavailable preventing access to job result."
+                            )
+                    elif current_state == STATE_RETRIEVED_CACHED:
+                        raise HTTPException(
+                            status_code=410, detail="Job result is gone (previously cached, fetch failed)."
+                        )
+                    elif current_state in INTERMEDIATE_STATES:
+                        if isinstance(fetch_err, TimeoutError):
+                            raise HTTPException(
+                                status_code=202, detail=f"Job is processing (state: {current_state}). Retry later."
+                            )
+                        else:
+                            raise HTTPException(
+                                status_code=503, detail="Backend service unavailable preventing fetch of job result."
+                            )
+                    else:
+                        logger.error(f"Unexpected state '{current_state}' for job {job_id} after fetch failure.")
+                        raise HTTPException(
+                            status_code=500, detail="Internal server error: Unexpected job state after fetch failure."
+                        )
+            else:
+                logger.error(f"Unknown job state '{current_state}' for job {job_id}.")
+                raise HTTPException(
+                    status_code=500, detail=f"Internal server error: Unknown job state '{current_state}'."
+                )
+
+        else:
+            # This is a parent job - orchestrate aggregation using declarative helpers
+            subjob_ids = subjob_info.get("subjob_ids", [])
+            metadata = subjob_info.get("metadata", {})
+
+            logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")
+
+            # Build ordered descriptors for subjobs
+            stored_descriptors = subjob_info.get("subjob_descriptors") or []
+            descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
+
+            ordered_descriptors: List[Dict[str, Any]] = []
+            for idx, subjob_id in enumerate(subjob_ids, 1):
+                descriptor = descriptor_lookup.get(subjob_id, {})
+                ordered_descriptors.append(
+                    {
+                        "job_id": subjob_id,
+                        "chunk_index": descriptor.get("chunk_index", idx),
+                        "start_page": descriptor.get("start_page"),
+                        "end_page": descriptor.get("end_page"),
+                        "page_count": descriptor.get("page_count"),
+                    }
+                )
+
+            # Calculate max parallel operations (stay within Redis connection pool)
+            max_parallel_ops = max(
+                1, min(len(ordered_descriptors), getattr(ingest_service, "_concurrency_level", 10) // 2)
+            )
+
+            # Check all subjob states (raises 202 if any still processing)
+            subjob_states, failed_subjobs = await _check_all_subjob_states(
+                ordered_descriptors, max_parallel_ops, ingest_service
+            )
+
+            # Fetch all subjob results (raises 202 if any not ready)
+            subjob_results = await _fetch_all_subjob_results(
+                ordered_descriptors, subjob_states, failed_subjobs, max_parallel_ops, ingest_service
+            )
+
+            # Build aggregated response from all subjob results
+            aggregated_result = _build_aggregated_response(
+                job_id, subjob_results, failed_subjobs, ordered_descriptors, metadata
+            )
+
+            # Update parent job state after successful aggregation
+            await _update_job_state_after_fetch(job_id, ingest_service)
+
+            # Return aggregated result as streaming response
+            return _stream_json_response(aggregated_result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception(f"Unexpected error in fetch_job_v2: {e}")
+        raise HTTPException(status_code=500, detail="Internal server error during job fetch.")