nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest might be problematic.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/__init__.py ADDED
@@ -0,0 +1,20 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import warnings
+
+
+ # Suppress CUDA-related warnings when running NV-Ingest on a CPU-only system.
+ #
+ # The warnings originate from Numba, which attempts to initialize CUDA even if no GPU is available.
+ # These warnings include errors about missing CUDA drivers or failing to dlopen `libcuda.so.1`.
+ #
+ # By temporarily ignoring `UserWarning` during the import, we prevent unnecessary clutter in logs
+ # while ensuring that cuDF still functions in CPU mode.
+ #
+ # Note: This does not affect cuDF behavior - it will still fall back to CPU execution if no GPU is detected.
+ with warnings.catch_warnings():
+     warnings.simplefilter("ignore", category=UserWarning)
+     # import cudf
+     # TODO(Devin): No cudf import in this file, but keeping it here for future use
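
For reference, the scoped-suppression pattern above can be exercised on its own. A minimal sketch (the `cudf` import is stubbed with a plain `warnings.warn`, since the package currently keeps that import commented out):

```python
import warnings

# Inside catch_warnings(), the filter change is temporary: UserWarning is
# ignored only for statements executed within the block.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    # A GPU-optional import such as `import cudf` would go here; on CPU-only
    # hosts, Numba's CUDA-initialization UserWarnings are silenced.
    warnings.warn("suppressed", UserWarning)  # stand-in for import-time noise

# Outside the block the original filters are restored, so this one surfaces.
warnings.warn("visible again", UserWarning)
```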
nv_ingest/api/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/api/main.py ADDED
@@ -0,0 +1,43 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import os
+
+ from fastapi import FastAPI
+ from opentelemetry import trace
+ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace import TracerProvider
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+ from .v1.health import router as HealthApiRouter
+ from .v1.ingest import router as IngestApiRouter
+
+ logger = logging.getLogger(__name__)
+
+ # nv-ingest FastAPI app declaration
+ app = FastAPI(
+     title="NV-Ingest Microservice",
+     description="Service for ingesting heterogeneous datatypes",
+     version="25.3.0",
+     contact={
+         "name": "NVIDIA Corporation",
+         "url": "https://nvidia.com",
+     },
+     docs_url="/docs",
+ )
+
+ app.include_router(IngestApiRouter, prefix="/v1")
+ app.include_router(HealthApiRouter, prefix="/v1/health")
+
+ # Set up the tracer provider and add a processor for exporting traces
+ resource = Resource(attributes={"service.name": "nv-ingest"})
+ trace.set_tracer_provider(TracerProvider(resource=resource))
+ tracer = trace.get_tracer(__name__)
+
+ otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "otel-collector:4317")
+ exporter = OTLPSpanExporter(endpoint=otel_endpoint, insecure=True)
+ span_processor = BatchSpanProcessor(exporter)
+ trace.get_tracer_provider().add_span_processor(span_processor)
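
A usage sketch, not part of the wheel: because the OTLP exporter above is constructed at import time, `OTEL_EXPORTER_OTLP_ENDPOINT` must be set before `nv_ingest.api.main` is imported. The host, port, and availability of `uvicorn` below are assumptions for illustration.

```python
# A minimal sketch for serving the app locally; assumes `uvicorn` is installed
# and an OTLP collector is listening on localhost:4317 (both are assumptions).
import os

# Must happen before the import: main.py reads this env var at module load.
os.environ.setdefault("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")

import uvicorn

from nv_ingest.api.main import app  # mounts the /v1 ingest and /v1/health routers

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7670)  # port chosen for illustration
```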
nv_ingest/api/v1/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/api/v1/health.py ADDED
@@ -0,0 +1,114 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import os
+
+ from fastapi import APIRouter
+ from fastapi import status
+ from fastapi.responses import JSONResponse
+
+ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import is_ready
+
+ # logger = logging.getLogger("uvicorn")
+ logger = logging.getLogger(__name__)
+
+ router = APIRouter()
+
+ # Map of all of the NIM HTTP endpoint environment variables that can be ready-checked
+ READY_CHECK_ENV_VAR_MAP = {
+     "paddle": "PADDLE_HTTP_ENDPOINT",
+     "yolox_graphic_elements": "YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT",
+     "yolox_page_elements": "YOLOX_HTTP_ENDPOINT",
+     "yolox_table_structure": "YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT",
+ }
+
+
+ @router.get(
+     "/live",
+     tags=["Health"],
+     summary="Check if the service is running.",
+     description="""
+         Check if the service is running.
+     """,
+     status_code=status.HTTP_200_OK,
+ )
+ async def get_live_state() -> dict:
+     live_content = {"live": True}
+     return JSONResponse(content=live_content, status_code=200)
+
+
+ @router.get(
+     "/ready",
+     tags=["Health"],
+     summary="Check if the service is ready to receive traffic.",
+     description="""
+         Check if the service is ready to receive traffic.
+     """,
+     status_code=status.HTTP_200_OK,
+ )
+ async def get_ready_state() -> dict:
+     # "Ready" to serve means the following:
+     # 1. The nv-ingest FastAPI app is live (trivially true if this handler is running).
+     # 2. The Ray pipeline is up and running.
+     # 3. The NIMs configured for the service are reporting "ready".
+     # Once all of those are "ready", this service reports "ready" as well;
+     # otherwise an HTTP 503 Service Unavailable response is returned.
+
+     ingest_ready = True
+     # Need to explore options for process checking here.
+     # We cannot guarantee this process is local to check.
+     # If it is not local and we cannot find a running version
+     # locally, we could be blocking processing with our
+     # readiness endpoint, which is really bad. It is safe
+     # for now to assume that if nv-ingest is running, so is
+     # the pipeline.
+     pipeline_ready = True
+
+     # Components that the service should check for "ready".
+     # Possible options are:
+     # 1. empty/none -> disables ready checks entirely
+     # 2. all/ALL -> check every configured service via the environment variables above
+     # 3. {comma_delimited_list} -> comma-delimited list of NIM HTTP endpoint env vars to check for ready
+     components_to_check = os.getenv("COMPONENTS_TO_READY_CHECK", "ALL").upper()
+
+     if not components_to_check:
+         # Ready checks disabled. Immediately return "ready" status.
+         return JSONResponse(content={"ready": True}, status_code=200)
+     else:
+         # Determine the list of components to check, either ALL or a specified list
+         endpoint_nim_name_map = {}
+         if components_to_check == "ALL":
+             # Gather all of the known HTTP endpoint env vars
+             for nim_name, nim_env_var in READY_CHECK_ENV_VAR_MAP.items():
+                 endpoint_url = os.getenv(nim_env_var, None)
+                 endpoint_nim_name_map[endpoint_url] = nim_name
+         else:
+             # Otherwise this is a list of env variables naming the HTTP endpoints to check
+             for env_var in components_to_check.split(","):
+                 env_var = env_var.strip()
+                 endpoint_url = os.getenv(env_var, None)
+
+                 # Get the user-friendly name for the NIM from the endpoints map
+                 nim_name = next((k for k, v in READY_CHECK_ENV_VAR_MAP.items() if v == env_var), None)
+                 endpoint_nim_name_map[endpoint_url] = nim_name
+
+         # Check the endpoints for their readiness
+         ready_statuses = {"ingest_ready": ingest_ready, "pipeline_ready": pipeline_ready}
+         ready_to_work = True  # consider nv-ingest ready until an endpoint proves otherwise
+         for endpoint, nim_name in endpoint_nim_name_map.items():
+             endpoint_ready = is_ready(endpoint, "/v1/health/ready")
+             if not endpoint_ready:
+                 logger.debug(f"Not ready for work. NIM endpoint: '{endpoint}' reporting not ready.")
+                 ready_to_work = False
+                 ready_statuses[nim_name + "_ready"] = False
+             else:
+                 ready_statuses[nim_name + "_ready"] = True
+
+         # Build the response for the client
+         if ready_to_work:
+             return JSONResponse(content={"ready": True}, status_code=200)
+         else:
+             logger.debug(f"Ready Statuses: {ready_statuses}")
+             return JSONResponse(content=ready_statuses, status_code=503)
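
To illustrate how `COMPONENTS_TO_READY_CHECK` drives the endpoint above: the variable is read by the service process, and `/v1/health/ready` then probes only the selected NIM endpoints. A client-side sketch; the host, port, and `requests` dependency are assumptions:

```python
# Poll readiness of a running nv-ingest service. The service itself would be
# started with, e.g.:
#   COMPONENTS_TO_READY_CHECK=PADDLE_HTTP_ENDPOINT,YOLOX_HTTP_ENDPOINT
# so that only those two NIMs are probed ("" disables checks, "ALL" checks all).
import requests

resp = requests.get("http://localhost:7670/v1/health/ready", timeout=10)
if resp.status_code == 200:
    print("ready:", resp.json())  # {"ready": true}
else:
    # A 503 carries per-component flags, e.g. {"ingest_ready": true, "paddle_ready": false, ...}
    print("not ready:", resp.json())
```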
nv_ingest/api/v1/ingest.py ADDED
@@ -0,0 +1,454 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ # pylint: skip-file
+
+ from io import BytesIO
+ from typing import Annotated, Dict, List
+ import base64
+ import json
+ import logging
+ import time
+ import uuid
+
+ from fastapi import APIRouter, Request, Response
+ from fastapi import Depends
+ from fastapi import File, UploadFile, Form
+ from fastapi import HTTPException
+ from fastapi.responses import StreamingResponse
+ from fastapi.responses import JSONResponse
+
+ from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
+ from nv_ingest.framework.schemas.framework_processing_job_schema import ProcessingJob, ConversionStatus
+ from nv_ingest.framework.util.service.impl.ingest.redis_ingest_service import RedisIngestService
+ from nv_ingest.framework.util.service.meta.ingest.ingest_service_meta import IngestServiceMeta
+ from nv_ingest_api.util.service_clients.client_base import FetchMode
+ from nv_ingest_client.primitives.jobs.job_spec import JobSpec
+ from nv_ingest_client.primitives.tasks.extract import ExtractTask
+ from opentelemetry import trace
+ from redis import RedisError
+
+ from nv_ingest_api.util.converters.formats import ingest_json_results_to_blob
+
+ from nv_ingest_client.primitives.tasks.table_extraction import TableExtractionTask
+ from nv_ingest_client.primitives.tasks.chart_extraction import ChartExtractionTask
+ from nv_ingest_client.primitives.tasks.infographic_extraction import InfographicExtractionTask
+
+ logger = logging.getLogger("uvicorn")
+ tracer = trace.get_tracer(__name__)
+
+ router = APIRouter()
+
+
+ async def _get_ingest_service() -> IngestServiceMeta:
+     """
+     Gather the appropriate ingestion service to use for the nv-ingest endpoint.
+     """
+     logger.debug("Creating RedisIngestService singleton for dependency injection")
+     return RedisIngestService.get_instance()
+
+
+ INGEST_SERVICE_T = Annotated[IngestServiceMeta, Depends(_get_ingest_service)]
+ STATE_RETRIEVED_DESTRUCTIVE = "RETRIEVED_DESTRUCTIVE"
+ STATE_RETRIEVED_NON_DESTRUCTIVE = "RETRIEVED_NON_DESTRUCTIVE"
+ STATE_RETRIEVED_CACHED = "RETRIEVED_CACHED"
+ STATE_FAILED = "FAILED"
+ STATE_PROCESSING = "PROCESSING"
+ STATE_SUBMITTED = "SUBMITTED"
+ INTERMEDIATE_STATES = {STATE_PROCESSING, STATE_SUBMITTED}
+
+
+ # POST /submit
+ @router.post(
+     "/submit",
+     responses={
+         200: {"description": "Submission was successful"},
+         500: {"description": "Error encountered during submission"},
+     },
+     tags=["Ingestion"],
+     summary="submit document to the core nv ingestion service for processing",
+     operation_id="submit",
+ )
+ async def submit_job_curl_friendly(ingest_service: INGEST_SERVICE_T, file: UploadFile = File(...)):
+     """
+     A multipart/form-data friendly job submission endpoint that makes interacting with
+     the nv-ingest service through tools like curl easier.
+     """
+     try:
+         file_stream = BytesIO(file.file.read())
+         doc_content = base64.b64encode(file_stream.read()).decode("utf-8")
+
+         # Construct the JobSpec from the HTTP-supplied form data
+         job_spec = JobSpec(
+             # TODO: Update this to look at the uploaded content-type; currently that is not working
+             document_type="pdf",
+             payload=doc_content,
+             source_id=file.filename,
+             source_name=file.filename,
+             # TODO: Update this to accept user-defined options
+             extended_options={
+                 "tracing_options": {
+                     "trace": True,
+                     "ts_send": time.time_ns(),
+                     "trace_id": str(trace.get_current_span().get_span_context().trace_id),
+                 }
+             },
+         )
+
+         # This is the "easy submission path": just default to extracting everything
+         extract_task = ExtractTask(document_type="pdf", extract_text=True, extract_images=True, extract_tables=True)
+
+         job_spec.add_task(extract_task)
+
+         submitted_job_id = await ingest_service.submit_job(MessageWrapper(payload=json.dumps(job_spec.to_dict())))
+         return submitted_job_id
+     except Exception as ex:
+         logger.exception(f"Error submitting job: {str(ex)}")
+         raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+
+
+ def trace_id_to_uuid(trace_id: int) -> str:
+     """Convert an OpenTelemetry trace ID to a UUID-like string."""
+     trace_id = str(trace.format_trace_id(trace_id))
+     if len(trace_id) != 32:
+         raise ValueError("Trace ID must be a 32-character hexadecimal string")
+     return f"{trace_id[:8]}-{trace_id[8:12]}-{trace_id[12:16]}-{trace_id[16:20]}-{trace_id[20:]}"
+
+
+ # POST /submit_job
+ @router.post(
+     "/submit_job",
+     responses={
+         200: {"description": "Jobs were successfully submitted"},
+         500: {"description": "Error encountered while submitting jobs."},
+         503: {"description": "Service unavailable."},
+     },
+     tags=["Ingestion"],
+     summary="submit jobs to the core nv ingestion service for processing",
+     operation_id="submit_job",
+ )
+ async def submit_job(request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T):
+     with tracer.start_as_current_span("http-submit-job") as span:
+         try:
+             # Add custom attributes to the span
+             span.set_attribute("http.method", request.method)
+             span.set_attribute("http.url", str(request.url))
+             span.add_event("Submitting file for processing")
+
+             current_trace_id = span.get_span_context().trace_id
+             job_id = trace_id_to_uuid(current_trace_id)
+
+             # Add trace_id to the job_spec payload
+             job_spec_dict = json.loads(job_spec.payload)
+             if "tracing_options" not in job_spec_dict:
+                 job_spec_dict["tracing_options"] = {"trace": True}
+             job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
+             updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))
+
+             # Add another event
+             span.add_event("Finished processing")
+
+             # Submit the job to the pipeline task queue
+             await ingest_service.submit_job(updated_job_spec, job_id)  # Pass job_id used for state
+             await ingest_service.set_job_state(job_id, "SUBMITTED")
+
+             response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+             return job_id
+
+         except Exception as ex:
+             logger.exception(f"Error submitting job: {str(ex)}")
+             raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+
+
+ # GET /fetch_job
+ @router.get(
+     "/fetch_job/{job_id}",
+     responses={
+         200: {"description": "Job result successfully retrieved."},
+         202: {"description": "Job is processing or result not yet available. Retry later."},
+         404: {"description": "Job ID not found or associated state has expired."},
+         410: {"description": "Job result existed but is now gone (expired or retrieved destructively/cached)."},
+         500: {"description": "Internal server error during fetch processing."},
+         503: {"description": "Job processing failed, or backend service temporarily unavailable preventing fetch."},
+     },
+     tags=["Ingestion"],
+     summary="Fetch the result of a previously submitted job by its job_id",
+     operation_id="fetch_job",
+ )
+ async def fetch_job(job_id: str, ingest_service: INGEST_SERVICE_T):
+     """
+     Fetches a job result, checking job state *before* attempting data retrieval.
+
+     Distinguishes non-existent jobs (404) from expired results (410).
+     """
+     try:
+         current_state = await ingest_service.get_job_state(job_id)
+         logger.debug(f"Initial state check for job {job_id}: {current_state}")
+
+         if current_state is None:
+             logger.warning(f"Job {job_id} not found or expired. Returning 404.")
+             raise HTTPException(status_code=404, detail="Job ID not found or state has expired.")
+
+         if current_state == STATE_FAILED:
+             logger.error(f"Job {job_id} failed. Returning 503.")
+             raise HTTPException(status_code=503, detail="Job processing failed.")
+
+         if current_state == STATE_RETRIEVED_DESTRUCTIVE:
+             logger.warning(f"Job {job_id} was destructively retrieved. Returning 410.")
+             raise HTTPException(status_code=410, detail="Job result is gone (destructive read).")
+
+         if current_state in INTERMEDIATE_STATES or current_state in {
+             STATE_RETRIEVED_NON_DESTRUCTIVE,
+             STATE_RETRIEVED_CACHED,
+         }:
+             logger.debug(f"Attempting fetch for job {job_id} in state {current_state}.")
+
+             try:
+                 job_response = await ingest_service.fetch_job(job_id)
+                 logger.debug(f"Fetched result for job {job_id}.")
+
+                 try:
+                     current_fetch_mode = await ingest_service.get_fetch_mode()
+                     if current_fetch_mode == FetchMode.DESTRUCTIVE:
+                         target_state = STATE_RETRIEVED_DESTRUCTIVE
+                     elif current_fetch_mode == FetchMode.NON_DESTRUCTIVE:
+                         target_state = STATE_RETRIEVED_NON_DESTRUCTIVE
+                     elif current_fetch_mode == FetchMode.CACHE_BEFORE_DELETE:
+                         target_state = STATE_RETRIEVED_CACHED
+                     else:
+                         target_state = "RETRIEVED_UNKNOWN"
+
+                     if target_state != "RETRIEVED_UNKNOWN":
+                         await ingest_service.set_job_state(job_id, target_state)
+                         logger.debug(f"Updated job {job_id} state to {target_state}.")
+                 except Exception as state_err:
+                     logger.error(f"Failed to set job state for {job_id} after fetch: {state_err}")
+
+                 try:
+                     json_bytes = json.dumps(job_response).encode("utf-8")
+                     return StreamingResponse(iter([json_bytes]), media_type="application/json", status_code=200)
+                 except TypeError as json_err:
+                     logger.exception(f"Serialization error for job {job_id}: {json_err}")
+                     raise HTTPException(status_code=500, detail="Internal server error: Failed to serialize result.")
+
+             except (TimeoutError, RedisError, ConnectionError) as fetch_err:
+                 fetch_err_type = type(fetch_err).__name__
+
+                 if isinstance(fetch_err, TimeoutError):
+                     logger.info(
+                         f"Job {job_id} still processing (state: {current_state}), fetch attempt timed out cleanly."
+                     )
+                 else:
+                     logger.warning(
+                         f"Backend error ({fetch_err_type}) during fetch attempt for job {job_id} "
+                         f"(state: {current_state}): {fetch_err}"
+                     )
+
+                 if current_state == STATE_RETRIEVED_NON_DESTRUCTIVE:
+                     if isinstance(fetch_err, TimeoutError):
+                         raise HTTPException(status_code=410, detail="Job result is gone (TTL expired).")
+                     else:
+                         raise HTTPException(
+                             status_code=503, detail="Backend service unavailable preventing access to job result."
+                         )
+
+                 elif current_state == STATE_RETRIEVED_CACHED:
+                     raise HTTPException(status_code=410, detail="Job result is gone (previously cached, fetch failed).")
+
+                 elif current_state in INTERMEDIATE_STATES:
+                     if isinstance(fetch_err, TimeoutError):
+                         raise HTTPException(
+                             status_code=202, detail=f"Job is processing (state: {current_state}). Retry later."
+                         )
+                     else:
+                         raise HTTPException(
+                             status_code=503, detail="Backend service unavailable preventing fetch of job result."
+                         )
+
+                 else:
+                     logger.error(f"Unexpected state '{current_state}' for job {job_id} after fetch failure.")
+                     raise HTTPException(
+                         status_code=500, detail="Internal server error: Unexpected job state after fetch failure."
+                     )
+
+             except ValueError as ve:
+                 logger.exception(f"Value error fetching job {job_id}: {ve}")
+                 raise HTTPException(status_code=500, detail="Internal server error processing job data.")
+
+             except Exception as fetch_ex:
+                 logger.exception(f"Unexpected fetch error for job {job_id}: {fetch_ex}")
+                 raise HTTPException(status_code=500, detail="Internal server error during data fetch.")
+
+         else:
+             logger.error(f"Unknown job state '{current_state}' for job {job_id}.")
+             raise HTTPException(status_code=500, detail=f"Internal server error: Unknown job state '{current_state}'.")
+
+     except HTTPException as http_exc:
+         raise http_exc  # Pass through cleanly
+
+     except Exception as initial_err:
+         logger.exception(f"Unexpected server error handling fetch for job {job_id}: {initial_err}")
+         raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
+
+
+ @router.post("/convert")
+ async def convert_pdf(
+     ingest_service: INGEST_SERVICE_T,
+     files: List[UploadFile] = File(...),
+     job_id: str = Form(None),
+     extract_text: bool = Form(True),
+     extract_images: bool = Form(True),
+     extract_tables: bool = Form(True),
+     extract_charts: bool = Form(False),
+     extract_infographics: bool = Form(False),
+ ) -> Dict[str, str]:
+     try:
+
+         if job_id is None:
+             job_id = str(uuid.uuid4())
+             logger.debug(f"JobId is None, Created JobId: {job_id}")
+
+         submitted_jobs: List[ProcessingJob] = []
+         for file in files:
+             file_stream = BytesIO(file.file.read())
+             doc_content = base64.b64encode(file_stream.read()).decode("utf-8")
+
+             try:
+                 content_type = file.content_type.split("/")[1]
+             except Exception:
+                 err_message = f"Unsupported content_type: {file.content_type}"
+                 logger.error(err_message)
+                 raise HTTPException(status_code=500, detail=err_message)
+
+             job_spec = JobSpec(
+                 document_type=content_type,
+                 payload=doc_content,
+                 source_id=file.filename,
+                 source_name=file.filename,
+                 extended_options={
+                     "tracing_options": {
+                         "trace": True,
+                         "ts_send": time.time_ns(),
+                     }
+                 },
+             )
+
+             extract_task = ExtractTask(
+                 document_type=content_type,
+                 extract_text=extract_text,
+                 extract_images=extract_images,
+                 extract_tables=extract_tables,
+                 extract_charts=extract_charts,
+                 extract_infographics=extract_infographics,
+             )
+
+             job_spec.add_task(extract_task)
+
+             # Conditionally add tasks as needed.
+             if extract_tables:
+                 table_data_extract = TableExtractionTask()
+                 job_spec.add_task(table_data_extract)
+
+             if extract_charts:
+                 chart_data_extract = ChartExtractionTask()
+                 job_spec.add_task(chart_data_extract)
+
+             if extract_infographics:
+                 infographic_data_extract = InfographicExtractionTask()
+                 job_spec.add_task(infographic_data_extract)
+
+             submitted_job_id = await ingest_service.submit_job(
+                 MessageWrapper(payload=json.dumps(job_spec.to_dict())), job_id
+             )
+
+             processing_job = ProcessingJob(
+                 submitted_job_id=submitted_job_id,
+                 filename=file.filename,
+                 status=ConversionStatus.IN_PROGRESS,
+             )
+
+             submitted_jobs.append(processing_job)
+
+         await ingest_service.set_processing_cache(job_id, submitted_jobs)
+
+         logger.debug(f"Submitted: {len(submitted_jobs)} documents of type: '{content_type}' for processing")
+
+         return {
+             "task_id": job_id,
+             "status": "processing",
+             "status_url": f"/status/{job_id}",
+         }
+
+     except Exception as e:
+         logger.error(f"Error starting conversion: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/status/{job_id}")
+ async def get_status(ingest_service: INGEST_SERVICE_T, job_id: str):
+     t_start = time.time()
+     try:
+         processing_jobs = await ingest_service.get_processing_cache(job_id)
+     except Exception as e:
+         logger.error(f"Error getting status: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+     updated_cache: List[ProcessingJob] = []
+     num_ready_docs = 0
+
+     for processing_job in processing_jobs:
+         logger.debug(f"submitted_job_id: {processing_job.submitted_job_id} - Status: {processing_job.status}")
+
+         if processing_job.status == ConversionStatus.IN_PROGRESS:
+             # Attempt to fetch the job from the ingest service
+             try:
+                 job_response = await ingest_service.fetch_job(processing_job.submitted_job_id)
+
+                 job_response = json.dumps(job_response)
+
+                 # Convert JSON into pseudo-markdown format
+                 blob_response = ingest_json_results_to_blob(job_response)
+
+                 processing_job.raw_result = job_response
+                 processing_job.content = blob_response
+                 processing_job.status = ConversionStatus.SUCCESS
+                 num_ready_docs += 1
+                 updated_cache.append(processing_job)
+
+             except TimeoutError:
+                 logger.error(f"TimeoutError getting result for job_id: {processing_job.submitted_job_id}")
+                 updated_cache.append(processing_job)
+                 continue
+             except RedisError:
+                 logger.error(f"RedisError getting result for job_id: {processing_job.submitted_job_id}")
+                 updated_cache.append(processing_job)
+                 continue
+         else:
+             logger.debug(f"{processing_job.submitted_job_id} has already finished successfully ....")
+             num_ready_docs += 1
+             updated_cache.append(processing_job)
+
+     await ingest_service.set_processing_cache(job_id, updated_cache)
+
+     logger.debug(f"{num_ready_docs}/{len(updated_cache)} complete")
+     if num_ready_docs == len(updated_cache):
+         results = []
+         raw_results = []
+         for result in updated_cache:
+             results.append(
+                 {
+                     "filename": result.filename,
+                     "status": "success",
+                     "content": result.content,
+                 }
+             )
+             raw_results.append(result.raw_result)
+
+         return JSONResponse(
+             content={"status": "completed", "result": results},
+             status_code=200,
+         )
+     else:
+         # Not yet ready ...
+         logger.debug(f"/status/{job_id} endpoint execution time: {time.time() - t_start}")
+         raise HTTPException(status_code=202, detail="Job is not ready yet. Retry later.")
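
Taken together, `/v1/submit_job` and `/v1/fetch_job/{job_id}` form a submit-then-poll protocol: 200 carries the result, 202 means retry later, 404/410 mean the job or its result is gone, and 503 signals a failed job or an unavailable backend. A client-side sketch of that loop; the host, port, payload shape, and `requests` dependency are all assumptions (in practice the `nv-ingest-client` package wraps this protocol):

```python
import json
import time

import requests

BASE = "http://localhost:7670/v1"  # illustrative host/port

# Submit: the body is a MessageWrapper whose payload is a serialized JobSpec.
# The JobSpec fields shown here are a guess at the shape, not the exact schema.
wrapper = {"payload": json.dumps({"job_payload": {}, "tasks": [], "tracing_options": {"trace": True}})}
job_id = requests.post(f"{BASE}/submit_job", json=wrapper, timeout=30).json()

# Poll: 202 -> still processing, 200 -> done; anything else is terminal.
while True:
    resp = requests.get(f"{BASE}/fetch_job/{job_id}", timeout=60)
    if resp.status_code == 200:
        result = resp.json()
        break
    if resp.status_code == 202:
        time.sleep(2)  # job is still in a SUBMITTED/PROCESSING state
        continue
    # 404: unknown or expired job; 410: result gone (destructive/cached read);
    # 503: job failed or backend unavailable
    raise RuntimeError(f"fetch failed: {resp.status_code} {resp.text}")
```

Note that with the default destructive fetch mode, a second fetch of the same `job_id` would return 410, so a client should persist the result from the first successful 200.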
nv_ingest/framework/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/orchestration/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/orchestration/ray/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/orchestration/ray/edges/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0