nv-ingest 25.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (102)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +45 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/api/v1/metrics.py +29 -0
  8. nv_ingest/framework/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  12. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  13. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  14. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  15. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  18. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  19. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  20. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  22. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +591 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1322 -0
  24. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  25. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  34. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  35. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  36. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  41. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  42. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  44. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  45. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  47. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  48. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  49. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  52. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  53. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  56. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  60. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  61. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  62. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  64. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +200 -0
  68. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
  69. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +624 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  71. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  72. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  73. nv_ingest/framework/schemas/__init__.py +0 -0
  74. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  75. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  76. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  77. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  78. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  79. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  80. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  81. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  82. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  83. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  84. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  85. nv_ingest/framework/util/__init__.py +3 -0
  86. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  87. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  88. nv_ingest/framework/util/service/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  90. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  91. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  92. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  93. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  94. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  95. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  96. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  97. nv_ingest/version.py +38 -0
  98. nv_ingest-25.6.0.dist-info/METADATA +266 -0
  99. nv_ingest-25.6.0.dist-info/RECORD +102 -0
  100. nv_ingest-25.6.0.dist-info/WHEEL +5 -0
  101. nv_ingest-25.6.0.dist-info/licenses/LICENSE +201 -0
  102. nv_ingest-25.6.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,200 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import json
6
+ import logging
7
+ import math
8
+ import os
9
+ from typing import Dict, Any
10
+
11
+ import ray
12
+ from pydantic import BaseModel
13
+
14
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline
15
+ from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
16
+ add_source_stage,
17
+ add_metadata_injector_stage,
18
+ add_pdf_extractor_stage,
19
+ add_image_extractor_stage,
20
+ add_docx_extractor_stage,
21
+ add_audio_extractor_stage,
22
+ add_html_extractor_stage,
23
+ add_image_dedup_stage,
24
+ add_image_filter_stage,
25
+ add_table_extractor_stage,
26
+ add_chart_extractor_stage,
27
+ add_image_caption_stage,
28
+ add_text_splitter_stage,
29
+ add_text_embedding_stage,
30
+ add_embedding_storage_stage,
31
+ add_image_storage_stage,
32
+ add_message_broker_response_stage,
33
+ add_pptx_extractor_stage,
34
+ add_infographic_extractor_stage,
35
+ add_otel_tracer_stage,
36
+ add_default_drain_stage,
37
+ )
38
+ from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
39
+
# Module logger; records from this module are emitted under the "uvicorn" logger name.
logger = logging.getLogger(name="uvicorn")
41
+
42
+
def export_config_to_env(ingest_config: Any) -> None:
    """
    Publish every entry of an ingest configuration as a process environment variable.

    Parameters
    ----------
    ingest_config : Any
        Either a pydantic ``BaseModel`` (converted with ``model_dump()``) or a mapping of
        setting names to values. Falsy input (``None`` or an empty mapping) is a no-op.

    Notes
    -----
    Keys are upper-cased before export. ``os.environ`` only accepts strings, so every
    value is converted with ``str()``. Entries whose value is ``None`` are skipped so an
    unset option is not exported as the literal text "None".
    """
    if isinstance(ingest_config, BaseModel):
        ingest_config = ingest_config.model_dump()

    # Nothing to export; also avoids calling .items() on None.
    if not ingest_config:
        return

    os.environ.update(
        {str(key).upper(): str(val) for key, val in ingest_config.items() if val is not None}
    )
48
+
49
+
def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any] = None):
    """
    Populate ``pipeline`` with the ingestion stages, wire them into one linear chain, and build it.

    Parameters
    ----------
    pipeline : RayPipeline
        Pipeline that receives the stages and the edges between them.
    ingest_config : Dict[str, Any], optional
        When truthy, exported to the process environment with ``export_config_to_env``
        before Ray is initialized.

    Returns
    -------
    The object returned by ``ray.init``.

    Notes
    -----
    Ray is initialized in the ``nv_ingest_ray`` namespace with the dashboard bound to
    ``0.0.0.0:8265`` and filesystem object spilling under ``/tmp/ray_spill_testing_*``.
    The per-stage CPU budget comes from ``NV_INGEST_MAX_UTIL`` when set, otherwise from the
    probed effective core count (floored, minimum 1). Edge queue sizes come from
    ``INGEST_EDGE_BUFFER_SIZE`` (default 32).
    """
    # Make the configuration visible through the process environment.
    if ingest_config:
        export_config_to_env(ingest_config)

    spill_config = {
        "type": "filesystem",
        "params": {
            "directory_path": [
                "/tmp/ray_spill_testing_0",
                "/tmp/ray_spill_testing_1",
                "/tmp/ray_spill_testing_2",
                "/tmp/ray_spill_testing_3",
            ],
            "buffer_size": 100_000_000,
        },
    }
    ray_context = ray.init(
        namespace="nv_ingest_ray",
        logging_level=logging.getLogger().getEffectiveLevel(),
        ignore_reinit_error=True,
        dashboard_host="0.0.0.0",
        dashboard_port=8265,
        _system_config={
            "local_fs_capacity_threshold": 0.9,
            "object_spilling_config": json.dumps(spill_config),
        },
    )

    cpu_probe = SystemResourceProbe()
    detected_cores = cpu_probe.get_effective_cores()
    fallback_cpu_count = int(max(1, math.floor(detected_cores)))
    default_cpu_count = int(os.environ.get("NV_INGEST_MAX_UTIL", fallback_cpu_count))

    add_meter_stage = os.environ.get("MESSAGE_CLIENT_TYPE") != "simple"
    _ = add_meter_stage  # TODO(Devin)

    # ---- Insertion and pre-processing stages ----
    logger.debug("Setting up ingestion pipeline")
    source_id = add_source_stage(pipeline, default_cpu_count)
    # TODO(Devin): Job counter used a global stats object that isn't ray compatible, need to update.
    injector_id = add_metadata_injector_stage(pipeline, default_cpu_count)

    # ---- Primitive extraction ----
    pdf_id = add_pdf_extractor_stage(pipeline, default_cpu_count)
    image_id = add_image_extractor_stage(pipeline, default_cpu_count)
    docx_id = add_docx_extractor_stage(pipeline, default_cpu_count)
    pptx_id = add_pptx_extractor_stage(pipeline, default_cpu_count)
    audio_id = add_audio_extractor_stage(pipeline, default_cpu_count)
    html_id = add_html_extractor_stage(pipeline, default_cpu_count)

    # ---- Post-processing ----
    dedup_id = add_image_dedup_stage(pipeline, default_cpu_count)
    filter_id = add_image_filter_stage(pipeline, default_cpu_count)
    table_id = add_table_extractor_stage(pipeline, default_cpu_count)
    chart_id = add_chart_extractor_stage(pipeline, default_cpu_count)
    infographic_id = add_infographic_extractor_stage(pipeline, default_cpu_count)
    caption_id = add_image_caption_stage(pipeline, default_cpu_count)

    # ---- Transforms and data synthesis ----
    splitter_id = add_text_splitter_stage(pipeline, default_cpu_count)
    embedding_id = add_text_embedding_stage(pipeline, default_cpu_count)

    # ---- Storage and output ----
    embedding_store_id = add_embedding_storage_stage(pipeline, default_cpu_count)
    image_store_id = add_image_storage_stage(pipeline, default_cpu_count)
    # TODO: the VDB task sink stage is not wired into the Ray pipeline yet.
    broker_response_id = add_message_broker_response_stage(pipeline, default_cpu_count)

    # ---- Telemetry ----
    # Note: everything after the sync stage is out of the hot path, please keep it that way.
    tracer_id = add_otel_tracer_stage(pipeline, default_cpu_count)
    # TODO(devin): the otel meter and completed-job counter stages are not added yet.

    # Drain stage -- flushes and deletes control messages.
    drain_id = add_default_drain_stage(pipeline, default_cpu_count)

    ingest_edge_buffer_size = int(os.environ.get("INGEST_EDGE_BUFFER_SIZE", 32))

    # The topology is a single chain; each stage feeds the next entry in this list.
    stage_chain = [
        # Intake stages
        source_id,
        injector_id,
        # Document extractors
        pdf_id,
        audio_id,
        docx_id,
        pptx_id,
        image_id,
        html_id,
        # Primitive extractors
        infographic_id,
        table_id,
        chart_id,
        # Primitive mutators
        filter_id,
        dedup_id,
        # Primitive transforms
        splitter_id,
        embedding_id,
        caption_id,
        # Primitive storage
        image_store_id,
        embedding_store_id,
        # Response and telemetry
        broker_response_id,
        tracer_id,
        drain_id,
    ]
    for upstream_id, downstream_id in zip(stage_chain, stage_chain[1:]):
        pipeline.make_edge(upstream_id, downstream_id, queue_size=ingest_edge_buffer_size)

    pipeline.build()

    # TODO(devin): once the meter stage exists, route the sink through it before the tracer.

    return ray_context
@@ -0,0 +1,376 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import atexit
6
+ import logging
7
+ import multiprocessing
8
+ import os
9
+ import signal
10
+ import sys
11
+ import time
12
+ from ctypes import CDLL, c_int
13
+ from datetime import datetime
14
+ from typing import Union, Tuple, Optional, TextIO
15
+
16
+ import ray
17
+ from pydantic import BaseModel, ConfigDict
18
+
19
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
20
+ RayPipeline,
21
+ ScalingConfig,
22
+ RayPipelineSubprocessInterface,
23
+ RayPipelineInterface,
24
+ )
25
+ from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline
26
+
# Module-level logger named after this module.
logger = logging.getLogger(name=__name__)
28
+
29
+
def str_to_bool(value: str) -> bool:
    """
    Interpret a textual flag as a boolean.

    Returns True when ``value``, after trimming surrounding whitespace and lower-casing,
    is one of "1", "true", "yes" or "on"; returns False for anything else.
    """
    normalized = value.strip().lower()
    return normalized in ("1", "true", "yes", "on")
32
+
33
+
# Environment-driven defaults, read once when this module is imported.
# INGEST_DISABLE_DYNAMIC_SCALING is parsed with str_to_bool; unset means "false".
DISABLE_DYNAMIC_SCALING = str_to_bool(os.getenv("INGEST_DISABLE_DYNAMIC_SCALING", "false"))
# INGEST_DYNAMIC_MEMORY_THRESHOLD is parsed as a float; unset means 0.75.
DYNAMIC_MEMORY_THRESHOLD = float(os.getenv("INGEST_DYNAMIC_MEMORY_THRESHOLD", 0.75))
36
+
37
+
class PipelineCreationSchema(BaseModel):
    """
    Configuration schema used when creating a pipeline.

    Each field's default is taken from the identically named (upper-cased) environment
    variable, falling back to the literal shown here. The defaults are evaluated once,
    when the class body executes, so later changes to the environment do not affect them.
    Unknown fields are rejected (``extra="forbid"``).
    """

    arrow_default_memory_pool: str = os.environ.get("ARROW_DEFAULT_MEMORY_POOL", "system")

    # --- Audio processing ---
    audio_grpc_endpoint: str = os.environ.get("AUDIO_GRPC_ENDPOINT", "grpc.nvcf.nvidia.com:443")
    audio_function_id: str = os.environ.get("AUDIO_FUNCTION_ID", "1598d209-5e27-4d3c-8079-4751568b1081")
    audio_infer_protocol: str = os.environ.get("AUDIO_INFER_PROTOCOL", "grpc")

    # --- Embedding model ---
    embedding_nim_endpoint: str = os.environ.get("EMBEDDING_NIM_ENDPOINT", "https://integrate.api.nvidia.com/v1")
    embedding_nim_model_name: str = os.environ.get("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")

    # --- General pipeline settings ---
    ingest_log_level: str = os.environ.get("INGEST_LOG_LEVEL", "INFO")
    max_ingest_process_workers: str = os.environ.get("MAX_INGEST_PROCESS_WORKERS", "16")

    # --- Messaging ---
    message_client_host: str = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
    message_client_port: str = os.environ.get("MESSAGE_CLIENT_PORT", "7671")
    message_client_type: str = os.environ.get("MESSAGE_CLIENT_TYPE", "simple")

    # --- NeMo Retriever parse ---
    nemoretriever_parse_http_endpoint: str = os.environ.get(
        "NEMORETRIEVER_PARSE_HTTP_ENDPOINT",
        "https://integrate.api.nvidia.com/v1/chat/completions",
    )
    nemoretriever_parse_infer_protocol: str = os.environ.get("NEMORETRIEVER_PARSE_INFER_PROTOCOL", "http")
    nemoretriever_parse_model_name: str = os.environ.get(
        "NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse"
    )

    # --- API keys (empty string when not provided) ---
    ngc_api_key: str = os.environ.get("NGC_API_KEY", "")
    nvidia_build_api_key: str = os.environ.get("NVIDIA_BUILD_API_KEY", "")

    # --- Observability ---
    otel_exporter_otlp_endpoint: str = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")

    # --- OCR ---
    paddle_http_endpoint: str = os.environ.get(
        "PADDLE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
    )
    paddle_infer_protocol: str = os.environ.get("PADDLE_INFER_PROTOCOL", "http")

    # --- Task queue (fixed value, not read from the environment) ---
    REDIS_INGEST_TASK_QUEUE: str = "ingest_task_queue"

    # --- Vision language model ---
    vlm_caption_endpoint: str = os.environ.get(
        "VLM_CAPTION_ENDPOINT",
        "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions",
    )
    vlm_caption_model_name: str = os.environ.get("VLM_CAPTION_MODEL_NAME", "meta/llama-3.2-11b-vision-instruct")

    # --- YOLOX graphic elements ---
    yolox_graphic_elements_http_endpoint: str = os.environ.get(
        "YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT",
        "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1",
    )
    yolox_graphic_elements_infer_protocol: str = os.environ.get("YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL", "http")

    # --- YOLOX page elements ---
    yolox_http_endpoint: str = os.environ.get(
        "YOLOX_HTTP_ENDPOINT",
        "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2",
    )
    yolox_infer_protocol: str = os.environ.get("YOLOX_INFER_PROTOCOL", "http")

    # --- YOLOX table structure ---
    yolox_table_structure_http_endpoint: str = os.environ.get(
        "YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT",
        "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1",
    )
    yolox_table_structure_infer_protocol: str = os.environ.get("YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL", "http")

    model_config = ConfigDict(extra="forbid")
113
+
114
+
def redirect_os_fds(stdout: Optional[TextIO] = None, stderr: Optional[TextIO] = None):
    """
    Redirect OS-level stdout (fd=1) and stderr (fd=2) to the given file-like objects,
    or to /dev/null if not provided.

    /dev/null is opened only when at least one stream is missing, and the temporary
    descriptor is closed again once it has been duplicated, so repeated calls do not
    leak file descriptors.

    Parameters
    ----------
    stdout : Optional[TextIO]
        Stream to receive OS-level stdout. If None, redirected to /dev/null.
    stderr : Optional[TextIO]
        Stream to receive OS-level stderr. If None, redirected to /dev/null.
    """
    devnull_fd = None
    try:
        for stream, target_fd in ((stdout, 1), (stderr, 2)):
            if stream is not None:
                os.dup2(stream.fileno(), target_fd)
                continue
            if devnull_fd is None:
                devnull_fd = os.open(os.devnull, os.O_WRONLY)
            os.dup2(devnull_fd, target_fd)
    finally:
        # If fd 1 or 2 was closed beforehand, os.open may have returned that very slot;
        # closing it would undo the redirection, so only close a genuinely spare fd.
        if devnull_fd is not None and devnull_fd > 2:
            os.close(devnull_fd)
138
+
139
+
def set_pdeathsig(sig=signal.SIGKILL):
    """
    Set the calling process's parent-death signal to ``sig`` (SIGKILL by default).

    Loads ``libc.so.6`` and issues ``prctl(PR_SET_PDEATHSIG, sig)``. The return code of
    ``prctl`` is not inspected.
    """
    pr_set_pdeathsig = 1  # prctl option number for PR_SET_PDEATHSIG
    CDLL("libc.so.6").prctl(pr_set_pdeathsig, c_int(sig))
144
+
145
+
def kill_pipeline_process_group(pid: int):
    """
    SIGKILL the process group that ``pid`` belongs to, reporting the outcome on stdout.

    Parameters
    ----------
    pid : int
        PID of a process whose whole group should be killed.

    Notes
    -----
    Never raises for ordinary failures: a missing process, a permission problem, or any
    other exception is caught and printed instead.
    """
    try:
        group_id = os.getpgid(pid)
        # Signal 0 is a liveness probe (nothing is delivered); SIGKILL follows only if it succeeds.
        for signum in (0, signal.SIGKILL):
            os.killpg(group_id, signum)
        print(f"Killed subprocess group {group_id}")
    except ProcessLookupError:
        print(f"Process group for PID {pid} no longer exists.")
    except PermissionError:
        print(f"Permission denied to kill process group for PID {pid}.")
    except Exception as exc:
        print(f"Failed to kill subprocess group: {exc}")
172
+
173
+
def _run_pipeline_process(
    ingest_config: PipelineCreationSchema,
    disable_dynamic_scaling: Optional[bool],
    dynamic_memory_threshold: Optional[float],
    raw_stdout: Optional[TextIO] = None,
    raw_stderr: Optional[TextIO] = None,
):
    """
    Entry point executed inside the pipeline subprocess.

    Detaches into its own session, points both the OS-level descriptors and the
    Python-level ``sys.stdout``/``sys.stderr`` at the supplied streams (or /dev/null),
    then runs the pipeline in blocking mode.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        Validated pipeline configuration.
    disable_dynamic_scaling : Optional[bool]
        Forwarded to ``_launch_pipeline``.
    dynamic_memory_threshold : Optional[float]
        Forwarded to ``_launch_pipeline``.
    raw_stdout : Optional[TextIO]
        Destination for stdout. Defaults to /dev/null.
    raw_stderr : Optional[TextIO]
        Destination for stderr. Defaults to /dev/null.

    Raises
    ------
    Exception
        Any exception from the pipeline launch is reported on ``sys.__stderr__`` and re-raised.
    """
    # Request the parent-death signal, then start a new session.
    set_pdeathsig()
    os.setsid()  # New process group so the whole group can be SIGKILLed together

    # OS-level descriptors first, then the Python-level stream objects.
    redirect_os_fds(stdout=raw_stdout, stderr=raw_stderr)
    sys.stdout = raw_stdout if raw_stdout else open(os.devnull, "w")
    sys.stderr = raw_stderr if raw_stderr else open(os.devnull, "w")

    scaling_options = {
        "disable_dynamic_scaling": disable_dynamic_scaling,
        "dynamic_memory_threshold": dynamic_memory_threshold,
    }
    try:
        _launch_pipeline(ingest_config, block=True, **scaling_options)
    except Exception as exc:
        sys.__stderr__.write(f"Subprocess pipeline run failed: {exc}\n")
        raise
219
+
220
+
def _launch_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool,
    disable_dynamic_scaling: Optional[bool] = None,
    dynamic_memory_threshold: Optional[float] = None,
) -> Tuple[Union[RayPipeline, None], float]:
    """
    Build the ingestion pipeline, start it, and optionally block until interrupted.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        Configuration; its ``model_dump()`` is handed to ``setup_ingestion_pipeline``.
    block : bool
        If True, sleep in a loop until ``KeyboardInterrupt``, then stop the pipeline and
        shut Ray down. If False, return right after the pipeline has been started.
    disable_dynamic_scaling : Optional[bool], default=None
        Explicit override. ``None`` falls back to ``DISABLE_DYNAMIC_SCALING``.
    dynamic_memory_threshold : Optional[float], default=None
        Explicit override. ``None`` falls back to ``DYNAMIC_MEMORY_THRESHOLD``.
        Any non-None value, including ``0.0``, is honored as given.

    Returns
    -------
    Tuple[Union[RayPipeline, None], float]
        ``(None, total_elapsed_seconds)`` when ``block`` is True, otherwise
        ``(pipeline, 0.0)``.
    """
    logger.info("Starting pipeline setup")

    if disable_dynamic_scaling is None:
        dynamic_memory_scaling = not DISABLE_DYNAMIC_SCALING
    else:
        dynamic_memory_scaling = not disable_dynamic_scaling

    # Compare against None rather than truthiness so an explicit 0.0 is not discarded.
    if dynamic_memory_threshold is None:
        dynamic_memory_threshold = DYNAMIC_MEMORY_THRESHOLD

    scaling_config = ScalingConfig(
        dynamic_memory_scaling=dynamic_memory_scaling, dynamic_memory_threshold=dynamic_memory_threshold
    )

    pipeline = RayPipeline(scaling_config=scaling_config)
    start_abs = datetime.now()

    # Set up the ingestion pipeline; the returned Ray context is not needed here.
    setup_ingestion_pipeline(pipeline, ingest_config.model_dump())

    # Record setup time
    end_setup = start_run = datetime.now()
    setup_elapsed = (end_setup - start_abs).total_seconds()
    logger.info(f"Pipeline setup completed in {setup_elapsed:.2f} seconds")

    # Run the pipeline
    logger.debug("Running pipeline")
    pipeline.start()

    if not block:
        return pipeline, 0.0

    try:
        # The pipeline runs in the background; this loop only keeps the caller parked.
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        logger.info("Interrupt received, shutting down pipeline.")
        pipeline.stop()
        ray.shutdown()
        logger.info("Ray shutdown complete.")

    # Record execution times
    end_run = datetime.now()
    run_elapsed = (end_run - start_run).total_seconds()
    total_elapsed = (end_run - start_abs).total_seconds()

    logger.info(f"Pipeline run completed in {run_elapsed:.2f} seconds")
    logger.info(f"Total time elapsed: {total_elapsed:.2f} seconds")

    return None, total_elapsed
275
+
276
+
def run_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool = True,
    disable_dynamic_scaling: Optional[bool] = None,
    dynamic_memory_threshold: Optional[float] = None,
    run_in_subprocess: bool = False,
    stdout: Optional[TextIO] = None,
    stderr: Optional[TextIO] = None,
) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
    """
    Launch and manage a pipeline, optionally in a subprocess.

    This function is the primary entry point for executing a Ray pipeline,
    either within the current process or in a separate Python subprocess.
    It supports synchronous blocking execution or non-blocking lifecycle management,
    and allows redirection of subprocess output to specified file-like objects.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        The validated configuration object used to construct and launch the pipeline.
    block : bool, default=True
        If True, blocks until the pipeline (or the pipeline subprocess) finishes.
        If False, returns an interface to control the pipeline externally.
    disable_dynamic_scaling : Optional[bool], default=None
        If True, disables dynamic memory scaling. Overrides global configuration if set.
        If None, uses the globally defined behavior.
    dynamic_memory_threshold : Optional[float], default=None
        Memory threshold forwarded to the pipeline's scaling configuration.
        Defaults to the globally configured value if None.
    run_in_subprocess : bool, default=False
        If True, launches the pipeline in a separate process created from the
        ``multiprocessing`` "fork" context. If False, runs in the current process.
    stdout : Optional[TextIO], default=None
        Stream that receives the subprocess's stdout. If None, /dev/null is used.
        Only consulted when ``run_in_subprocess`` is True.
    stderr : Optional[TextIO], default=None
        Stream that receives the subprocess's stderr. If None, /dev/null is used.
        Only consulted when ``run_in_subprocess`` is True.

    Returns
    -------
    Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]
        - In-process with `block=True`: elapsed time in seconds (float).
        - In-process with `block=False`: a `RayPipelineInterface`.
        - Subprocess with `block=False`: a `RayPipelineSubprocessInterface`.
        - Subprocess with `block=True`: seconds spent waiting for the subprocess to exit (float).

    Raises
    ------
    Exception
        Exceptions raised while launching the pipeline in-process propagate to the caller.
        The subprocess exit code is not inspected here, so a failing subprocess does not
        raise in the parent.
    """
    if run_in_subprocess:
        logger.info("Launching pipeline in Python subprocess using multiprocessing.")

        ctx = multiprocessing.get_context("fork")
        process = ctx.Process(
            target=_run_pipeline_process,
            args=(
                ingest_config,
                disable_dynamic_scaling,
                dynamic_memory_threshold,
                stdout,  # raw_stdout
                stderr,  # raw_stderr
            ),
            daemon=False,
        )

        process.start()

        interface = RayPipelineSubprocessInterface(process)

        if block:
            # Monotonic clock: the measured wait is unaffected by wall-clock adjustments.
            start_time = time.monotonic()
            logger.info("Waiting for subprocess pipeline to complete...")
            process.join()
            logger.info("Pipeline subprocess completed.")
            return time.monotonic() - start_time

        logger.info(f"Pipeline subprocess started (PID={process.pid})")
        # Kill the subprocess's process group when this interpreter exits.
        atexit.register(lambda: kill_pipeline_process_group(process.pid))

        return interface

    # Run inline
    pipeline, total_elapsed = _launch_pipeline(
        ingest_config,
        block=block,
        disable_dynamic_scaling=disable_dynamic_scaling,
        dynamic_memory_threshold=dynamic_memory_threshold,
    )

    if block:
        logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
        return total_elapsed

    return RayPipelineInterface(pipeline)