nv-ingest 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +45 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/api/v1/metrics.py +29 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +591 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1322 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +200 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +624 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-25.6.0.dist-info/METADATA +266 -0
- nv_ingest-25.6.0.dist-info/RECORD +102 -0
- nv_ingest-25.6.0.dist-info/WHEEL +5 -0
- nv_ingest-25.6.0.dist-info/licenses/LICENSE +201 -0
- nv_ingest-25.6.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import math
|
|
8
|
+
import os
|
|
9
|
+
from typing import Dict, Any
|
|
10
|
+
|
|
11
|
+
import ray
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline
|
|
15
|
+
from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
|
|
16
|
+
add_source_stage,
|
|
17
|
+
add_metadata_injector_stage,
|
|
18
|
+
add_pdf_extractor_stage,
|
|
19
|
+
add_image_extractor_stage,
|
|
20
|
+
add_docx_extractor_stage,
|
|
21
|
+
add_audio_extractor_stage,
|
|
22
|
+
add_html_extractor_stage,
|
|
23
|
+
add_image_dedup_stage,
|
|
24
|
+
add_image_filter_stage,
|
|
25
|
+
add_table_extractor_stage,
|
|
26
|
+
add_chart_extractor_stage,
|
|
27
|
+
add_image_caption_stage,
|
|
28
|
+
add_text_splitter_stage,
|
|
29
|
+
add_text_embedding_stage,
|
|
30
|
+
add_embedding_storage_stage,
|
|
31
|
+
add_image_storage_stage,
|
|
32
|
+
add_message_broker_response_stage,
|
|
33
|
+
add_pptx_extractor_stage,
|
|
34
|
+
add_infographic_extractor_stage,
|
|
35
|
+
add_otel_tracer_stage,
|
|
36
|
+
add_default_drain_stage,
|
|
37
|
+
)
|
|
38
|
+
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger("uvicorn")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def export_config_to_env(ingest_config: Any) -> None:
    """
    Export an ingest configuration into the process environment.

    Each key is upper-cased and written to ``os.environ`` so that code reading
    its settings from environment variables sees the supplied configuration.

    Parameters
    ----------
    ingest_config : Any
        Either a pydantic ``BaseModel`` (converted with ``model_dump()``) or a
        mapping of setting names to values.

    Notes
    -----
    ``os.environ`` only accepts string values, so every value is converted with
    ``str()`` before export. Entries whose value is ``None`` are skipped instead
    of being exported as the literal text ``"None"``.
    """
    if isinstance(ingest_config, BaseModel):
        ingest_config = ingest_config.model_dump()

    os.environ.update(
        {
            key.upper(): str(val)
            for key, val in ingest_config.items()
            # A missing value must not overwrite the environment with "None".
            if val is not None
        }
    )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any] = None):
    """
    Initialize Ray, register every ingestion stage on ``pipeline``, connect the
    stages into one linear chain, and build the pipeline.

    Parameters
    ----------
    pipeline : RayPipeline
        Pipeline that receives the stages and edges. ``pipeline.build()`` is
        called before this function returns.
    ingest_config : Dict[str, Any], optional
        When truthy, it is exported to the process environment with
        ``export_config_to_env`` before Ray is initialized.

    Returns
    -------
    The value returned by ``ray.init``.
    """
    # Publish the configuration as environment variables first.
    if ingest_config:
        export_config_to_env(ingest_config)

    # Ray object spilling: filesystem backed, spread over four /tmp directories.
    spill_settings = {
        "type": "filesystem",
        "params": {
            "directory_path": [
                "/tmp/ray_spill_testing_0",
                "/tmp/ray_spill_testing_1",
                "/tmp/ray_spill_testing_2",
                "/tmp/ray_spill_testing_3",
            ],
            "buffer_size": 100_000_000,
        },
    }
    ray_context = ray.init(
        namespace="nv_ingest_ray",
        # Ray logs at the same effective level as the root logger.
        logging_level=logging.getLogger().getEffectiveLevel(),
        ignore_reinit_error=True,
        dashboard_host="0.0.0.0",
        dashboard_port=8265,
        _system_config={
            "local_fs_capacity_threshold": 0.9,
            "object_spilling_config": json.dumps(spill_settings),
        },
    )

    # Value handed to every stage builder: NV_INGEST_MAX_UTIL when set, otherwise
    # the effective core count rounded down (never below 1).
    probe = SystemResourceProbe()
    core_count = probe.get_effective_cores()
    fallback_count = int(max(1, math.floor(core_count)))
    default_cpu_count = int(os.environ.get("NV_INGEST_MAX_UTIL", fallback_count))

    add_meter_stage = os.environ.get("MESSAGE_CLIENT_TYPE") != "simple"
    _ = add_meter_stage  # TODO(Devin): currently unused, the OTel meter stage is not wired in yet.

    # ---- Insertion and pre-processing stages ----
    logger.debug("Setting up ingestion pipeline")
    source_id = add_source_stage(pipeline, default_cpu_count)
    # TODO(Devin): the job counter used a global stats object that isn't Ray compatible; needs an update.
    injector_id = add_metadata_injector_stage(pipeline, default_cpu_count)

    # ---- Primitive extraction ----
    pdf_id = add_pdf_extractor_stage(pipeline, default_cpu_count)
    image_id = add_image_extractor_stage(pipeline, default_cpu_count)
    docx_id = add_docx_extractor_stage(pipeline, default_cpu_count)
    pptx_id = add_pptx_extractor_stage(pipeline, default_cpu_count)
    audio_id = add_audio_extractor_stage(pipeline, default_cpu_count)
    html_id = add_html_extractor_stage(pipeline, default_cpu_count)

    # ---- Post-processing ----
    dedup_id = add_image_dedup_stage(pipeline, default_cpu_count)
    filter_id = add_image_filter_stage(pipeline, default_cpu_count)
    table_id = add_table_extractor_stage(pipeline, default_cpu_count)
    chart_id = add_chart_extractor_stage(pipeline, default_cpu_count)
    infographic_id = add_infographic_extractor_stage(pipeline, default_cpu_count)
    caption_id = add_image_caption_stage(pipeline, default_cpu_count)

    # ---- Transforms and data synthesis ----
    splitter_id = add_text_splitter_stage(pipeline, default_cpu_count)
    embed_id = add_text_embedding_stage(pipeline, default_cpu_count)

    # ---- Storage and output ----
    embedding_store_id = add_embedding_storage_stage(pipeline, default_cpu_count)
    image_store_id = add_image_storage_stage(pipeline, default_cpu_count)
    response_id = add_message_broker_response_stage(pipeline, default_cpu_count)

    # ---- Telemetry (everything after the response stage is off the hot path; keep it that way) ----
    tracer_id = add_otel_tracer_stage(pipeline, default_cpu_count)
    # TODO(devin): OTel meter and completed-job counter stages are not added yet.

    # Drain stage -- flushes and deletes control messages.
    drain_id = add_default_drain_stage(pipeline, default_cpu_count)

    ingest_edge_buffer_size = int(os.environ.get("INGEST_EDGE_BUFFER_SIZE", 32))

    # The topology is a single chain; each entry feeds the one that follows it.
    stage_chain = [
        # Intake
        source_id,
        injector_id,
        # Document extractors
        pdf_id,
        audio_id,
        docx_id,
        pptx_id,
        image_id,
        html_id,
        # Primitive extractors
        infographic_id,
        table_id,
        chart_id,
        # Primitive mutators
        filter_id,
        dedup_id,
        # Primitive transforms
        splitter_id,
        embed_id,
        caption_id,
        # Primitive storage
        image_store_id,
        embedding_store_id,
        # Response and telemetry
        response_id,
        tracer_id,
        drain_id,
    ]
    for upstream_id, downstream_id in zip(stage_chain, stage_chain[1:]):
        pipeline.make_edge(upstream_id, downstream_id, queue_size=ingest_edge_buffer_size)

    pipeline.build()

    # TODO(devin): add the meter / completed-job-counter edges once those stages exist.

    return ray_context
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import atexit
|
|
6
|
+
import logging
|
|
7
|
+
import multiprocessing
|
|
8
|
+
import os
|
|
9
|
+
import signal
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
from ctypes import CDLL, c_int
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from typing import Union, Tuple, Optional, TextIO
|
|
15
|
+
|
|
16
|
+
import ray
|
|
17
|
+
from pydantic import BaseModel, ConfigDict
|
|
18
|
+
|
|
19
|
+
from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
|
|
20
|
+
RayPipeline,
|
|
21
|
+
ScalingConfig,
|
|
22
|
+
RayPipelineSubprocessInterface,
|
|
23
|
+
RayPipelineInterface,
|
|
24
|
+
)
|
|
25
|
+
from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def str_to_bool(value: str) -> bool:
    """
    Interpret a textual flag as a boolean.

    Surrounding whitespace and letter case are ignored. The values ``"1"``,
    ``"true"``, ``"yes"`` and ``"on"`` map to True; anything else maps to False.
    """
    normalized = value.strip().lower()
    return normalized in ("1", "true", "yes", "on")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Module-wide defaults, read from the environment once when this module is imported.
# INGEST_DISABLE_DYNAMIC_SCALING is parsed with str_to_bool and defaults to "false".
DISABLE_DYNAMIC_SCALING = str_to_bool(os.environ.get("INGEST_DISABLE_DYNAMIC_SCALING", "false"))
# INGEST_DYNAMIC_MEMORY_THRESHOLD is parsed as a float and defaults to 0.75.
DYNAMIC_MEMORY_THRESHOLD = float(os.environ.get("INGEST_DYNAMIC_MEMORY_THRESHOLD", 0.75))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class PipelineCreationSchema(BaseModel):
    """
    Schema for pipeline creation configuration.

    Contains all parameters required to set up and execute the pipeline,
    including endpoints, API keys, and processing options.

    Notes
    -----
    Every field is typed ``str``, including numeric-looking settings such as the
    message client port and the worker count.

    Field defaults come from ``os.getenv`` calls placed in the class body, so the
    environment is read once, when the class is defined; later changes to the
    environment do not alter these defaults.
    """

    arrow_default_memory_pool: str = os.getenv("ARROW_DEFAULT_MEMORY_POOL", "system")

    # Audio processing settings
    audio_grpc_endpoint: str = os.getenv("AUDIO_GRPC_ENDPOINT", "grpc.nvcf.nvidia.com:443")
    audio_function_id: str = os.getenv("AUDIO_FUNCTION_ID", "1598d209-5e27-4d3c-8079-4751568b1081")
    audio_infer_protocol: str = os.getenv("AUDIO_INFER_PROTOCOL", "grpc")

    # Embedding model settings
    embedding_nim_endpoint: str = os.getenv("EMBEDDING_NIM_ENDPOINT", "https://integrate.api.nvidia.com/v1")
    embedding_nim_model_name: str = os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")

    # General pipeline settings
    ingest_log_level: str = os.getenv("INGEST_LOG_LEVEL", "INFO")
    # Stored as a string even though the default looks numeric.
    max_ingest_process_workers: str = os.getenv("MAX_INGEST_PROCESS_WORKERS", "16")

    # Messaging configuration
    message_client_host: str = os.getenv("MESSAGE_CLIENT_HOST", "localhost")
    # Stored as a string even though the default looks numeric.
    message_client_port: str = os.getenv("MESSAGE_CLIENT_PORT", "7671")
    message_client_type: str = os.getenv("MESSAGE_CLIENT_TYPE", "simple")

    # NeMo Retriever settings
    nemoretriever_parse_http_endpoint: str = os.getenv(
        "NEMORETRIEVER_PARSE_HTTP_ENDPOINT", "https://integrate.api.nvidia.com/v1/chat/completions"
    )
    nemoretriever_parse_infer_protocol: str = os.getenv("NEMORETRIEVER_PARSE_INFER_PROTOCOL", "http")
    nemoretriever_parse_model_name: str = os.getenv("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")

    # API keys (default to the empty string when the variable is unset)
    ngc_api_key: str = os.getenv("NGC_API_KEY", "")
    nvidia_build_api_key: str = os.getenv("NVIDIA_BUILD_API_KEY", "")

    # Observability settings
    otel_exporter_otlp_endpoint: str = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")

    # OCR settings
    paddle_http_endpoint: str = os.getenv("PADDLE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr")
    paddle_infer_protocol: str = os.getenv("PADDLE_INFER_PROTOCOL", "http")

    # Task queue settings
    # Fixed default; unlike the other fields it is not read from the environment.
    REDIS_INGEST_TASK_QUEUE: str = "ingest_task_queue"

    # Vision language model settings
    vlm_caption_endpoint: str = os.getenv(
        "VLM_CAPTION_ENDPOINT", "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
    )
    vlm_caption_model_name: str = os.getenv("VLM_CAPTION_MODEL_NAME", "meta/llama-3.2-11b-vision-instruct")

    # YOLOX image processing settings
    yolox_graphic_elements_http_endpoint: str = os.getenv(
        "YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT",
        "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1",
    )
    yolox_graphic_elements_infer_protocol: str = os.getenv("YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL", "http")

    # YOLOX page elements settings
    yolox_http_endpoint: str = os.getenv(
        "YOLOX_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
    )
    yolox_infer_protocol: str = os.getenv("YOLOX_INFER_PROTOCOL", "http")

    # YOLOX table structure settings
    yolox_table_structure_http_endpoint: str = os.getenv(
        "YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1"
    )
    yolox_table_structure_infer_protocol: str = os.getenv("YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL", "http")

    # pydantic model configuration: extra (undeclared) fields are set to "forbid".
    model_config = ConfigDict(extra="forbid")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def redirect_os_fds(stdout: Optional[TextIO] = None, stderr: Optional[TextIO] = None):
    """
    Redirect OS-level stdout (fd=1) and stderr (fd=2) to the given file-like objects,
    or to /dev/null if not provided.

    The /dev/null descriptor is opened only when at least one stream is missing
    and is closed again once it has been duplicated, so repeated calls do not
    leak file descriptors.

    Parameters
    ----------
    stdout : Optional[TextIO]
        Stream to receive OS-level stdout. If None, redirected to /dev/null.
    stderr : Optional[TextIO]
        Stream to receive OS-level stderr. If None, redirected to /dev/null.
    """
    devnull_fd = None
    try:
        for target_fd, stream in ((1, stdout), (2, stderr)):
            if stream is not None:
                os.dup2(stream.fileno(), target_fd)
                continue

            # Open /dev/null lazily and share it between both targets.
            if devnull_fd is None:
                devnull_fd = os.open(os.devnull, os.O_WRONLY)
            os.dup2(devnull_fd, target_fd)
    finally:
        # dup2 made independent copies on fd 1/2, so the helper descriptor can go.
        # If fd 1 or 2 was closed beforehand, os.open may have returned that very
        # number; closing it would undo the redirect, so leave it alone then.
        if devnull_fd is not None and devnull_fd > 2:
            os.close(devnull_fd)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def set_pdeathsig(sig=signal.SIGKILL):
    """
    Register a parent-death signal for the calling process.

    Calls ``prctl`` from ``libc.so.6`` with the ``PR_SET_PDEATHSIG`` option so
    that ``sig`` (SIGKILL by default) is delivered to this process when its
    parent dies. The return value of ``prctl`` is not inspected.
    """
    pr_set_pdeathsig = 1  # option number of PR_SET_PDEATHSIG
    CDLL("libc.so.6").prctl(pr_set_pdeathsig, c_int(sig))
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def kill_pipeline_process_group(pid: int):
    """
    SIGKILL the whole process group that ``pid`` belongs to, if it still exists.

    The outcome is reported with ``print``; no exception escapes this function.

    Parameters
    ----------
    pid : int
        PID of a process whose process group should be killed.
    """
    try:
        group_id = os.getpgid(pid)

        # Signal 0 delivers nothing; it only raises if the group cannot be
        # signalled. The real SIGKILL follows once the probe succeeds.
        for signo in (0, signal.SIGKILL):
            os.killpg(group_id, signo)

        print(f"Killed subprocess group {group_id}")
    except ProcessLookupError:
        print(f"Process group for PID {pid} no longer exists.")
    except PermissionError:
        print(f"Permission denied to kill process group for PID {pid}.")
    except Exception as exc:
        print(f"Failed to kill subprocess group: {exc}")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _run_pipeline_process(
    ingest_config: PipelineCreationSchema,
    disable_dynamic_scaling: Optional[bool],
    dynamic_memory_threshold: Optional[float],
    raw_stdout: Optional[TextIO] = None,
    raw_stderr: Optional[TextIO] = None,
):
    """
    Entry point executed inside the pipeline subprocess.

    Registers a parent-death signal, starts a new session, points both the
    OS-level descriptors and Python's ``sys.stdout`` / ``sys.stderr`` at the
    supplied streams (or /dev/null), then launches the pipeline in blocking mode.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        Validated pipeline configuration.
    disable_dynamic_scaling : Optional[bool]
        Forwarded to ``_launch_pipeline``.
    dynamic_memory_threshold : Optional[float]
        Forwarded to ``_launch_pipeline``.
    raw_stdout : Optional[TextIO]
        Destination for stdout. Defaults to /dev/null.
    raw_stderr : Optional[TextIO]
        Destination for stderr. Defaults to /dev/null.

    Raises
    ------
    Exception
        Any exception from the pipeline launch is written to ``sys.__stderr__``
        and then re-raised.
    """
    # Register the death signal for this subprocess.
    set_pdeathsig()
    # New session => new process group, so the whole group can be SIGKILLed.
    os.setsid()

    # OS-level descriptors first ...
    redirect_os_fds(stdout=raw_stdout, stderr=raw_stderr)

    # ... then the Python-level stream objects.
    if raw_stdout:
        sys.stdout = raw_stdout
    else:
        sys.stdout = open(os.devnull, "w")
    if raw_stderr:
        sys.stderr = raw_stderr
    else:
        sys.stderr = open(os.devnull, "w")

    launch_options = {
        "block": True,
        "disable_dynamic_scaling": disable_dynamic_scaling,
        "dynamic_memory_threshold": dynamic_memory_threshold,
    }
    try:
        _launch_pipeline(ingest_config, **launch_options)
    except Exception as exc:
        sys.__stderr__.write(f"Subprocess pipeline run failed: {exc}\n")
        raise
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _launch_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool,
    disable_dynamic_scaling: Optional[bool] = None,
    dynamic_memory_threshold: Optional[float] = None,
) -> Tuple[Union[RayPipeline, None], float]:
    """
    Build the ingestion pipeline, start it, and optionally block until interrupted.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        Validated configuration; its ``model_dump()`` is passed to
        ``setup_ingestion_pipeline``.
    block : bool
        If True, sleep in a loop until ``KeyboardInterrupt``, then stop the
        pipeline and shut Ray down. If False, return right after starting.
    disable_dynamic_scaling : Optional[bool]
        Overrides the module-level ``DISABLE_DYNAMIC_SCALING`` when not None.
    dynamic_memory_threshold : Optional[float]
        Overrides the module-level ``DYNAMIC_MEMORY_THRESHOLD`` when not None.
        An explicit ``0.0`` is honoured rather than treated as "unset".

    Returns
    -------
    Tuple[Union[RayPipeline, None], float]
        ``(None, total_elapsed_seconds)`` when ``block`` is True, otherwise
        ``(pipeline, 0.0)``.
    """
    logger.info("Starting pipeline setup")

    dynamic_memory_scaling = not DISABLE_DYNAMIC_SCALING
    if disable_dynamic_scaling is not None:
        dynamic_memory_scaling = not disable_dynamic_scaling

    # Compare against None (not truthiness) so a caller-supplied 0.0 is kept,
    # mirroring the handling of disable_dynamic_scaling above.
    if dynamic_memory_threshold is None:
        dynamic_memory_threshold = DYNAMIC_MEMORY_THRESHOLD

    scaling_config = ScalingConfig(
        dynamic_memory_scaling=dynamic_memory_scaling, dynamic_memory_threshold=dynamic_memory_threshold
    )

    pipeline = RayPipeline(scaling_config=scaling_config)
    start_abs = datetime.now()

    # Set up the ingestion pipeline
    _ = setup_ingestion_pipeline(pipeline, ingest_config.model_dump())

    # Record setup time
    end_setup = start_run = datetime.now()
    setup_elapsed = (end_setup - start_abs).total_seconds()
    logger.info(f"Pipeline setup completed in {setup_elapsed:.2f} seconds")

    # Run the pipeline
    logger.debug("Running pipeline")
    pipeline.start()

    if block:
        try:
            # Nothing to do here; just keep this thread alive until interrupted.
            while True:
                time.sleep(5)
        except KeyboardInterrupt:
            logger.info("Interrupt received, shutting down pipeline.")
            pipeline.stop()
            ray.shutdown()
            logger.info("Ray shutdown complete.")

        # Record execution times
        end_run = datetime.now()
        run_elapsed = (end_run - start_run).total_seconds()
        total_elapsed = (end_run - start_abs).total_seconds()

        logger.info(f"Pipeline run completed in {run_elapsed:.2f} seconds")
        logger.info(f"Total time elapsed: {total_elapsed:.2f} seconds")

        return None, total_elapsed
    else:
        return pipeline, 0.0
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def run_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool = True,
    disable_dynamic_scaling: Optional[bool] = None,
    dynamic_memory_threshold: Optional[float] = None,
    run_in_subprocess: bool = False,
    stdout: Optional[TextIO] = None,
    stderr: Optional[TextIO] = None,
) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
    """
    Launch and manage a pipeline, optionally in a subprocess.

    This function is the primary entry point for executing a Ray pipeline,
    either within the current process or in a separate Python subprocess.
    It supports synchronous blocking execution or non-blocking lifecycle management,
    and allows redirection of subprocess output to specified file-like objects.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        The validated configuration object used to construct and launch the pipeline.
    block : bool, default=True
        If True, blocks until the pipeline (or the pipeline subprocess) completes.
        If False, returns an interface to control the pipeline externally.
    disable_dynamic_scaling : Optional[bool], default=None
        If True, disables dynamic memory scaling. Overrides global configuration if set.
        If None, uses the default or globally defined behavior.
    dynamic_memory_threshold : Optional[float], default=None
        The memory usage threshold (as a float between 0 and 1) that triggers autoscaling,
        if dynamic scaling is enabled. Defaults to the globally configured value if None.
    run_in_subprocess : bool, default=False
        If True, launches the pipeline in a separate Python subprocess using `multiprocessing.Process`
        created from the "fork" start-method context.
        If False, runs the pipeline in the current process.
    stdout : Optional[TextIO], default=None
        Optional file-like stream to which subprocess stdout should be redirected.
        If None, stdout is redirected to /dev/null.
        Only forwarded when `run_in_subprocess` is True; unused for in-process runs.
    stderr : Optional[TextIO], default=None
        Optional file-like stream to which subprocess stderr should be redirected.
        If None, stderr is redirected to /dev/null.
        Only forwarded when `run_in_subprocess` is True; unused for in-process runs.

    Returns
    -------
    Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]
        - If run in-process with `block=True`: returns elapsed time in seconds (float).
        - If run in-process with `block=False`: returns a `RayPipelineInterface`.
        - If run in subprocess with `block=False`: returns a `RayPipelineSubprocessInterface`.
        - If run in subprocess with `block=True`: returns the time in seconds (float) spent
          waiting for the subprocess to exit.

    Raises
    ------
    RuntimeError
        If the subprocess is awaited (`block=True`) and exits with a non-zero exit code.
    Exception
        Any other exceptions raised during pipeline launch or configuration.
    """
    if run_in_subprocess:
        logger.info("Launching pipeline in Python subprocess using multiprocessing.")

        # The "fork" start method is requested explicitly; it is only available on POSIX platforms.
        ctx = multiprocessing.get_context("fork")
        process = ctx.Process(
            target=_run_pipeline_process,
            args=(
                ingest_config,
                disable_dynamic_scaling,
                dynamic_memory_threshold,
                stdout,  # raw_stdout
                stderr,  # raw_stderr
            ),
            daemon=False,
        )

        process.start()

        if block:
            # A monotonic clock keeps the measured wait immune to wall-clock adjustments.
            wait_started = time.monotonic()
            logger.info("Waiting for subprocess pipeline to complete...")
            process.join()
            waited = time.monotonic() - wait_started

            # join() without a timeout only returns once the child has exited, so exitcode is set.
            # Surface a failed child instead of reporting it as a normal completion.
            exit_code = process.exitcode
            if exit_code != 0:
                logger.error(f"Pipeline subprocess (PID={process.pid}) exited with code {exit_code}")
                raise RuntimeError(f"Pipeline subprocess (PID={process.pid}) exited with code {exit_code}")

            logger.info("Pipeline subprocess completed.")
            return waited

        logger.info(f"Pipeline subprocess started (PID={process.pid})")
        # Make sure the subprocess's process group is cleaned up when this interpreter exits.
        atexit.register(kill_pipeline_process_group, process.pid)

        return RayPipelineSubprocessInterface(process)

    # Run inline
    pipeline, total_elapsed = _launch_pipeline(
        ingest_config,
        block=block,
        disable_dynamic_scaling=disable_dynamic_scaling,
        dynamic_memory_threshold=dynamic_memory_threshold,
    )

    if block:
        logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
        return total_elapsed

    return RayPipelineInterface(pipeline)
|