nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,609 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# TODO(Devin)
|
|
6
|
+
# flake8: noqa
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
from nv_ingest.framework.orchestration.ray.stages.sinks.default_drain import DefaultDrainSink
|
|
13
|
+
from nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer import OpenTelemetryTracerStage
|
|
14
|
+
from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
|
|
15
|
+
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
16
|
+
|
|
17
|
+
# Import our new pipeline class.
|
|
18
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.audio_extractor import AudioExtractorStage
|
|
19
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.chart_extractor import ChartExtractorStage
|
|
20
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.docx_extractor import DocxExtractorStage
|
|
21
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.image_extractor import ImageExtractorStage
|
|
22
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extractor import InfographicExtractorStage
|
|
23
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
|
|
24
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
|
|
25
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
|
|
26
|
+
|
|
27
|
+
from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
|
|
28
|
+
from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
|
|
29
|
+
from nv_ingest.framework.orchestration.ray.stages.mutate.image_filter import ImageFilterStage
|
|
30
|
+
from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink import (
|
|
31
|
+
MessageBrokerTaskSinkStage,
|
|
32
|
+
MessageBrokerTaskSinkConfig,
|
|
33
|
+
)
|
|
34
|
+
from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
|
|
35
|
+
MessageBrokerTaskSourceStage,
|
|
36
|
+
MessageBrokerTaskSourceConfig,
|
|
37
|
+
start_simple_message_broker,
|
|
38
|
+
)
|
|
39
|
+
from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
|
|
40
|
+
from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
|
|
41
|
+
from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
|
|
42
|
+
from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
|
|
43
|
+
from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
|
|
44
|
+
from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
|
|
45
|
+
from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
|
|
46
|
+
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
47
|
+
from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
|
|
48
|
+
from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
|
|
49
|
+
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
|
|
50
|
+
from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
|
|
51
|
+
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
52
|
+
from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
|
|
53
|
+
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
|
|
54
|
+
from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
|
|
55
|
+
from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
|
|
56
|
+
from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
|
|
57
|
+
from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
|
|
58
|
+
from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
|
|
59
|
+
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
|
|
60
|
+
|
|
61
|
+
logger = logging.getLogger(__name__)
|
|
62
|
+
|
|
63
|
+
_system_resource_probe = SystemResourceProbe()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def validate_positive(ctx, param, value):
    """Click option callback: accept only positive integers.

    Raises ``click.BadParameter`` for zero or negative values and returns the
    value unchanged otherwise.
    """
    if value > 0:
        return value
    raise click.BadParameter("must be a positive integer")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_message_provider_config():
    """Resolve the message broker host and port from the environment.

    Returns
    -------
    tuple
        ``(host, port)`` strings, defaulting to ``("localhost", "6379")``.
    """
    host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
    port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")

    logger.info(f"MESSAGE_CLIENT_HOST: {host}")
    logger.info(f"MESSAGE_CLIENT_PORT: {port}")

    return host, port
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def get_caption_classifier_service():
    """Resolve the caption-classifier Triton endpoint and model name.

    Both values come from the environment and default to empty strings when
    unset.

    Returns
    -------
    tuple
        ``(grpc_endpoint, model_name)``.
    """
    endpoint = os.environ.get("CAPTION_CLASSIFIER_GRPC_TRITON", "")
    model_name = os.environ.get("CAPTION_CLASSIFIER_MODEL_NAME", "")

    logger.info(f"CAPTION_CLASSIFIER_GRPC_TRITON: {endpoint}")

    return endpoint, model_name
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_nim_service(env_var_prefix):
    """Resolve connection settings for a NIM service from environment variables.

    Reads ``{PREFIX}_GRPC_ENDPOINT``, ``{PREFIX}_HTTP_ENDPOINT`` and
    ``{PREFIX}_INFER_PROTOCOL`` (prefix upper-cased). The auth token comes
    from ``NVIDIA_BUILD_API_KEY``, falling back to ``NGC_API_KEY``. When the
    protocol is unset it defaults to "http" if an HTTP endpoint exists,
    otherwise "grpc" if a gRPC endpoint exists, otherwise "".

    Returns
    -------
    tuple
        ``(grpc_endpoint, http_endpoint, auth_token, infer_protocol)``.
    """
    prefix = env_var_prefix.upper()

    grpc_endpoint = os.environ.get(f"{prefix}_GRPC_ENDPOINT", "")
    http_endpoint = os.environ.get(f"{prefix}_HTTP_ENDPOINT", "")
    auth_token = os.environ.get("NVIDIA_BUILD_API_KEY", "") or os.environ.get("NGC_API_KEY", "")

    if http_endpoint:
        default_protocol = "http"
    elif grpc_endpoint:
        default_protocol = "grpc"
    else:
        default_protocol = ""
    infer_protocol = os.environ.get(f"{prefix}_INFER_PROTOCOL", default_protocol)

    logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
    logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
    logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")

    return grpc_endpoint, http_endpoint, auth_token, infer_protocol
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def get_audio_retrieval_service(env_var_prefix):
    """Resolve connection settings for the audio retrieval service.

    Like :func:`get_nim_service`, but additionally returns the NVCF
    function id from ``{PREFIX}_FUNCTION_ID``.

    Bug fix: the environment variable names are now derived from
    ``env_var_prefix`` (as the log messages always claimed) instead of being
    hard-coded to ``AUDIO_*``. Behavior is unchanged for the existing
    ``"audio"`` caller.

    Returns
    -------
    tuple
        ``(grpc_endpoint, http_endpoint, auth_token, infer_protocol, function_id)``.
    """
    prefix = env_var_prefix.upper()

    grpc_endpoint = os.environ.get(f"{prefix}_GRPC_ENDPOINT", "")
    http_endpoint = os.environ.get(f"{prefix}_HTTP_ENDPOINT", "")
    # All NIM-style services currently share a single auth token.
    auth_token = os.environ.get("NVIDIA_BUILD_API_KEY", "") or os.environ.get("NGC_API_KEY", "")
    infer_protocol = os.environ.get(
        f"{prefix}_INFER_PROTOCOL",
        "http" if http_endpoint else "grpc" if grpc_endpoint else "",
    )
    function_id = os.environ.get(f"{prefix}_FUNCTION_ID", "")

    logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
    logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
    logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
    logger.info(f"{prefix}_FUNCTION_ID: {function_id}")

    return grpc_endpoint, http_endpoint, auth_token, infer_protocol, function_id
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def add_metadata_injector_stage(pipeline, default_cpu_count, stage_name="metadata_injector"):
    """Register the metadata-injection stage on the pipeline.

    ``default_cpu_count`` is currently unused but kept for signature
    consistency with the other ``add_*_stage`` helpers.

    Returns the stage name so callers can wire edges.
    """
    _ = default_cpu_count  # Placeholder for future use

    pipeline.add_stage(
        name=stage_name,
        stage_actor=MetadataInjectionStage,
        config=MetadataInjectorSchema(),
        min_replicas=0,
        max_replicas=1,
    )
    return stage_name
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def add_pdf_extractor_stage(pipeline, default_cpu_count, stage_name="pdf_extractor"):
    """Register the PDF extraction stage (pdfium + nemoretriever-parse paths).

    Endpoints and protocols are resolved from ``YOLOX_*`` and
    ``NEMORETRIEVER_PARSE_*`` environment variables; the parse model name
    defaults to ``nvidia/nemoretriever-parse``.

    Returns the stage name so callers can wire edges.
    """
    yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
    parse_grpc, parse_http, parse_auth, parse_protocol = get_nim_service("nemoretriever_parse")
    model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")

    extractor_config = PDFExtractorSchema(
        pdfium_config={
            "auth_token": yolox_auth,  # All auth tokens are the same for the moment
            "yolox_endpoints": (yolox_grpc, yolox_http),
            "yolox_infer_protocol": yolox_protocol,
        },
        nemoretriever_parse_config={
            "auth_token": parse_auth,
            "nemoretriever_parse_endpoints": (parse_grpc, parse_http),
            "nemoretriever_parse_infer_protocol": parse_protocol,
            "nemoretriever_parse_model_name": model_name,
            "yolox_endpoints": (yolox_grpc, yolox_http),
            "yolox_infer_protocol": yolox_protocol,
        },
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=PDFExtractorStage,
        config=extractor_config,
        min_replicas=0,
        # PDF extraction is the heaviest stage: allow ~33% of available cores.
        max_replicas=int(max(1, (default_cpu_count // 3))),
    )

    return stage_name
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def add_table_extractor_stage(pipeline, default_cpu_count, stage_name="table_extractor"):
    """Register the table extraction stage (yolox table structure + paddle OCR).

    Returns the stage name so callers can wire edges.
    """
    ts_grpc, ts_http, yolox_auth, ts_protocol = get_nim_service("yolox_table_structure")
    paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")

    table_extractor_config = TableExtractorSchema(
        endpoint_config={
            "yolox_endpoints": (ts_grpc, ts_http),
            "yolox_infer_protocol": ts_protocol,
            "paddle_endpoints": (paddle_grpc, paddle_http),
            "paddle_infer_protocol": paddle_protocol,
            "auth_token": yolox_auth,
        }
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=TableExtractorStage,
        config=table_extractor_config,
        min_replicas=0,
        # ~14% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 7))),
    )

    return stage_name
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def add_chart_extractor_stage(pipeline, default_cpu_count, stage_name="chart_extractor"):
    """Register the chart extraction stage (yolox graphic elements + paddle OCR).

    Returns the stage name so callers can wire edges.
    """
    ge_grpc, ge_http, yolox_auth, ge_protocol = get_nim_service("yolox_graphic_elements")
    paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")

    chart_extractor_config = ChartExtractorSchema(
        endpoint_config={
            "yolox_endpoints": (ge_grpc, ge_http),
            "yolox_infer_protocol": ge_protocol,
            "paddle_endpoints": (paddle_grpc, paddle_http),
            "paddle_infer_protocol": paddle_protocol,
            "auth_token": yolox_auth,
        }
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=ChartExtractorStage,
        config=chart_extractor_config,
        min_replicas=0,
        # ~14% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 7))),
    )

    return stage_name
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def add_infographic_extractor_stage(pipeline, default_cpu_count, stage_name="infographic_extractor"):
    """Register the infographic extraction stage (paddle OCR backend).

    Returns the stage name so callers can wire edges.
    """
    paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")

    infographic_config = InfographicExtractorSchema(
        endpoint_config={
            "paddle_endpoints": (paddle_grpc, paddle_http),
            "paddle_infer_protocol": paddle_protocol,
            "auth_token": paddle_auth,
        }
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=InfographicExtractorStage,
        config=infographic_config,
        min_replicas=0,
        # ~7% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 14))),
    )

    return stage_name
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def add_image_extractor_stage(pipeline, default_cpu_count, stage_name="image_extractor"):
    """Register the standalone image extraction stage (yolox backend).

    Returns the stage name so callers can wire edges.
    """
    yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")

    image_extractor_config = ImageConfigSchema(
        yolox_endpoints=(yolox_grpc, yolox_http),
        yolox_infer_protocol=yolox_protocol,
        auth_token=yolox_auth,  # All auth tokens are the same for the moment
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=ImageExtractorStage,
        config=image_extractor_config,
        min_replicas=0,
        # ~7% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 14))),
    )

    return stage_name
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def add_docx_extractor_stage(pipeline, default_cpu_count, stage_name="docx_extractor"):
    """Register the DOCX extraction stage (yolox-backed image handling).

    Returns the stage name so callers can wire edges.
    """
    yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")

    config = DocxExtractorSchema(
        docx_extraction_config={
            "yolox_endpoints": (yolox_grpc, yolox_http),
            "yolox_infer_protocol": yolox_protocol,
            "auth_token": yolox_auth,
        }
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=DocxExtractorStage,
        config=config,
        min_replicas=0,
        # ~7% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 14))),
    )

    return stage_name
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def add_pptx_extractor_stage(pipeline, default_cpu_count, stage_name="pptx_extractor"):
    """Register the PPTX extraction stage (yolox-backed image handling).

    Returns the stage name so callers can wire edges.
    """
    yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")

    config = PPTXExtractorSchema(
        pptx_extraction_config={
            "yolox_endpoints": (yolox_grpc, yolox_http),
            "yolox_infer_protocol": yolox_protocol,
            "auth_token": yolox_auth,
        }
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=PPTXExtractorStage,
        config=config,
        min_replicas=0,
        # ~7% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 14))),
    )

    return stage_name
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_extractor"):
    """Register the audio extraction stage.

    Returns the stage name so callers can wire edges.
    """
    audio_grpc, audio_http, audio_auth, audio_protocol, audio_function_id = get_audio_retrieval_service("audio")

    audio_extractor_config = AudioExtractorSchema(
        audio_extraction_config={
            "audio_endpoints": (audio_grpc, audio_http),
            "audio_infer_protocol": audio_protocol,
            "function_id": audio_function_id,
            # All auth tokens are the same for the moment
            "auth_token": audio_auth,
        }
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=AudioExtractorStage,
        config=audio_extractor_config,
        min_replicas=0,
        # Audio extraction is a heavy IO bound operation with minimal CPU usage.
        max_replicas=1,
    )

    return stage_name
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
    """Register the OpenTelemetry tracing stage.

    The collector endpoint comes from ``OTEL_EXPORTER_OTLP_ENDPOINT`` and
    defaults to ``http://localhost:4317``. ``default_cpu_count`` is unused.

    Returns the stage name so callers can wire edges.
    """
    _ = default_cpu_count  # Placeholder for future use

    otel_tracer_config = OpenTelemetryTracerSchema(
        otel_endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317"),
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=OpenTelemetryTracerStage,
        config=otel_tracer_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def add_image_dedup_stage(pipeline, default_cpu_count, stage_name="image_dedup"):
    """Register the image deduplication stage (single replica).

    Returns the stage name so callers can wire edges.
    """
    pipeline.add_stage(
        name=stage_name,
        stage_actor=ImageDedupStage,
        config=ImageDedupSchema(),
        min_replicas=0,
        max_replicas=1,
    )
    return stage_name
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def add_image_filter_stage(pipeline, default_cpu_count, stage_name="image_filter"):
    """Register the image filtering stage (single replica).

    Returns the stage name so callers can wire edges.
    """
    pipeline.add_stage(
        name=stage_name,
        stage_actor=ImageFilterStage,
        config=ImageFilterSchema(),
        min_replicas=0,
        max_replicas=1,
    )
    return stage_name
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def add_text_splitter_stage(pipeline, default_cpu_count, stage_name="text_splitter"):
    """Register the text-splitter transform stage.

    Fix: removed the stale ``_ = default_cpu_count`` "unused" placeholder —
    the value IS used below to size ``max_replicas``, so the placeholder was
    misleading dead code.

    Returns the stage name so callers can wire edges.
    """
    config = TextSplitterSchema()

    pipeline.add_stage(
        name=stage_name,
        stage_actor=TextSplitterStage,
        config=config,
        min_replicas=0,
        # ~7% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 14))),
    )

    return stage_name
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_caption"):
    """Register the VLM image-captioning transform stage.

    Uses ``VLM_CAPTION_ENDPOINT`` / ``VLM_CAPTION_MODEL_NAME`` for the service
    and ``NVIDIA_BUILD_API_KEY`` (fallback ``NGC_API_KEY``) for auth.

    Returns the stage name so callers can wire edges.
    """
    auth_token = os.environ.get("NVIDIA_BUILD_API_KEY", "") or os.environ.get("NGC_API_KEY", "")

    config = ImageCaptionExtractionSchema(
        api_key=auth_token,
        endpoint_url=os.environ.get("VLM_CAPTION_ENDPOINT", "localhost:5000"),
        image_caption_model_name=os.environ.get("VLM_CAPTION_MODEL_NAME", "meta/llama-3.2-11b-vision-instruct"),
        prompt="Caption the content of this image:",
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=ImageCaptionTransformStage,
        config=config,
        min_replicas=0,
        max_replicas=1,
    )

    return stage_name
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def add_text_embedding_stage(pipeline, default_cpu_count, stage_name="text_embedding"):
    """Register the text-embedding transform stage.

    Uses ``EMBEDDING_NIM_ENDPOINT`` / ``EMBEDDING_NIM_MODEL_NAME`` for the
    service and ``NVIDIA_BUILD_API_KEY`` (fallback ``NGC_API_KEY``) for auth.

    Returns the stage name so callers can wire edges.
    """
    api_key = os.environ.get("NVIDIA_BUILD_API_KEY", "") or os.environ.get("NGC_API_KEY", "")

    config = TextEmbeddingSchema(
        api_key=api_key,
        embedding_nim_endpoint=os.getenv("EMBEDDING_NIM_ENDPOINT", "http://embedding:8000/v1"),
        embedding_model=os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2"),
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=TextEmbeddingTransformStage,
        config=config,
        min_replicas=0,
        # ~7% of available CPU cores.
        max_replicas=int(max(1, (default_cpu_count // 14))),
    )

    return stage_name
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def add_embedding_storage_stage(pipeline, default_cpu_count, stage_name="embedding_storage"):
    """Register the embedding storage stage (single replica).

    Returns the stage name so callers can wire edges.
    """
    pipeline.add_stage(
        name=stage_name,
        stage_actor=EmbeddingStorageStage,
        config=EmbeddingStorageSchema(),
        min_replicas=0,
        max_replicas=1,
    )
    return stage_name
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def add_image_storage_stage(pipeline, default_cpu_count, stage_name="image_storage"):
    """Register the image storage stage (single replica).

    Returns the stage name so callers can wire edges.
    """
    pipeline.add_stage(
        name=stage_name,
        stage_actor=ImageStorageStage,
        config=ImageStorageModuleSchema(),
        min_replicas=0,
        max_replicas=1,
    )
    return stage_name
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def add_default_drain_stage(pipeline, default_cpu_count, stage_name="pipeline_drain"):
    """Register the terminal drain sink (always exactly one replica).

    Returns the stage name so callers can wire edges.
    """
    pipeline.add_stage(
        name=stage_name,
        stage_actor=DefaultDrainSink,
        config=None,
        min_replicas=1,
        max_replicas=1,
    )
    return stage_name
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def add_message_broker_response_stage(pipeline, default_cpu_count, stage_name="broker_response"):
    """Register the sink that publishes job results back to the message broker.

    Broker connection settings come from ``MESSAGE_CLIENT_HOST`` /
    ``MESSAGE_CLIENT_PORT`` / ``MESSAGE_CLIENT_TYPE``.

    Returns the stage name so callers can wire edges.
    """
    sink_config = MessageBrokerTaskSinkConfig(
        broker_client={
            "host": os.environ.get("MESSAGE_CLIENT_HOST", "localhost"),
            "port": os.environ.get("MESSAGE_CLIENT_PORT", "6379"),
            "client_type": os.environ.get("MESSAGE_CLIENT_TYPE", "redis"),
        },
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=MessageBrokerTaskSinkStage,
        config=sink_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def add_source_stage(pipeline, default_cpu_count, source_name="pipeline_source"):
    """Register the message-broker task source on the pipeline.

    Broker connection settings come from the ``MESSAGE_CLIENT_*`` environment
    variables. If the configured client type is ``"simple"``, an in-process
    broker is started as a side effect.

    Returns the source name so callers can wire edges.
    """
    _ = default_cpu_count  # Placeholder for future use

    source_config = MessageBrokerTaskSourceConfig(
        broker_client={
            "host": os.environ.get("MESSAGE_CLIENT_HOST", "localhost"),
            "port": os.environ.get("MESSAGE_CLIENT_PORT", "6379"),
            "client_type": os.environ.get("MESSAGE_CLIENT_TYPE", "redis"),
        },
        task_queue=os.environ.get("MESSAGE_CLIENT_QUEUE", "ingest_task_queue"),
        poll_interval="0.1",
    )

    pipeline.add_source(
        name=source_name,
        source_actor=MessageBrokerTaskSourceStage,
        config=source_config,
        min_replicas=1,
        max_replicas=1,
    )

    # The "simple" broker is embedded; spin it up so the source has
    # something to poll.
    if source_config.broker_client.client_type == "simple":
        start_simple_message_broker(source_config.broker_client.model_dump())

    return source_name
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
import uuid
|
|
7
|
+
import psutil
|
|
8
|
+
import ray
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def estimate_actor_memory_overhead(
    actor_class, iterations=1, stabilization_threshold=1 * 1024 * 1024, wait_time=2, actor_args=None, actor_kwargs=None
):
    """
    Estimate the additional system memory overhead when launching a Ray actor of the given actor_class.

    NOTE(review): the measurement loop below is currently disabled
    (``iterations`` is forced to 0 before the loop), so this function always
    returns a fixed estimate of 1,500,000,000 bytes (~1.5 GB) regardless of
    the arguments. The parameters are documented for when the empirical
    measurement is re-enabled.

    Parameters:
      actor_class: A Ray remote actor class.
      iterations (int): Number of measurement iterations.
      stabilization_threshold (int): Maximum difference (in bytes) between min and max measurements to
                                     consider results stable. (Currently unused — the stability check
                                     below discards its result.)
      wait_time (float): Seconds to wait after spawning or killing an actor for memory to stabilize.
      actor_args (list): Positional arguments to pass to the actor's remote() call.
      actor_kwargs (dict): Keyword arguments to pass to the actor's remote() call.

    Returns:
      float: Estimated average overhead in bytes for replicating the actor.
             (Currently a hard-coded 1_500_000_000.)
    """
    # Normalize optional argument containers (avoids mutable defaults).
    actor_args = actor_args if actor_args is not None else []
    actor_kwargs = actor_kwargs if actor_kwargs is not None else {}

    measurements = []

    # Deliberately disables the measurement loop below; see NOTE in docstring.
    iterations = 0  # TODO
    for i in range(iterations):
        # Record baseline system memory usage.
        baseline = psutil.virtual_memory().used

        # Spin up a new actor with provided arguments; unique name avoids
        # collisions with any previously registered named actors.
        actor = actor_class.options(name=f"mem_estimator_{uuid.uuid4()}").remote(*actor_args, **actor_kwargs)
        # Allow time for the actor to start.
        time.sleep(wait_time)

        # Measure memory after actor has started.
        after_spawn = psutil.virtual_memory().used
        overhead = after_spawn - baseline
        measurements.append(overhead)

        # Kill the actor.
        ray.kill(actor, no_restart=True)
        # Allow time for system memory to be released.
        time.sleep(wait_time)

    if measurements:
        # Spread and mean of the samples — computed but intentionally
        # discarded while the empirical path is disabled.
        _ = max(measurements) - min(measurements)
        _ = sum(measurements) / len(measurements)

    # Hard-coded conservative estimate returned while the empirical
    # measurement is disabled.
    return 1_500_000_000
    # return estimated_overhead  # Need to come up with a better way to estimate actor overhead.
|