nv-ingest 2025.8.13.dev20250813__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
- nv_ingest/framework/orchestration/process/execution.py +497 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
- nv_ingest/framework/orchestration/process/strategies.py +182 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +198 -0
- nv_ingest/pipeline/config/replica_resolver.py +227 -0
- nv_ingest/pipeline/default_pipeline_impl.py +517 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
- {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
|
@@ -1,649 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import os
|
|
6
|
-
import psutil
|
|
7
|
-
import click
|
|
8
|
-
import logging
|
|
9
|
-
|
|
10
|
-
from nv_ingest.framework.orchestration.ray.stages.sinks.default_drain import DefaultDrainSink
|
|
11
|
-
from nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer import OpenTelemetryTracerStage
|
|
12
|
-
from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
|
|
13
|
-
from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
|
|
14
|
-
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
15
|
-
|
|
16
|
-
# Import our new pipeline class.
|
|
17
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.audio_extractor import AudioExtractorStage
|
|
18
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.chart_extractor import ChartExtractorStage
|
|
19
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.docx_extractor import DocxExtractorStage
|
|
20
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.image_extractor import ImageExtractorStage
|
|
21
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extractor import InfographicExtractorStage
|
|
22
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
|
|
23
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
|
|
24
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
|
|
25
|
-
from nv_ingest.framework.orchestration.ray.stages.extractors.html_extractor import HtmlExtractorStage
|
|
26
|
-
|
|
27
|
-
from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
|
|
28
|
-
from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
|
|
29
|
-
from nv_ingest.framework.orchestration.ray.stages.mutate.image_filter import ImageFilterStage
|
|
30
|
-
from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink import (
|
|
31
|
-
MessageBrokerTaskSinkStage,
|
|
32
|
-
MessageBrokerTaskSinkConfig,
|
|
33
|
-
)
|
|
34
|
-
from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
|
|
35
|
-
MessageBrokerTaskSourceStage,
|
|
36
|
-
MessageBrokerTaskSourceConfig,
|
|
37
|
-
start_simple_message_broker,
|
|
38
|
-
)
|
|
39
|
-
from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
|
|
40
|
-
from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
|
|
41
|
-
from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
|
|
42
|
-
from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
|
|
43
|
-
from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
|
|
44
|
-
from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
|
|
45
|
-
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
46
|
-
from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
|
|
47
|
-
from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
|
|
48
|
-
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
|
|
49
|
-
from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
|
|
50
|
-
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
51
|
-
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
|
|
52
|
-
from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
|
|
53
|
-
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
|
|
54
|
-
from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
|
|
55
|
-
from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
|
|
56
|
-
from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
|
|
57
|
-
from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
|
|
58
|
-
from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
|
|
59
|
-
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
|
|
60
|
-
from nv_ingest.framework.orchestration.ray.util.env_config import DYNAMIC_MEMORY_THRESHOLD
|
|
61
|
-
|
|
62
|
-
logger = logging.getLogger(__name__)
|
|
63
|
-
|
|
64
|
-
_system_resource_probe = SystemResourceProbe()
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def validate_positive(ctx, param, value):
|
|
68
|
-
if value <= 0:
|
|
69
|
-
raise click.BadParameter("must be a positive integer")
|
|
70
|
-
return value
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def get_message_provider_config():
|
|
74
|
-
message_provider_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
|
|
75
|
-
message_provider_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
|
|
76
|
-
|
|
77
|
-
logger.info(f"MESSAGE_CLIENT_HOST: {message_provider_host}")
|
|
78
|
-
logger.info(f"MESSAGE_CLIENT_PORT: {message_provider_port}")
|
|
79
|
-
|
|
80
|
-
return message_provider_host, message_provider_port
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def get_caption_classifier_service():
|
|
84
|
-
triton_service_caption_classifier = os.environ.get(
|
|
85
|
-
"CAPTION_CLASSIFIER_GRPC_TRITON",
|
|
86
|
-
"",
|
|
87
|
-
)
|
|
88
|
-
triton_service_caption_classifier_name = os.environ.get(
|
|
89
|
-
"CAPTION_CLASSIFIER_MODEL_NAME",
|
|
90
|
-
"",
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
logger.info(f"CAPTION_CLASSIFIER_GRPC_TRITON: {triton_service_caption_classifier}")
|
|
94
|
-
|
|
95
|
-
return triton_service_caption_classifier, triton_service_caption_classifier_name
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def get_nim_service(env_var_prefix):
|
|
99
|
-
prefix = env_var_prefix.upper()
|
|
100
|
-
grpc_endpoint = os.environ.get(
|
|
101
|
-
f"{prefix}_GRPC_ENDPOINT",
|
|
102
|
-
"",
|
|
103
|
-
)
|
|
104
|
-
http_endpoint = os.environ.get(
|
|
105
|
-
f"{prefix}_HTTP_ENDPOINT",
|
|
106
|
-
"",
|
|
107
|
-
)
|
|
108
|
-
auth_token = os.environ.get(
|
|
109
|
-
"NVIDIA_API_KEY",
|
|
110
|
-
"",
|
|
111
|
-
) or os.environ.get(
|
|
112
|
-
"NGC_API_KEY",
|
|
113
|
-
"",
|
|
114
|
-
)
|
|
115
|
-
|
|
116
|
-
infer_protocol = os.environ.get(
|
|
117
|
-
f"{prefix}_INFER_PROTOCOL",
|
|
118
|
-
"http" if http_endpoint else "grpc" if grpc_endpoint else "",
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
|
|
122
|
-
logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
|
|
123
|
-
logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
|
|
124
|
-
|
|
125
|
-
return grpc_endpoint, http_endpoint, auth_token, infer_protocol
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def get_audio_retrieval_service(env_var_prefix):
|
|
129
|
-
prefix = env_var_prefix.upper()
|
|
130
|
-
grpc_endpoint = os.environ.get(
|
|
131
|
-
"AUDIO_GRPC_ENDPOINT",
|
|
132
|
-
"",
|
|
133
|
-
)
|
|
134
|
-
http_endpoint = os.environ.get(
|
|
135
|
-
"AUDIO_HTTP_ENDPOINT",
|
|
136
|
-
"",
|
|
137
|
-
)
|
|
138
|
-
auth_token = os.environ.get(
|
|
139
|
-
"NVIDIA_API_KEY",
|
|
140
|
-
"",
|
|
141
|
-
) or os.environ.get(
|
|
142
|
-
"NGC_API_KEY",
|
|
143
|
-
"",
|
|
144
|
-
)
|
|
145
|
-
infer_protocol = os.environ.get(
|
|
146
|
-
"AUDIO_INFER_PROTOCOL",
|
|
147
|
-
"http" if http_endpoint else "grpc" if grpc_endpoint else "",
|
|
148
|
-
)
|
|
149
|
-
function_id = os.environ.get(
|
|
150
|
-
"AUDIO_FUNCTION_ID",
|
|
151
|
-
"",
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
|
|
155
|
-
logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
|
|
156
|
-
logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
|
|
157
|
-
logger.info(f"{prefix}_FUNCTION_ID: {function_id}")
|
|
158
|
-
|
|
159
|
-
return grpc_endpoint, http_endpoint, auth_token, infer_protocol, function_id
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def add_metadata_injector_stage(pipeline, default_cpu_count, stage_name="metadata_injector"):
|
|
163
|
-
_ = default_cpu_count # Placeholder for future use
|
|
164
|
-
config = MetadataInjectorSchema()
|
|
165
|
-
|
|
166
|
-
pipeline.add_stage(
|
|
167
|
-
name=stage_name,
|
|
168
|
-
stage_actor=MetadataInjectionStage,
|
|
169
|
-
config=config,
|
|
170
|
-
min_replicas=0,
|
|
171
|
-
max_replicas=1,
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
return stage_name
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
def add_pdf_extractor_stage(pipeline, default_cpu_count, stage_name="pdf_extractor"):
|
|
178
|
-
# Heuristic: Determine max_replicas based on system memory, capped by CPU cores.
|
|
179
|
-
total_memory_mb = psutil.virtual_memory().total / (1024**2)
|
|
180
|
-
|
|
181
|
-
# Allocate up to 75% of memory to this stage, using a 10GB high watermark per worker.
|
|
182
|
-
allocatable_memory_for_stage_mb = total_memory_mb * DYNAMIC_MEMORY_THRESHOLD
|
|
183
|
-
memory_based_replicas = int(allocatable_memory_for_stage_mb / 10_000.0)
|
|
184
|
-
|
|
185
|
-
# Cap the number of replicas by the number of available CPU cores.
|
|
186
|
-
max_replicas = max(1, min(memory_based_replicas, default_cpu_count))
|
|
187
|
-
|
|
188
|
-
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
189
|
-
nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
|
|
190
|
-
get_nim_service("nemoretriever_parse")
|
|
191
|
-
)
|
|
192
|
-
model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
|
|
193
|
-
|
|
194
|
-
extractor_config = PDFExtractorSchema(
|
|
195
|
-
**{
|
|
196
|
-
"pdfium_config": {
|
|
197
|
-
"auth_token": yolox_auth, # All auth tokens are the same for the moment
|
|
198
|
-
"yolox_endpoints": (yolox_grpc, yolox_http),
|
|
199
|
-
"yolox_infer_protocol": yolox_protocol,
|
|
200
|
-
},
|
|
201
|
-
"nemoretriever_parse_config": {
|
|
202
|
-
"auth_token": nemoretriever_parse_auth,
|
|
203
|
-
"nemoretriever_parse_endpoints": (nemoretriever_parse_grpc, nemoretriever_parse_http),
|
|
204
|
-
"nemoretriever_parse_infer_protocol": nemoretriever_parse_protocol,
|
|
205
|
-
"nemoretriever_parse_model_name": model_name,
|
|
206
|
-
"yolox_endpoints": (yolox_grpc, yolox_http),
|
|
207
|
-
"yolox_infer_protocol": yolox_protocol,
|
|
208
|
-
},
|
|
209
|
-
}
|
|
210
|
-
)
|
|
211
|
-
|
|
212
|
-
pipeline.add_stage(
|
|
213
|
-
name=stage_name,
|
|
214
|
-
stage_actor=PDFExtractorStage,
|
|
215
|
-
config=extractor_config,
|
|
216
|
-
min_replicas=0,
|
|
217
|
-
max_replicas=max_replicas,
|
|
218
|
-
)
|
|
219
|
-
return stage_name
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
def add_table_extractor_stage(pipeline, default_cpu_count, stage_name="table_extractor"):
|
|
223
|
-
yolox_table_structure_grpc, yolox_table_structure_http, yolox_auth, yolox_table_structure_protocol = (
|
|
224
|
-
get_nim_service("yolox_table_structure")
|
|
225
|
-
)
|
|
226
|
-
ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
|
|
227
|
-
|
|
228
|
-
table_extractor_config = TableExtractorSchema(
|
|
229
|
-
**{
|
|
230
|
-
"endpoint_config": {
|
|
231
|
-
"yolox_endpoints": (yolox_table_structure_grpc, yolox_table_structure_http),
|
|
232
|
-
"yolox_infer_protocol": yolox_table_structure_protocol,
|
|
233
|
-
"ocr_endpoints": (ocr_grpc, ocr_http),
|
|
234
|
-
"ocr_infer_protocol": ocr_protocol,
|
|
235
|
-
"auth_token": yolox_auth,
|
|
236
|
-
}
|
|
237
|
-
}
|
|
238
|
-
)
|
|
239
|
-
|
|
240
|
-
pipeline.add_stage(
|
|
241
|
-
name=stage_name,
|
|
242
|
-
stage_actor=TableExtractorStage,
|
|
243
|
-
config=table_extractor_config,
|
|
244
|
-
min_replicas=0,
|
|
245
|
-
max_replicas=_get_max_replicas(default_cpu_count, percentage_of_cpu=0.20, replica_limit=4),
|
|
246
|
-
)
|
|
247
|
-
|
|
248
|
-
return stage_name
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
def add_chart_extractor_stage(pipeline, default_cpu_count, stage_name="chart_extractor"):
|
|
252
|
-
yolox_graphic_elements_grpc, yolox_graphic_elements_http, yolox_auth, yolox_graphic_elements_protocol = (
|
|
253
|
-
get_nim_service("yolox_graphic_elements")
|
|
254
|
-
)
|
|
255
|
-
ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
|
|
256
|
-
|
|
257
|
-
chart_extractor_config = ChartExtractorSchema(
|
|
258
|
-
**{
|
|
259
|
-
"endpoint_config": {
|
|
260
|
-
"yolox_endpoints": (yolox_graphic_elements_grpc, yolox_graphic_elements_http),
|
|
261
|
-
"yolox_infer_protocol": yolox_graphic_elements_protocol,
|
|
262
|
-
"ocr_endpoints": (ocr_grpc, ocr_http),
|
|
263
|
-
"ocr_infer_protocol": ocr_protocol,
|
|
264
|
-
"auth_token": yolox_auth,
|
|
265
|
-
}
|
|
266
|
-
}
|
|
267
|
-
)
|
|
268
|
-
|
|
269
|
-
pipeline.add_stage(
|
|
270
|
-
name=stage_name,
|
|
271
|
-
stage_actor=ChartExtractorStage,
|
|
272
|
-
config=chart_extractor_config,
|
|
273
|
-
min_replicas=0,
|
|
274
|
-
max_replicas=_get_max_replicas(default_cpu_count, percentage_of_cpu=0.20, replica_limit=4),
|
|
275
|
-
)
|
|
276
|
-
|
|
277
|
-
return stage_name
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
def add_infographic_extractor_stage(pipeline, default_cpu_count, stage_name="infographic_extractor"):
|
|
281
|
-
ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
|
|
282
|
-
|
|
283
|
-
infographic_content_extractor_config = InfographicExtractorSchema(
|
|
284
|
-
**{
|
|
285
|
-
"endpoint_config": {
|
|
286
|
-
"ocr_endpoints": (ocr_grpc, ocr_http),
|
|
287
|
-
"ocr_infer_protocol": ocr_protocol,
|
|
288
|
-
"auth_token": ocr_auth,
|
|
289
|
-
}
|
|
290
|
-
}
|
|
291
|
-
)
|
|
292
|
-
|
|
293
|
-
pipeline.add_stage(
|
|
294
|
-
name=stage_name,
|
|
295
|
-
stage_actor=InfographicExtractorStage,
|
|
296
|
-
config=infographic_content_extractor_config,
|
|
297
|
-
min_replicas=0,
|
|
298
|
-
max_replicas=2,
|
|
299
|
-
)
|
|
300
|
-
|
|
301
|
-
return stage_name
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
def add_image_extractor_stage(pipeline, default_cpu_count, stage_name="image_extractor"):
|
|
305
|
-
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
306
|
-
|
|
307
|
-
image_extractor_config = ImageConfigSchema(
|
|
308
|
-
**{
|
|
309
|
-
"yolox_endpoints": (yolox_grpc, yolox_http),
|
|
310
|
-
"yolox_infer_protocol": yolox_protocol,
|
|
311
|
-
"auth_token": yolox_auth, # All auth tokens are the same for the moment
|
|
312
|
-
}
|
|
313
|
-
)
|
|
314
|
-
|
|
315
|
-
pipeline.add_stage(
|
|
316
|
-
name=stage_name,
|
|
317
|
-
stage_actor=ImageExtractorStage,
|
|
318
|
-
config=image_extractor_config,
|
|
319
|
-
min_replicas=0,
|
|
320
|
-
max_replicas=2,
|
|
321
|
-
)
|
|
322
|
-
|
|
323
|
-
return stage_name
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
def add_docx_extractor_stage(pipeline, default_cpu_count, stage_name="docx_extractor"):
|
|
327
|
-
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
328
|
-
|
|
329
|
-
docx_extractor_config = {
|
|
330
|
-
"docx_extraction_config": {
|
|
331
|
-
"yolox_endpoints": (yolox_grpc, yolox_http),
|
|
332
|
-
"yolox_infer_protocol": yolox_protocol,
|
|
333
|
-
"auth_token": yolox_auth,
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
|
|
337
|
-
pipeline.add_stage(
|
|
338
|
-
name=stage_name,
|
|
339
|
-
stage_actor=DocxExtractorStage,
|
|
340
|
-
config=DocxExtractorSchema(**docx_extractor_config),
|
|
341
|
-
min_replicas=0,
|
|
342
|
-
max_replicas=2,
|
|
343
|
-
)
|
|
344
|
-
|
|
345
|
-
return stage_name
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
def add_pptx_extractor_stage(pipeline, default_cpu_count, stage_name="pptx_extractor"):
|
|
349
|
-
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
350
|
-
|
|
351
|
-
pptx_extractor_config = {
|
|
352
|
-
"pptx_extraction_config": {
|
|
353
|
-
"yolox_endpoints": (yolox_grpc, yolox_http),
|
|
354
|
-
"yolox_infer_protocol": yolox_protocol,
|
|
355
|
-
"auth_token": yolox_auth,
|
|
356
|
-
}
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
pipeline.add_stage(
|
|
360
|
-
name=stage_name,
|
|
361
|
-
stage_actor=PPTXExtractorStage,
|
|
362
|
-
config=PPTXExtractorSchema(**pptx_extractor_config),
|
|
363
|
-
min_replicas=0,
|
|
364
|
-
max_replicas=2,
|
|
365
|
-
)
|
|
366
|
-
|
|
367
|
-
return stage_name
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_extractor"):
|
|
371
|
-
audio_grpc, audio_http, audio_auth, audio_infer_protocol, audio_function_id = get_audio_retrieval_service("audio")
|
|
372
|
-
|
|
373
|
-
audio_extractor_config = AudioExtractorSchema(
|
|
374
|
-
**{
|
|
375
|
-
"audio_extraction_config": {
|
|
376
|
-
"audio_endpoints": (audio_grpc, audio_http),
|
|
377
|
-
"audio_infer_protocol": audio_infer_protocol,
|
|
378
|
-
"function_id": audio_function_id,
|
|
379
|
-
"auth_token": audio_auth,
|
|
380
|
-
# All auth tokens are the same for the moment
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
)
|
|
384
|
-
|
|
385
|
-
pipeline.add_stage(
|
|
386
|
-
name=stage_name, stage_actor=AudioExtractorStage, config=audio_extractor_config, min_replicas=0, max_replicas=2
|
|
387
|
-
)
|
|
388
|
-
|
|
389
|
-
return stage_name
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
def add_html_extractor_stage(pipeline, default_cpu_count, stage_name="html_extractor"):
|
|
393
|
-
|
|
394
|
-
pipeline.add_stage(
|
|
395
|
-
name=stage_name,
|
|
396
|
-
stage_actor=HtmlExtractorStage,
|
|
397
|
-
config=HtmlExtractorSchema(),
|
|
398
|
-
min_replicas=0,
|
|
399
|
-
max_replicas=2,
|
|
400
|
-
)
|
|
401
|
-
|
|
402
|
-
return stage_name
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
|
|
406
|
-
_ = default_cpu_count # Placeholder for future use
|
|
407
|
-
otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
|
|
408
|
-
|
|
409
|
-
otel_tracer_config = OpenTelemetryTracerSchema(
|
|
410
|
-
**{
|
|
411
|
-
"otel_endpoint": otel_endpoint,
|
|
412
|
-
}
|
|
413
|
-
)
|
|
414
|
-
|
|
415
|
-
pipeline.add_stage(
|
|
416
|
-
name=stage_name,
|
|
417
|
-
stage_actor=OpenTelemetryTracerStage,
|
|
418
|
-
config=otel_tracer_config,
|
|
419
|
-
min_replicas=0,
|
|
420
|
-
max_replicas=1,
|
|
421
|
-
)
|
|
422
|
-
|
|
423
|
-
return stage_name
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
def add_image_dedup_stage(pipeline, default_cpu_count, stage_name="image_dedup"):
|
|
427
|
-
config = ImageDedupSchema()
|
|
428
|
-
|
|
429
|
-
pipeline.add_stage(
|
|
430
|
-
name=stage_name,
|
|
431
|
-
stage_actor=ImageDedupStage,
|
|
432
|
-
config=config,
|
|
433
|
-
min_replicas=0,
|
|
434
|
-
max_replicas=1,
|
|
435
|
-
)
|
|
436
|
-
|
|
437
|
-
return stage_name
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
def add_image_filter_stage(pipeline, default_cpu_count, stage_name="image_filter"):
|
|
441
|
-
config = ImageFilterSchema()
|
|
442
|
-
|
|
443
|
-
pipeline.add_stage(
|
|
444
|
-
name=stage_name,
|
|
445
|
-
stage_actor=ImageFilterStage,
|
|
446
|
-
config=config,
|
|
447
|
-
min_replicas=0,
|
|
448
|
-
max_replicas=1,
|
|
449
|
-
)
|
|
450
|
-
|
|
451
|
-
return stage_name
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
def add_text_splitter_stage(pipeline, default_cpu_count, stage_name="text_splitter"):
|
|
455
|
-
_ = default_cpu_count
|
|
456
|
-
|
|
457
|
-
config = TextSplitterSchema()
|
|
458
|
-
|
|
459
|
-
pipeline.add_stage(
|
|
460
|
-
name=stage_name,
|
|
461
|
-
stage_actor=TextSplitterStage,
|
|
462
|
-
config=config,
|
|
463
|
-
min_replicas=0,
|
|
464
|
-
max_replicas=2,
|
|
465
|
-
)
|
|
466
|
-
|
|
467
|
-
return stage_name
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_caption"):
|
|
471
|
-
auth_token = os.environ.get(
|
|
472
|
-
"NVIDIA_API_KEY",
|
|
473
|
-
"",
|
|
474
|
-
) or os.environ.get(
|
|
475
|
-
"NGC_API_KEY",
|
|
476
|
-
"",
|
|
477
|
-
)
|
|
478
|
-
|
|
479
|
-
endpoint_url = os.environ.get("VLM_CAPTION_ENDPOINT", "localhost:5000")
|
|
480
|
-
model_name = os.environ.get("VLM_CAPTION_MODEL_NAME", "nvidia/llama-3.1-nemotron-nano-vl-8b-v1")
|
|
481
|
-
|
|
482
|
-
config = ImageCaptionExtractionSchema(
|
|
483
|
-
**{
|
|
484
|
-
"api_key": auth_token,
|
|
485
|
-
"endpoint_url": endpoint_url,
|
|
486
|
-
"model_name": model_name,
|
|
487
|
-
"prompt": "Caption the content of this image:",
|
|
488
|
-
}
|
|
489
|
-
)
|
|
490
|
-
|
|
491
|
-
pipeline.add_stage(
|
|
492
|
-
name=stage_name,
|
|
493
|
-
stage_actor=ImageCaptionTransformStage,
|
|
494
|
-
config=config,
|
|
495
|
-
min_replicas=0,
|
|
496
|
-
max_replicas=1,
|
|
497
|
-
)
|
|
498
|
-
|
|
499
|
-
return stage_name
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
def add_text_embedding_stage(pipeline, default_cpu_count, stage_name="text_embedding"):
|
|
503
|
-
api_key = os.environ.get(
|
|
504
|
-
"NVIDIA_API_KEY",
|
|
505
|
-
"",
|
|
506
|
-
) or os.environ.get(
|
|
507
|
-
"NGC_API_KEY",
|
|
508
|
-
"",
|
|
509
|
-
)
|
|
510
|
-
embedding_nim_endpoint = os.getenv("EMBEDDING_NIM_ENDPOINT", "http://embedding:8000/v1")
|
|
511
|
-
embedding_model = os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")
|
|
512
|
-
|
|
513
|
-
config = TextEmbeddingSchema(
|
|
514
|
-
**{
|
|
515
|
-
"api_key": api_key,
|
|
516
|
-
"embedding_nim_endpoint": embedding_nim_endpoint,
|
|
517
|
-
"embedding_model": embedding_model,
|
|
518
|
-
}
|
|
519
|
-
)
|
|
520
|
-
|
|
521
|
-
pipeline.add_stage(
|
|
522
|
-
name=stage_name,
|
|
523
|
-
stage_actor=TextEmbeddingTransformStage,
|
|
524
|
-
config=config,
|
|
525
|
-
min_replicas=0,
|
|
526
|
-
max_replicas=_get_max_replicas(default_cpu_count, percentage_of_cpu=0.07, replica_limit=4),
|
|
527
|
-
)
|
|
528
|
-
|
|
529
|
-
return stage_name
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
def add_embedding_storage_stage(pipeline, default_cpu_count, stage_name="embedding_storage"):
|
|
533
|
-
config = EmbeddingStorageSchema()
|
|
534
|
-
|
|
535
|
-
pipeline.add_stage(
|
|
536
|
-
name=stage_name,
|
|
537
|
-
stage_actor=EmbeddingStorageStage,
|
|
538
|
-
config=config,
|
|
539
|
-
min_replicas=0,
|
|
540
|
-
max_replicas=1,
|
|
541
|
-
)
|
|
542
|
-
|
|
543
|
-
return stage_name
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
def add_image_storage_stage(pipeline, default_cpu_count, stage_name="image_storage"):
|
|
547
|
-
config = ImageStorageModuleSchema()
|
|
548
|
-
pipeline.add_stage(
|
|
549
|
-
name=stage_name,
|
|
550
|
-
stage_actor=ImageStorageStage,
|
|
551
|
-
config=config,
|
|
552
|
-
min_replicas=0,
|
|
553
|
-
max_replicas=1,
|
|
554
|
-
)
|
|
555
|
-
|
|
556
|
-
return stage_name
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
def add_default_drain_stage(pipeline, default_cpu_count, stage_name="pipeline_drain"):
|
|
560
|
-
pipeline.add_stage(
|
|
561
|
-
name=stage_name,
|
|
562
|
-
stage_actor=DefaultDrainSink,
|
|
563
|
-
config=None,
|
|
564
|
-
min_replicas=1,
|
|
565
|
-
max_replicas=1,
|
|
566
|
-
)
|
|
567
|
-
|
|
568
|
-
return stage_name
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
def add_message_broker_response_stage(pipeline, default_cpu_count, stage_name="broker_response"):
|
|
572
|
-
task_broker_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
|
|
573
|
-
task_broker_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
|
|
574
|
-
client_type = os.environ.get("MESSAGE_CLIENT_TYPE", "redis")
|
|
575
|
-
|
|
576
|
-
sink_config = MessageBrokerTaskSinkConfig(
|
|
577
|
-
**{
|
|
578
|
-
"broker_client": {
|
|
579
|
-
"host": task_broker_host,
|
|
580
|
-
"port": task_broker_port,
|
|
581
|
-
"client_type": client_type,
|
|
582
|
-
},
|
|
583
|
-
}
|
|
584
|
-
)
|
|
585
|
-
|
|
586
|
-
pipeline.add_stage(
|
|
587
|
-
name=stage_name,
|
|
588
|
-
stage_actor=MessageBrokerTaskSinkStage,
|
|
589
|
-
config=sink_config,
|
|
590
|
-
min_replicas=0,
|
|
591
|
-
max_replicas=2,
|
|
592
|
-
)
|
|
593
|
-
|
|
594
|
-
return stage_name
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
def add_source_stage(pipeline, default_cpu_count, source_name="pipeline_source"):
|
|
598
|
-
_ = default_cpu_count # Placeholder for future use
|
|
599
|
-
task_broker_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
|
|
600
|
-
task_broker_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
|
|
601
|
-
|
|
602
|
-
client_type = os.environ.get("MESSAGE_CLIENT_TYPE", "redis")
|
|
603
|
-
task_queue_name = os.environ.get("MESSAGE_CLIENT_QUEUE", "ingest_task_queue")
|
|
604
|
-
|
|
605
|
-
source_config = MessageBrokerTaskSourceConfig(
|
|
606
|
-
**{
|
|
607
|
-
"broker_client": {
|
|
608
|
-
"host": task_broker_host,
|
|
609
|
-
"port": task_broker_port,
|
|
610
|
-
"client_type": client_type,
|
|
611
|
-
},
|
|
612
|
-
"task_queue": task_queue_name,
|
|
613
|
-
"poll_interval": "0.1",
|
|
614
|
-
}
|
|
615
|
-
)
|
|
616
|
-
|
|
617
|
-
pipeline.add_source(
|
|
618
|
-
name=source_name,
|
|
619
|
-
source_actor=MessageBrokerTaskSourceStage,
|
|
620
|
-
config=source_config,
|
|
621
|
-
min_replicas=1,
|
|
622
|
-
max_replicas=1,
|
|
623
|
-
)
|
|
624
|
-
|
|
625
|
-
if source_config.broker_client.client_type == "simple":
|
|
626
|
-
start_simple_message_broker(source_config.broker_client.model_dump())
|
|
627
|
-
|
|
628
|
-
return source_name
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
def _get_max_replicas(default_cpu_count=None, percentage_of_cpu=0.14, replica_limit=None):
|
|
632
|
-
"""
|
|
633
|
-
Calculate max replicas based on CPU percentage with optional upper limit.
|
|
634
|
-
|
|
635
|
-
Args:
|
|
636
|
-
default_cpu_count (int, optional): CPU cores to use. Auto-detected if None.
|
|
637
|
-
percentage_of_cpu (float, optional): CPU percentage to allocate. Defaults to 0.14.
|
|
638
|
-
replica_limit (int, optional): Upper bound for replicas. Defaults to None.
|
|
639
|
-
|
|
640
|
-
Returns:
|
|
641
|
-
int: Maximum replicas, at least 1.
|
|
642
|
-
"""
|
|
643
|
-
if default_cpu_count is None:
|
|
644
|
-
default_cpu_count = _system_resource_probe.get_cpu_count()
|
|
645
|
-
|
|
646
|
-
_max_replicas = int(max(1, (default_cpu_count * percentage_of_cpu)))
|
|
647
|
-
if replica_limit is not None:
|
|
648
|
-
_max_replicas = min(_max_replicas, replica_limit)
|
|
649
|
-
return _max_replicas
|
{nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|