nv-ingest 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (56) hide show
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.14.dev20250814.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
@@ -1,649 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import os
6
- import psutil
7
- import click
8
- import logging
9
-
10
- from nv_ingest.framework.orchestration.ray.stages.sinks.default_drain import DefaultDrainSink
11
- from nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer import OpenTelemetryTracerStage
12
- from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
13
- from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
14
- from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
15
-
16
- # Import our new pipeline class.
17
- from nv_ingest.framework.orchestration.ray.stages.extractors.audio_extractor import AudioExtractorStage
18
- from nv_ingest.framework.orchestration.ray.stages.extractors.chart_extractor import ChartExtractorStage
19
- from nv_ingest.framework.orchestration.ray.stages.extractors.docx_extractor import DocxExtractorStage
20
- from nv_ingest.framework.orchestration.ray.stages.extractors.image_extractor import ImageExtractorStage
21
- from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extractor import InfographicExtractorStage
22
- from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
23
- from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
24
- from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
25
- from nv_ingest.framework.orchestration.ray.stages.extractors.html_extractor import HtmlExtractorStage
26
-
27
- from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
28
- from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
29
- from nv_ingest.framework.orchestration.ray.stages.mutate.image_filter import ImageFilterStage
30
- from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink import (
31
- MessageBrokerTaskSinkStage,
32
- MessageBrokerTaskSinkConfig,
33
- )
34
- from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
35
- MessageBrokerTaskSourceStage,
36
- MessageBrokerTaskSourceConfig,
37
- start_simple_message_broker,
38
- )
39
- from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
40
- from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
41
- from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
42
- from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
43
- from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
44
- from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
45
- from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
46
- from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
47
- from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
48
- from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
49
- from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
50
- from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
51
- from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
52
- from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
53
- from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
54
- from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
55
- from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
56
- from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
57
- from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
58
- from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
59
- from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
60
- from nv_ingest.framework.orchestration.ray.util.env_config import DYNAMIC_MEMORY_THRESHOLD
61
-
62
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shared probe instance; `_get_max_replicas` asks it for the CPU count when
# the caller does not pass one.
_system_resource_probe = SystemResourceProbe()
65
-
66
-
67
def validate_positive(ctx, param, value):
    """Click option callback that accepts only strictly positive values.

    Args:
        ctx: Click context (unused; part of the callback signature).
        param: Click parameter being validated (unused).
        value: Parsed option value, or ``None`` when the option was not
            supplied and has no default.

    Returns:
        ``value`` unchanged when it is ``None`` or greater than zero.

    Raises:
        click.BadParameter: If ``value`` is zero or negative.
    """
    # An omitted option arrives as None; `None <= 0` would raise TypeError
    # rather than a usage error, so pass it through untouched.
    if value is None:
        return value
    if value <= 0:
        raise click.BadParameter("must be a positive integer")
    return value
71
-
72
-
73
def get_message_provider_config():
    """Read the message-broker host and port from the environment.

    ``MESSAGE_CLIENT_HOST`` falls back to ``"localhost"`` and
    ``MESSAGE_CLIENT_PORT`` to ``"6379"``. Both values are logged at INFO.

    Returns:
        tuple: ``(host, port)``; the port is returned as a string.
    """
    env = os.environ
    host = env.get("MESSAGE_CLIENT_HOST", "localhost")
    port = env.get("MESSAGE_CLIENT_PORT", "6379")

    logger.info(f"MESSAGE_CLIENT_HOST: {host}")
    logger.info(f"MESSAGE_CLIENT_PORT: {port}")

    return host, port
81
-
82
-
83
def get_caption_classifier_service():
    """Read the caption-classifier settings from the environment.

    Returns:
        tuple: ``(endpoint, model_name)`` taken from
        ``CAPTION_CLASSIFIER_GRPC_TRITON`` and
        ``CAPTION_CLASSIFIER_MODEL_NAME``; each is an empty string when its
        variable is unset.
    """
    env = os.environ
    endpoint = env.get("CAPTION_CLASSIFIER_GRPC_TRITON", "")
    model_name = env.get("CAPTION_CLASSIFIER_MODEL_NAME", "")

    # Only the endpoint is logged; the model name is returned without logging.
    logger.info(f"CAPTION_CLASSIFIER_GRPC_TRITON: {endpoint}")

    return endpoint, model_name
96
-
97
-
98
def get_nim_service(env_var_prefix):
    """Resolve endpoint settings for a service from prefixed environment variables.

    The prefix is upper-cased and used to read ``<PREFIX>_GRPC_ENDPOINT``,
    ``<PREFIX>_HTTP_ENDPOINT`` and ``<PREFIX>_INFER_PROTOCOL``. The two
    endpoints and the protocol are logged at INFO; the auth token is not.

    Args:
        env_var_prefix: Service prefix, in any letter case.

    Returns:
        tuple: ``(grpc_endpoint, http_endpoint, auth_token, infer_protocol)``.
        Unset endpoints and tokens are empty strings.
    """
    prefix = env_var_prefix.upper()
    env = os.environ

    grpc_endpoint = env.get(f"{prefix}_GRPC_ENDPOINT", "")
    http_endpoint = env.get(f"{prefix}_HTTP_ENDPOINT", "")

    # NVIDIA_API_KEY takes precedence; an unset or empty value falls through
    # to NGC_API_KEY.
    auth_token = env.get("NVIDIA_API_KEY", "") or env.get("NGC_API_KEY", "")

    # Protocol default when <PREFIX>_INFER_PROTOCOL is unset: HTTP if an HTTP
    # endpoint is configured, otherwise gRPC if a gRPC endpoint is configured,
    # otherwise empty.
    if http_endpoint:
        fallback_protocol = "http"
    elif grpc_endpoint:
        fallback_protocol = "grpc"
    else:
        fallback_protocol = ""
    infer_protocol = env.get(f"{prefix}_INFER_PROTOCOL", fallback_protocol)

    logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
    logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
    logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")

    return grpc_endpoint, http_endpoint, auth_token, infer_protocol
126
-
127
-
128
def get_audio_retrieval_service(env_var_prefix):
    """Resolve audio-service settings from prefixed environment variables.

    The prefix is upper-cased and used both to look up and to label
    ``<PREFIX>_GRPC_ENDPOINT``, ``<PREFIX>_HTTP_ENDPOINT``,
    ``<PREFIX>_INFER_PROTOCOL`` and ``<PREFIX>_FUNCTION_ID``. With the prefix
    ``"audio"`` these are the ``AUDIO_*`` variables. Previously the lookups
    were hard-coded to ``AUDIO_*`` while the log labels used the prefix, so
    any other prefix produced log lines naming variables that were never read.

    Args:
        env_var_prefix: Service prefix, in any letter case (e.g. ``"audio"``).

    Returns:
        tuple: ``(grpc_endpoint, http_endpoint, auth_token, infer_protocol,
        function_id)``. Unset values are empty strings.
    """
    prefix = env_var_prefix.upper()
    env = os.environ

    grpc_endpoint = env.get(f"{prefix}_GRPC_ENDPOINT", "")
    http_endpoint = env.get(f"{prefix}_HTTP_ENDPOINT", "")

    # NVIDIA_API_KEY takes precedence; an unset or empty value falls through
    # to NGC_API_KEY. The token is deliberately not logged.
    auth_token = env.get("NVIDIA_API_KEY", "") or env.get("NGC_API_KEY", "")

    # Protocol default when <PREFIX>_INFER_PROTOCOL is unset: HTTP if an HTTP
    # endpoint is configured, otherwise gRPC if a gRPC endpoint is configured,
    # otherwise empty.
    infer_protocol = env.get(
        f"{prefix}_INFER_PROTOCOL",
        "http" if http_endpoint else "grpc" if grpc_endpoint else "",
    )
    function_id = env.get(f"{prefix}_FUNCTION_ID", "")

    logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
    logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
    logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
    logger.info(f"{prefix}_FUNCTION_ID: {function_id}")

    return grpc_endpoint, http_endpoint, auth_token, infer_protocol, function_id
160
-
161
-
162
def add_metadata_injector_stage(pipeline, default_cpu_count, stage_name="metadata_injector"):
    """Register the metadata-injection stage on ``pipeline``.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    injector_config = MetadataInjectorSchema()

    # Replica range 0..1.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": MetadataInjectionStage,
        "config": injector_config,
        "min_replicas": 0,
        "max_replicas": 1,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
175
-
176
-
177
def add_pdf_extractor_stage(pipeline, default_cpu_count, stage_name="pdf_extractor"):
    """Register the PDF extractor stage, sizing its replica ceiling from system memory.

    The ceiling is the number of 10,000 MB slices that fit into total system
    memory scaled by ``DYNAMIC_MEMORY_THRESHOLD``, capped at
    ``default_cpu_count`` and never below 1. Endpoint settings are resolved
    through ``get_nim_service`` for the ``yolox`` and ``nemoretriever_parse``
    prefixes.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Upper bound on the replica ceiling.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    # Total system memory expressed in MiB.
    system_memory_mb = psutil.virtual_memory().total / (1024**2)

    # Memory budget for this stage, then how many 10,000 MB workers fit in it.
    stage_budget_mb = system_memory_mb * DYNAMIC_MEMORY_THRESHOLD
    replicas_by_memory = int(stage_budget_mb / 10_000.0)

    # Never below one replica, never above the CPU count.
    replica_ceiling = max(1, min(replicas_by_memory, default_cpu_count))

    yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
    parse_grpc, parse_http, parse_auth, parse_protocol = get_nim_service("nemoretriever_parse")
    parse_model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")

    pdfium_config = {
        "auth_token": yolox_auth,
        "yolox_endpoints": (yolox_grpc, yolox_http),
        "yolox_infer_protocol": yolox_protocol,
    }
    # The parse configuration carries the yolox endpoints as well.
    nemoretriever_parse_config = {
        "auth_token": parse_auth,
        "nemoretriever_parse_endpoints": (parse_grpc, parse_http),
        "nemoretriever_parse_infer_protocol": parse_protocol,
        "nemoretriever_parse_model_name": parse_model_name,
        "yolox_endpoints": (yolox_grpc, yolox_http),
        "yolox_infer_protocol": yolox_protocol,
    }
    extractor_config = PDFExtractorSchema(
        pdfium_config=pdfium_config,
        nemoretriever_parse_config=nemoretriever_parse_config,
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=PDFExtractorStage,
        config=extractor_config,
        min_replicas=0,
        max_replicas=replica_ceiling,
    )

    return stage_name
220
-
221
-
222
def add_table_extractor_stage(pipeline, default_cpu_count, stage_name="table_extractor"):
    """Register the table extractor stage on ``pipeline``.

    Endpoint settings are resolved through ``get_nim_service`` for the
    ``yolox_table_structure`` and ``ocr`` prefixes. The replica ceiling is
    computed by ``_get_max_replicas`` with 20% of the CPU count and a limit
    of 4.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: CPU count forwarded to ``_get_max_replicas``.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    structure_grpc, structure_http, auth_token, structure_protocol = get_nim_service("yolox_table_structure")
    ocr_grpc, ocr_http, _ocr_auth, ocr_protocol = get_nim_service("ocr")

    endpoint_config = {
        "yolox_endpoints": (structure_grpc, structure_http),
        "yolox_infer_protocol": structure_protocol,
        "ocr_endpoints": (ocr_grpc, ocr_http),
        "ocr_infer_protocol": ocr_protocol,
        # Token from the table-structure lookup; the OCR lookup's token is not used.
        "auth_token": auth_token,
    }
    table_extractor_config = TableExtractorSchema(endpoint_config=endpoint_config)

    pipeline.add_stage(
        name=stage_name,
        stage_actor=TableExtractorStage,
        config=table_extractor_config,
        min_replicas=0,
        max_replicas=_get_max_replicas(default_cpu_count, percentage_of_cpu=0.20, replica_limit=4),
    )

    return stage_name
249
-
250
-
251
def add_chart_extractor_stage(pipeline, default_cpu_count, stage_name="chart_extractor"):
    """Register the chart extractor stage on ``pipeline``.

    Endpoint settings are resolved through ``get_nim_service`` for the
    ``yolox_graphic_elements`` and ``ocr`` prefixes. The replica ceiling is
    computed by ``_get_max_replicas`` with 20% of the CPU count and a limit
    of 4.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: CPU count forwarded to ``_get_max_replicas``.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    elements_grpc, elements_http, auth_token, elements_protocol = get_nim_service("yolox_graphic_elements")
    ocr_grpc, ocr_http, _ocr_auth, ocr_protocol = get_nim_service("ocr")

    endpoint_config = {
        "yolox_endpoints": (elements_grpc, elements_http),
        "yolox_infer_protocol": elements_protocol,
        "ocr_endpoints": (ocr_grpc, ocr_http),
        "ocr_infer_protocol": ocr_protocol,
        # Token from the graphic-elements lookup; the OCR lookup's token is not used.
        "auth_token": auth_token,
    }
    chart_extractor_config = ChartExtractorSchema(endpoint_config=endpoint_config)

    pipeline.add_stage(
        name=stage_name,
        stage_actor=ChartExtractorStage,
        config=chart_extractor_config,
        min_replicas=0,
        max_replicas=_get_max_replicas(default_cpu_count, percentage_of_cpu=0.20, replica_limit=4),
    )

    return stage_name
278
-
279
-
280
def add_infographic_extractor_stage(pipeline, default_cpu_count, stage_name="infographic_extractor"):
    """Register the infographic extractor stage on ``pipeline``.

    OCR endpoint settings are resolved through ``get_nim_service("ocr")``.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")

    endpoint_config = {
        "ocr_endpoints": (ocr_grpc, ocr_http),
        "ocr_infer_protocol": ocr_protocol,
        "auth_token": ocr_auth,
    }
    infographic_config = InfographicExtractorSchema(endpoint_config=endpoint_config)

    # Replica range 0..2.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=InfographicExtractorStage,
        config=infographic_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
302
-
303
-
304
def add_image_extractor_stage(pipeline, default_cpu_count, stage_name="image_extractor"):
    """Register the image extractor stage on ``pipeline``.

    Endpoint settings are resolved through ``get_nim_service("yolox")``.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    grpc_endpoint, http_endpoint, auth_token, protocol = get_nim_service("yolox")

    image_extractor_config = ImageConfigSchema(
        yolox_endpoints=(grpc_endpoint, http_endpoint),
        yolox_infer_protocol=protocol,
        auth_token=auth_token,
    )

    # Replica range 0..2.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=ImageExtractorStage,
        config=image_extractor_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
324
-
325
-
326
def add_docx_extractor_stage(pipeline, default_cpu_count, stage_name="docx_extractor"):
    """Register the DOCX extractor stage on ``pipeline``.

    Endpoint settings are resolved through ``get_nim_service("yolox")``.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    grpc_endpoint, http_endpoint, auth_token, protocol = get_nim_service("yolox")

    extraction_settings = {
        "yolox_endpoints": (grpc_endpoint, http_endpoint),
        "yolox_infer_protocol": protocol,
        "auth_token": auth_token,
    }
    docx_config = DocxExtractorSchema(docx_extraction_config=extraction_settings)

    # Replica range 0..2.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=DocxExtractorStage,
        config=docx_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
346
-
347
-
348
def add_pptx_extractor_stage(pipeline, default_cpu_count, stage_name="pptx_extractor"):
    """Register the PPTX extractor stage on ``pipeline``.

    Endpoint settings are resolved through ``get_nim_service("yolox")``.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    grpc_endpoint, http_endpoint, auth_token, protocol = get_nim_service("yolox")

    extraction_settings = {
        "yolox_endpoints": (grpc_endpoint, http_endpoint),
        "yolox_infer_protocol": protocol,
        "auth_token": auth_token,
    }
    pptx_config = PPTXExtractorSchema(pptx_extraction_config=extraction_settings)

    # Replica range 0..2.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=PPTXExtractorStage,
        config=pptx_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
368
-
369
-
370
def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_extractor"):
    """Register the audio extractor stage on ``pipeline``.

    Endpoint settings are resolved through
    ``get_audio_retrieval_service("audio")``.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    grpc_endpoint, http_endpoint, auth_token, protocol, function_id = get_audio_retrieval_service("audio")

    extraction_settings = {
        "audio_endpoints": (grpc_endpoint, http_endpoint),
        "audio_infer_protocol": protocol,
        "function_id": function_id,
        "auth_token": auth_token,
    }
    audio_extractor_config = AudioExtractorSchema(audio_extraction_config=extraction_settings)

    # Replica range 0..2.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=AudioExtractorStage,
        config=audio_extractor_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
390
-
391
-
392
def add_html_extractor_stage(pipeline, default_cpu_count, stage_name="html_extractor"):
    """Register the HTML extractor stage on ``pipeline`` with a default-constructed config.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    html_config = HtmlExtractorSchema()

    # Replica range 0..2.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": HtmlExtractorStage,
        "config": html_config,
        "min_replicas": 0,
        "max_replicas": 2,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
403
-
404
-
405
def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
    """Register the OpenTelemetry tracer stage on ``pipeline``.

    The collector endpoint is read from ``OTEL_EXPORTER_OTLP_ENDPOINT`` and
    falls back to ``http://localhost:4317``.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
    tracer_config = OpenTelemetryTracerSchema(otel_endpoint=endpoint)

    # Replica range 0..1.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=OpenTelemetryTracerStage,
        config=tracer_config,
        min_replicas=0,
        max_replicas=1,
    )

    return stage_name
424
-
425
-
426
def add_image_dedup_stage(pipeline, default_cpu_count, stage_name="image_dedup"):
    """Register the image-dedup stage on ``pipeline`` with a default-constructed config.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    dedup_config = ImageDedupSchema()

    # Replica range 0..1.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": ImageDedupStage,
        "config": dedup_config,
        "min_replicas": 0,
        "max_replicas": 1,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
438
-
439
-
440
def add_image_filter_stage(pipeline, default_cpu_count, stage_name="image_filter"):
    """Register the image-filter stage on ``pipeline`` with a default-constructed config.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    filter_config = ImageFilterSchema()

    # Replica range 0..1.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": ImageFilterStage,
        "config": filter_config,
        "min_replicas": 0,
        "max_replicas": 1,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
452
-
453
-
454
def add_text_splitter_stage(pipeline, default_cpu_count, stage_name="text_splitter"):
    """Register the text-splitter stage on ``pipeline`` with a default-constructed config.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    splitter_config = TextSplitterSchema()

    # Replica range 0..2.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": TextSplitterStage,
        "config": splitter_config,
        "min_replicas": 0,
        "max_replicas": 2,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
468
-
469
-
470
def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_caption"):
    """Register the image-caption transform stage on ``pipeline``.

    Environment variables read:
        * ``NVIDIA_API_KEY``, then ``NGC_API_KEY`` if the first is unset or empty.
        * ``VLM_CAPTION_ENDPOINT`` (default ``localhost:5000``).
        * ``VLM_CAPTION_MODEL_NAME``
          (default ``nvidia/llama-3.1-nemotron-nano-vl-8b-v1``).

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    env = os.environ
    api_key = env.get("NVIDIA_API_KEY", "") or env.get("NGC_API_KEY", "")
    caption_endpoint = env.get("VLM_CAPTION_ENDPOINT", "localhost:5000")
    caption_model = env.get("VLM_CAPTION_MODEL_NAME", "nvidia/llama-3.1-nemotron-nano-vl-8b-v1")

    caption_config = ImageCaptionExtractionSchema(
        api_key=api_key,
        endpoint_url=caption_endpoint,
        model_name=caption_model,
        prompt="Caption the content of this image:",
    )

    # Replica range 0..1.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=ImageCaptionTransformStage,
        config=caption_config,
        min_replicas=0,
        max_replicas=1,
    )

    return stage_name
500
-
501
-
502
def add_text_embedding_stage(pipeline, default_cpu_count, stage_name="text_embedding"):
    """Register the text-embedding transform stage on ``pipeline``.

    Environment variables read:
        * ``NVIDIA_API_KEY``, then ``NGC_API_KEY`` if the first is unset or empty.
        * ``EMBEDDING_NIM_ENDPOINT`` (default ``http://embedding:8000/v1``).
        * ``EMBEDDING_NIM_MODEL_NAME``
          (default ``nvidia/llama-3.2-nv-embedqa-1b-v2``).

    The replica ceiling is computed by ``_get_max_replicas`` with 7% of the
    CPU count and a limit of 4.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: CPU count forwarded to ``_get_max_replicas``.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    token = os.environ.get("NVIDIA_API_KEY", "") or os.environ.get("NGC_API_KEY", "")
    nim_endpoint = os.getenv("EMBEDDING_NIM_ENDPOINT", "http://embedding:8000/v1")
    model = os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")

    embedding_config = TextEmbeddingSchema(
        api_key=token,
        embedding_nim_endpoint=nim_endpoint,
        embedding_model=model,
    )

    pipeline.add_stage(
        name=stage_name,
        stage_actor=TextEmbeddingTransformStage,
        config=embedding_config,
        min_replicas=0,
        max_replicas=_get_max_replicas(default_cpu_count, percentage_of_cpu=0.07, replica_limit=4),
    )

    return stage_name
530
-
531
-
532
def add_embedding_storage_stage(pipeline, default_cpu_count, stage_name="embedding_storage"):
    """Register the embedding-storage stage on ``pipeline`` with a default-constructed config.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    storage_config = EmbeddingStorageSchema()

    # Replica range 0..1.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": EmbeddingStorageStage,
        "config": storage_config,
        "min_replicas": 0,
        "max_replicas": 1,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
544
-
545
-
546
def add_image_storage_stage(pipeline, default_cpu_count, stage_name="image_storage"):
    """Register the image-storage stage on ``pipeline`` with a default-constructed config.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    storage_config = ImageStorageModuleSchema()

    # Replica range 0..1.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": ImageStorageStage,
        "config": storage_config,
        "min_replicas": 0,
        "max_replicas": 1,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
557
-
558
-
559
def add_default_drain_stage(pipeline, default_cpu_count, stage_name="pipeline_drain"):
    """Register the default drain sink on ``pipeline`` with no config and exactly one replica.

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    # min == max == 1 pins the replica count at one.
    stage_kwargs = {
        "name": stage_name,
        "stage_actor": DefaultDrainSink,
        "config": None,
        "min_replicas": 1,
        "max_replicas": 1,
    }
    pipeline.add_stage(**stage_kwargs)

    return stage_name
569
-
570
-
571
def add_message_broker_response_stage(pipeline, default_cpu_count, stage_name="broker_response"):
    """Register the message-broker sink stage on ``pipeline``.

    Broker settings come from ``MESSAGE_CLIENT_HOST`` (default
    ``localhost``), ``MESSAGE_CLIENT_PORT`` (default ``6379``) and
    ``MESSAGE_CLIENT_TYPE`` (default ``redis``).

    Args:
        pipeline: Object exposing ``add_stage``; the stage is added to it.
        default_cpu_count: Not used by this builder.
        stage_name: Name under which the stage is registered.

    Returns:
        The ``stage_name`` that was registered.
    """
    env = os.environ
    broker_client = {
        "host": env.get("MESSAGE_CLIENT_HOST", "localhost"),
        # The port is passed to the config as the string read from the environment.
        "port": env.get("MESSAGE_CLIENT_PORT", "6379"),
        "client_type": env.get("MESSAGE_CLIENT_TYPE", "redis"),
    }
    sink_config = MessageBrokerTaskSinkConfig(broker_client=broker_client)

    # Replica range 0..2.
    pipeline.add_stage(
        name=stage_name,
        stage_actor=MessageBrokerTaskSinkStage,
        config=sink_config,
        min_replicas=0,
        max_replicas=2,
    )

    return stage_name
595
-
596
-
597
def add_source_stage(pipeline, default_cpu_count, source_name="pipeline_source"):
    """Register the message-broker source on ``pipeline`` and start the simple broker if selected.

    Broker settings come from ``MESSAGE_CLIENT_HOST`` (default
    ``localhost``), ``MESSAGE_CLIENT_PORT`` (default ``6379``),
    ``MESSAGE_CLIENT_TYPE`` (default ``redis``) and ``MESSAGE_CLIENT_QUEUE``
    (default ``ingest_task_queue``). When the validated client type is
    ``"simple"``, ``start_simple_message_broker`` is called with the dumped
    broker-client config after the source has been registered.

    Args:
        pipeline: Object exposing ``add_source``; the source is added to it.
        default_cpu_count: Not used by this builder.
        source_name: Name under which the source is registered.

    Returns:
        The ``source_name`` that was registered.
    """
    env = os.environ
    broker_host = env.get("MESSAGE_CLIENT_HOST", "localhost")
    broker_port = env.get("MESSAGE_CLIENT_PORT", "6379")
    broker_type = env.get("MESSAGE_CLIENT_TYPE", "redis")
    queue_name = env.get("MESSAGE_CLIENT_QUEUE", "ingest_task_queue")

    source_config = MessageBrokerTaskSourceConfig(
        broker_client={
            "host": broker_host,
            "port": broker_port,
            "client_type": broker_type,
        },
        task_queue=queue_name,
        # Passed as a string, exactly as before; any conversion is left to the config class.
        poll_interval="0.1",
    )

    # Exactly one source replica.
    pipeline.add_source(
        name=source_name,
        source_actor=MessageBrokerTaskSourceStage,
        config=source_config,
        min_replicas=1,
        max_replicas=1,
    )

    broker_settings = source_config.broker_client
    if broker_settings.client_type == "simple":
        start_simple_message_broker(broker_settings.model_dump())

    return source_name
629
-
630
-
631
- def _get_max_replicas(default_cpu_count=None, percentage_of_cpu=0.14, replica_limit=None):
632
- """
633
- Calculate max replicas based on CPU percentage with optional upper limit.
634
-
635
- Args:
636
- default_cpu_count (int, optional): CPU cores to use. Auto-detected if None.
637
- percentage_of_cpu (float, optional): CPU percentage to allocate. Defaults to 0.14.
638
- replica_limit (int, optional): Upper bound for replicas. Defaults to None.
639
-
640
- Returns:
641
- int: Maximum replicas, at least 1.
642
- """
643
- if default_cpu_count is None:
644
- default_cpu_count = _system_resource_probe.get_cpu_count()
645
-
646
- _max_replicas = int(max(1, (default_cpu_count * percentage_of_cpu)))
647
- if replica_limit is not None:
648
- _max_replicas = min(_max_replicas, replica_limit)
649
- return _max_replicas