nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of nv-ingest might be problematic.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,609 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # TODO(Devin)
+ # flake8: noqa
+ import os
+
+ import click
+ import logging
+
+ from nv_ingest.framework.orchestration.ray.stages.sinks.default_drain import DefaultDrainSink
+ from nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer import OpenTelemetryTracerStage
+ from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
+
+ # Import our new pipeline class.
+ from nv_ingest.framework.orchestration.ray.stages.extractors.audio_extractor import AudioExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.chart_extractor import ChartExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.docx_extractor import DocxExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.image_extractor import ImageExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extractor import InfographicExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
+
+ from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
+ from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
+ from nv_ingest.framework.orchestration.ray.stages.mutate.image_filter import ImageFilterStage
+ from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink import (
+     MessageBrokerTaskSinkStage,
+     MessageBrokerTaskSinkConfig,
+ )
+ from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
+     MessageBrokerTaskSourceStage,
+     MessageBrokerTaskSourceConfig,
+     start_simple_message_broker,
+ )
+ from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
+ from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
+ from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
+ from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
+ from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
+ from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
+ from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
+ from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
+ from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
+ from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
+ from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
+ from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
+
+ logger = logging.getLogger(__name__)
+
+ _system_resource_probe = SystemResourceProbe()
+
+
+ def validate_positive(ctx, param, value):
+     if value <= 0:
+         raise click.BadParameter("must be a positive integer")
+     return value
+
+
+ def get_message_provider_config():
+     message_provider_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
+     message_provider_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
+
+     logger.info(f"MESSAGE_CLIENT_HOST: {message_provider_host}")
+     logger.info(f"MESSAGE_CLIENT_PORT: {message_provider_port}")
+
+     return message_provider_host, message_provider_port
+
+
+ def get_caption_classifier_service():
+     triton_service_caption_classifier = os.environ.get(
+         "CAPTION_CLASSIFIER_GRPC_TRITON",
+         "",
+     )
+     triton_service_caption_classifier_name = os.environ.get(
+         "CAPTION_CLASSIFIER_MODEL_NAME",
+         "",
+     )
+
+     logger.info(f"CAPTION_CLASSIFIER_GRPC_TRITON: {triton_service_caption_classifier}")
+
+     return triton_service_caption_classifier, triton_service_caption_classifier_name
+
+
+ def get_nim_service(env_var_prefix):
+     prefix = env_var_prefix.upper()
+     grpc_endpoint = os.environ.get(
+         f"{prefix}_GRPC_ENDPOINT",
+         "",
+     )
+     http_endpoint = os.environ.get(
+         f"{prefix}_HTTP_ENDPOINT",
+         "",
+     )
+     auth_token = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+
+     infer_protocol = os.environ.get(
+         f"{prefix}_INFER_PROTOCOL",
+         "http" if http_endpoint else "grpc" if grpc_endpoint else "",
+     )
+
+     logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
+     logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
+     logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
+
+     return grpc_endpoint, http_endpoint, auth_token, infer_protocol
+
+
+ def get_audio_retrieval_service(env_var_prefix):
+     prefix = env_var_prefix.upper()
+     grpc_endpoint = os.environ.get(
+         "AUDIO_GRPC_ENDPOINT",
+         "",
+     )
+     http_endpoint = os.environ.get(
+         "AUDIO_HTTP_ENDPOINT",
+         "",
+     )
+     auth_token = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+     infer_protocol = os.environ.get(
+         "AUDIO_INFER_PROTOCOL",
+         "http" if http_endpoint else "grpc" if grpc_endpoint else "",
+     )
+     function_id = os.environ.get(
+         "AUDIO_FUNCTION_ID",
+         "",
+     )
+
+     logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
+     logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
+     logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
+     logger.info(f"{prefix}_FUNCTION_ID: {function_id}")
+
+     return grpc_endpoint, http_endpoint, auth_token, infer_protocol, function_id
+
+
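Judging by the +609 line count in the file list, this hunk appears to be nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py. The helpers above resolve NIM endpoints purely from environment variables named after a service prefix. The short sketch below exercises that convention; it is illustrative only: the endpoint URL is a placeholder and the import path is inferred from the file list, not confirmed by this diff.

    # Illustrative sketch; the endpoint value is a placeholder and the import path is inferred
    # from the file list above, not confirmed by this diff.
    import os

    os.environ["YOLOX_HTTP_ENDPOINT"] = "http://page-elements:8000/v1/infer"  # placeholder URL
    os.environ.pop("YOLOX_GRPC_ENDPOINT", None)
    os.environ.pop("YOLOX_INFER_PROTOCOL", None)

    from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import get_nim_service

    grpc_ep, http_ep, auth_token, protocol = get_nim_service("yolox")
    print(protocol)  # "http": with no explicit override, the protocol falls back to whichever endpoint is set
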
+ def add_metadata_injector_stage(pipeline, default_cpu_count, stage_name="metadata_injector"):
+     _ = default_cpu_count  # Placeholder for future use
+     config = MetadataInjectorSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=MetadataInjectionStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_pdf_extractor_stage(pipeline, default_cpu_count, stage_name="pdf_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+     nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
+         get_nim_service("nemoretriever_parse")
+     )
+     model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
+
+     extractor_config = PDFExtractorSchema(
+         **{
+             "pdfium_config": {
+                 "auth_token": yolox_auth,  # All auth tokens are the same for the moment
+                 "yolox_endpoints": (yolox_grpc, yolox_http),
+                 "yolox_infer_protocol": yolox_protocol,
+             },
+             "nemoretriever_parse_config": {
+                 "auth_token": nemoretriever_parse_auth,
+                 "nemoretriever_parse_endpoints": (nemoretriever_parse_grpc, nemoretriever_parse_http),
+                 "nemoretriever_parse_infer_protocol": nemoretriever_parse_protocol,
+                 "nemoretriever_parse_model_name": model_name,
+                 "yolox_endpoints": (yolox_grpc, yolox_http),
+                 "yolox_infer_protocol": yolox_protocol,
+             },
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=PDFExtractorStage,
+         config=extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 3))),  # 33% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_table_extractor_stage(pipeline, default_cpu_count, stage_name="table_extractor"):
+     yolox_table_structure_grpc, yolox_table_structure_http, yolox_auth, yolox_table_structure_protocol = (
+         get_nim_service("yolox_table_structure")
+     )
+     paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
+
+     table_extractor_config = TableExtractorSchema(
+         **{
+             "endpoint_config": {
+                 "yolox_endpoints": (yolox_table_structure_grpc, yolox_table_structure_http),
+                 "yolox_infer_protocol": yolox_table_structure_protocol,
+                 "paddle_endpoints": (paddle_grpc, paddle_http),
+                 "paddle_infer_protocol": paddle_protocol,
+                 "auth_token": yolox_auth,
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=TableExtractorStage,
+         config=table_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 7))),  # 14% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_chart_extractor_stage(pipeline, default_cpu_count, stage_name="chart_extractor"):
+     yolox_graphic_elements_grpc, yolox_graphic_elements_http, yolox_auth, yolox_graphic_elements_protocol = (
+         get_nim_service("yolox_graphic_elements")
+     )
+     paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
+
+     chart_extractor_config = ChartExtractorSchema(
+         **{
+             "endpoint_config": {
+                 "yolox_endpoints": (yolox_graphic_elements_grpc, yolox_graphic_elements_http),
+                 "yolox_infer_protocol": yolox_graphic_elements_protocol,
+                 "paddle_endpoints": (paddle_grpc, paddle_http),
+                 "paddle_infer_protocol": paddle_protocol,
+                 "auth_token": yolox_auth,
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ChartExtractorStage,
+         config=chart_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 7))),  # 14% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_infographic_extractor_stage(pipeline, default_cpu_count, stage_name="infographic_extractor"):
+     paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
+
+     infographic_content_extractor_config = InfographicExtractorSchema(
+         **{
+             "endpoint_config": {
+                 "paddle_endpoints": (paddle_grpc, paddle_http),
+                 "paddle_infer_protocol": paddle_protocol,
+                 "auth_token": paddle_auth,
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=InfographicExtractorStage,
+         config=infographic_content_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_image_extractor_stage(pipeline, default_cpu_count, stage_name="image_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+
+     image_extractor_config = ImageConfigSchema(
+         **{
+             "yolox_endpoints": (yolox_grpc, yolox_http),
+             "yolox_infer_protocol": yolox_protocol,
+             "auth_token": yolox_auth,  # All auth tokens are the same for the moment
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageExtractorStage,
+         config=image_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_docx_extractor_stage(pipeline, default_cpu_count, stage_name="docx_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+
+     docx_extractor_config = {
+         "docx_extraction_config": {
+             "yolox_endpoints": (yolox_grpc, yolox_http),
+             "yolox_infer_protocol": yolox_protocol,
+             "auth_token": yolox_auth,
+         }
+     }
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=DocxExtractorStage,
+         config=DocxExtractorSchema(**docx_extractor_config),
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_pptx_extractor_stage(pipeline, default_cpu_count, stage_name="pptx_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+
+     pptx_extractor_config = {
+         "pptx_extraction_config": {
+             "yolox_endpoints": (yolox_grpc, yolox_http),
+             "yolox_infer_protocol": yolox_protocol,
+             "auth_token": yolox_auth,
+         }
+     }
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=PPTXExtractorStage,
+         config=PPTXExtractorSchema(**pptx_extractor_config),
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_extractor"):
+     audio_grpc, audio_http, audio_auth, audio_infer_protocol, audio_function_id = get_audio_retrieval_service("audio")
+
+     audio_extractor_config = AudioExtractorSchema(
+         **{
+             "audio_extraction_config": {
+                 "audio_endpoints": (audio_grpc, audio_http),
+                 "audio_infer_protocol": audio_infer_protocol,
+                 "function_id": audio_function_id,
+                 "auth_token": audio_auth,
+                 # All auth tokens are the same for the moment
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=AudioExtractorStage,
+         config=audio_extractor_config,
+         min_replicas=0,
+         max_replicas=1,  # Audio extraction is a heavy IO bound operation with minimal CPU usage
+     )
+
+     return stage_name
+
+
+ def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
+     _ = default_cpu_count  # Placeholder for future use
+     otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
+
+     otel_tracer_config = OpenTelemetryTracerSchema(
+         **{
+             "otel_endpoint": otel_endpoint,
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=OpenTelemetryTracerStage,
+         config=otel_tracer_config,
+         min_replicas=0,
+         max_replicas=2,
+     )
+
+     return stage_name
+
+
+ def add_image_dedup_stage(pipeline, default_cpu_count, stage_name="image_dedup"):
+     config = ImageDedupSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageDedupStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_image_filter_stage(pipeline, default_cpu_count, stage_name="image_filter"):
+     config = ImageFilterSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageFilterStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_text_splitter_stage(pipeline, default_cpu_count, stage_name="text_splitter"):
+     _ = default_cpu_count
+
+     config = TextSplitterSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=TextSplitterStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_caption"):
+     auth_token = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+
+     endpoint_url = os.environ.get("VLM_CAPTION_ENDPOINT", "localhost:5000")
+     model_name = os.environ.get("VLM_CAPTION_MODEL_NAME", "meta/llama-3.2-11b-vision-instruct")
+
+     config = ImageCaptionExtractionSchema(
+         **{
+             "api_key": auth_token,
+             "endpoint_url": endpoint_url,
+             "image_caption_model_name": model_name,
+             "prompt": "Caption the content of this image:",
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageCaptionTransformStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_text_embedding_stage(pipeline, default_cpu_count, stage_name="text_embedding"):
+     api_key = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+     embedding_nim_endpoint = os.getenv("EMBEDDING_NIM_ENDPOINT", "http://embedding:8000/v1")
+     embedding_model = os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")
+
+     config = TextEmbeddingSchema(
+         **{
+             "api_key": api_key,
+             "embedding_nim_endpoint": embedding_nim_endpoint,
+             "embedding_model": embedding_model,
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=TextEmbeddingTransformStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_embedding_storage_stage(pipeline, default_cpu_count, stage_name="embedding_storage"):
+     config = EmbeddingStorageSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=EmbeddingStorageStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_image_storage_stage(pipeline, default_cpu_count, stage_name="image_storage"):
+     config = ImageStorageModuleSchema()
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageStorageStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_default_drain_stage(pipeline, default_cpu_count, stage_name="pipeline_drain"):
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=DefaultDrainSink,
+         config=None,
+         min_replicas=1,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_message_broker_response_stage(pipeline, default_cpu_count, stage_name="broker_response"):
+     task_broker_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
+     task_broker_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
+     client_type = os.environ.get("MESSAGE_CLIENT_TYPE", "redis")
+
+     sink_config = MessageBrokerTaskSinkConfig(
+         **{
+             "broker_client": {
+                 "host": task_broker_host,
+                 "port": task_broker_port,
+                 "client_type": client_type,
+             },
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=MessageBrokerTaskSinkStage,
+         config=sink_config,
+         min_replicas=0,
+         max_replicas=2,
+     )
+
+     return stage_name
+
+
+ def add_source_stage(pipeline, default_cpu_count, source_name="pipeline_source"):
+     _ = default_cpu_count  # Placeholder for future use
+     task_broker_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
+     task_broker_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
+
+     client_type = os.environ.get("MESSAGE_CLIENT_TYPE", "redis")
+     task_queue_name = os.environ.get("MESSAGE_CLIENT_QUEUE", "ingest_task_queue")
+
+     source_config = MessageBrokerTaskSourceConfig(
+         **{
+             "broker_client": {
+                 "host": task_broker_host,
+                 "port": task_broker_port,
+                 "client_type": client_type,
+             },
+             "task_queue": task_queue_name,
+             "poll_interval": "0.1",
+         }
+     )
+
+     pipeline.add_source(
+         name=source_name,
+         source_actor=MessageBrokerTaskSourceStage,
+         config=source_config,
+         min_replicas=1,
+         max_replicas=1,
+     )
+
+     if source_config.broker_client.client_type == "simple":
+         start_simple_message_broker(source_config.broker_client.model_dump())
+
+     return source_name
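
Taken together, each builder registers one stage and returns its name; how the stages are chained into a graph lives in pipeline_builders.py and ray_pipeline.py, which are not shown in this hunk. A minimal assembly sketch follows, assuming a RayPipeline class with the add_stage/add_source interface used above; the import path and constructor are assumptions inferred from the file list, and edge wiring is omitted.

    # Minimal assembly sketch; RayPipeline import path and constructor are assumed,
    # not confirmed by this diff, and edge wiring between stages is omitted.
    import os

    from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline  # assumed path
    from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
        add_source_stage,
        add_metadata_injector_stage,
        add_pdf_extractor_stage,
        add_message_broker_response_stage,
        add_default_drain_stage,
    )

    default_cpu_count = os.cpu_count() or 1

    pipeline = RayPipeline()  # assumed constructor
    add_source_stage(pipeline, default_cpu_count)
    add_metadata_injector_stage(pipeline, default_cpu_count)
    add_pdf_extractor_stage(pipeline, default_cpu_count)
    add_message_broker_response_stage(pipeline, default_cpu_count)
    add_default_drain_stage(pipeline, default_cpu_count)
    # Connecting these stages with queue edges is handled by pipeline_builders.py (not shown here).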
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,59 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import time
+ import uuid
+ import psutil
+ import ray
+
+
+ def estimate_actor_memory_overhead(
+     actor_class, iterations=1, stabilization_threshold=1 * 1024 * 1024, wait_time=2, actor_args=None, actor_kwargs=None
+ ):
+     """
+     Estimate the additional system memory overhead when launching a Ray actor of the given actor_class.
+
+     Parameters:
+         actor_class: A Ray remote actor class.
+         iterations (int): Number of measurement iterations.
+         stabilization_threshold (int): Maximum difference (in bytes) between min and max measurements to
+             consider results stable.
+         wait_time (float): Seconds to wait after spawning or killing an actor for memory to stabilize.
+         actor_args (list): Positional arguments to pass to the actor's remote() call.
+         actor_kwargs (dict): Keyword arguments to pass to the actor's remote() call.
+
+     Returns:
+         float: Estimated average overhead in bytes for replicating the actor.
+     """
+     actor_args = actor_args if actor_args is not None else []
+     actor_kwargs = actor_kwargs if actor_kwargs is not None else {}
+
+     measurements = []
+
+     iterations = 0  # TODO
+     for i in range(iterations):
+         # Record baseline system memory usage.
+         baseline = psutil.virtual_memory().used
+
+         # Spin up a new actor with provided arguments.
+         actor = actor_class.options(name=f"mem_estimator_{uuid.uuid4()}").remote(*actor_args, **actor_kwargs)
+         # Allow time for the actor to start.
+         time.sleep(wait_time)
+
+         # Measure memory after actor has started.
+         after_spawn = psutil.virtual_memory().used
+         overhead = after_spawn - baseline
+         measurements.append(overhead)
+
+         # Kill the actor.
+         ray.kill(actor, no_restart=True)
+         # Allow time for system memory to be released.
+         time.sleep(wait_time)
+
+     if measurements:
+         _ = max(measurements) - min(measurements)
+         _ = sum(measurements) / len(measurements)
+
+     return 1_500_000_000
+     # return estimated_overhead; need to come up with a better way to estimate actor overhead.
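
Judging by its content and the +59 line count, this hunk appears to be nv_ingest/framework/orchestration/ray/util/system_tools/memory.py. As released, the function forces iterations to 0, so the measurement loop never runs and a fixed 1.5 GB estimate is returned. The sketch below shows how it would be invoked; ExampleActor is hypothetical and the import path is inferred from the file list, not confirmed by this diff.

    # Usage sketch; ExampleActor is hypothetical and the import path is inferred from the file list.
    import ray

    from nv_ingest.framework.orchestration.ray.util.system_tools.memory import estimate_actor_memory_overhead


    @ray.remote
    class ExampleActor:
        def ping(self):
            return "pong"


    ray.init(ignore_reinit_error=True)
    overhead_bytes = estimate_actor_memory_overhead(ExampleActor, iterations=3, wait_time=1)
    print(f"Estimated per-actor overhead: {overhead_bytes / 1e9:.2f} GB")  # currently always 1.50 GB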