nv-ingest 25.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest might be problematic.

Files changed (102)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +45 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/api/v1/metrics.py +29 -0
  8. nv_ingest/framework/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  12. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  13. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  14. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  15. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  18. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  19. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  20. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  22. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +591 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1322 -0
  24. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  25. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  34. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  35. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  36. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  41. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  42. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  44. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  45. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  47. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  48. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  49. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  52. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  53. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  56. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  60. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  61. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  62. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  64. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +200 -0
  68. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
  69. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +624 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  71. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  72. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  73. nv_ingest/framework/schemas/__init__.py +0 -0
  74. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  75. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  76. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  77. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  78. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  79. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  80. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  81. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  82. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  83. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  84. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  85. nv_ingest/framework/util/__init__.py +3 -0
  86. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  87. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  88. nv_ingest/framework/util/service/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  90. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  91. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  92. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  93. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  94. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  95. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  96. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  97. nv_ingest/version.py +38 -0
  98. nv_ingest-25.6.0.dist-info/METADATA +266 -0
  99. nv_ingest-25.6.0.dist-info/RECORD +102 -0
  100. nv_ingest-25.6.0.dist-info/WHEEL +5 -0
  101. nv_ingest-25.6.0.dist-info/licenses/LICENSE +201 -0
  102. nv_ingest-25.6.0.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py
@@ -0,0 +1,624 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # TODO(Devin)
+ # flake8: noqa
+ import os
+
+ import click
+ import logging
+
+ from nv_ingest.framework.orchestration.ray.stages.sinks.default_drain import DefaultDrainSink
+ from nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer import OpenTelemetryTracerStage
+ from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
+
+ # Import our new pipeline class.
+ from nv_ingest.framework.orchestration.ray.stages.extractors.audio_extractor import AudioExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.chart_extractor import ChartExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.docx_extractor import DocxExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.image_extractor import ImageExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extractor import InfographicExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
+ from nv_ingest.framework.orchestration.ray.stages.extractors.html_extractor import HtmlExtractorStage
+
+ from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
+ from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
+ from nv_ingest.framework.orchestration.ray.stages.mutate.image_filter import ImageFilterStage
+ from nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink import (
+     MessageBrokerTaskSinkStage,
+     MessageBrokerTaskSinkConfig,
+ )
+ from nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source import (
+     MessageBrokerTaskSourceStage,
+     MessageBrokerTaskSourceConfig,
+     start_simple_message_broker,
+ )
+ from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import ImageStorageStage
+ from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
+ from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
+ from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
+ from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
+ from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
+ from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
+ from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
+ from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
+ from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
+ from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
+ from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
+ from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
+
+ logger = logging.getLogger(__name__)
+
+ _system_resource_probe = SystemResourceProbe()
+
+
+ def validate_positive(ctx, param, value):
+     if value <= 0:
+         raise click.BadParameter("must be a positive integer")
+     return value
+
+
+ def get_message_provider_config():
+     message_provider_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
+     message_provider_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
+
+     logger.info(f"MESSAGE_CLIENT_HOST: {message_provider_host}")
+     logger.info(f"MESSAGE_CLIENT_PORT: {message_provider_port}")
+
+     return message_provider_host, message_provider_port
+
+
+ def get_caption_classifier_service():
+     triton_service_caption_classifier = os.environ.get(
+         "CAPTION_CLASSIFIER_GRPC_TRITON",
+         "",
+     )
+     triton_service_caption_classifier_name = os.environ.get(
+         "CAPTION_CLASSIFIER_MODEL_NAME",
+         "",
+     )
+
+     logger.info(f"CAPTION_CLASSIFIER_GRPC_TRITON: {triton_service_caption_classifier}")
+
+     return triton_service_caption_classifier, triton_service_caption_classifier_name
+
+
+ def get_nim_service(env_var_prefix):
+     prefix = env_var_prefix.upper()
+     grpc_endpoint = os.environ.get(
+         f"{prefix}_GRPC_ENDPOINT",
+         "",
+     )
+     http_endpoint = os.environ.get(
+         f"{prefix}_HTTP_ENDPOINT",
+         "",
+     )
+     auth_token = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+
+     infer_protocol = os.environ.get(
+         f"{prefix}_INFER_PROTOCOL",
+         "http" if http_endpoint else "grpc" if grpc_endpoint else "",
+     )
+
+     logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
+     logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
+     logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
+
+     return grpc_endpoint, http_endpoint, auth_token, infer_protocol
+
+
+ def get_audio_retrieval_service(env_var_prefix):
+     prefix = env_var_prefix.upper()
+     grpc_endpoint = os.environ.get(
+         "AUDIO_GRPC_ENDPOINT",
+         "",
+     )
+     http_endpoint = os.environ.get(
+         "AUDIO_HTTP_ENDPOINT",
+         "",
+     )
+     auth_token = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+     infer_protocol = os.environ.get(
+         "AUDIO_INFER_PROTOCOL",
+         "http" if http_endpoint else "grpc" if grpc_endpoint else "",
+     )
+     function_id = os.environ.get(
+         "AUDIO_FUNCTION_ID",
+         "",
+     )
+
+     logger.info(f"{prefix}_GRPC_ENDPOINT: {grpc_endpoint}")
+     logger.info(f"{prefix}_HTTP_ENDPOINT: {http_endpoint}")
+     logger.info(f"{prefix}_INFER_PROTOCOL: {infer_protocol}")
+     logger.info(f"{prefix}_FUNCTION_ID: {function_id}")
+
+     return grpc_endpoint, http_endpoint, auth_token, infer_protocol, function_id
+
+
+ def add_metadata_injector_stage(pipeline, default_cpu_count, stage_name="metadata_injector"):
+     _ = default_cpu_count  # Placeholder for future use
+     config = MetadataInjectorSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=MetadataInjectionStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_pdf_extractor_stage(pipeline, default_cpu_count, stage_name="pdf_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+     nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
+         get_nim_service("nemoretriever_parse")
+     )
+     model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
+
+     extractor_config = PDFExtractorSchema(
+         **{
+             "pdfium_config": {
+                 "auth_token": yolox_auth,  # All auth tokens are the same for the moment
+                 "yolox_endpoints": (yolox_grpc, yolox_http),
+                 "yolox_infer_protocol": yolox_protocol,
+             },
+             "nemoretriever_parse_config": {
+                 "auth_token": nemoretriever_parse_auth,
+                 "nemoretriever_parse_endpoints": (nemoretriever_parse_grpc, nemoretriever_parse_http),
+                 "nemoretriever_parse_infer_protocol": nemoretriever_parse_protocol,
+                 "nemoretriever_parse_model_name": model_name,
+                 "yolox_endpoints": (yolox_grpc, yolox_http),
+                 "yolox_infer_protocol": yolox_protocol,
+             },
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=PDFExtractorStage,
+         config=extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 3))),  # 33% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_table_extractor_stage(pipeline, default_cpu_count, stage_name="table_extractor"):
+     yolox_table_structure_grpc, yolox_table_structure_http, yolox_auth, yolox_table_structure_protocol = (
+         get_nim_service("yolox_table_structure")
+     )
+     paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
+
+     table_extractor_config = TableExtractorSchema(
+         **{
+             "endpoint_config": {
+                 "yolox_endpoints": (yolox_table_structure_grpc, yolox_table_structure_http),
+                 "yolox_infer_protocol": yolox_table_structure_protocol,
+                 "paddle_endpoints": (paddle_grpc, paddle_http),
+                 "paddle_infer_protocol": paddle_protocol,
+                 "auth_token": yolox_auth,
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=TableExtractorStage,
+         config=table_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 7))),  # 14% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_chart_extractor_stage(pipeline, default_cpu_count, stage_name="chart_extractor"):
+     yolox_graphic_elements_grpc, yolox_graphic_elements_http, yolox_auth, yolox_graphic_elements_protocol = (
+         get_nim_service("yolox_graphic_elements")
+     )
+     paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
+
+     chart_extractor_config = ChartExtractorSchema(
+         **{
+             "endpoint_config": {
+                 "yolox_endpoints": (yolox_graphic_elements_grpc, yolox_graphic_elements_http),
+                 "yolox_infer_protocol": yolox_graphic_elements_protocol,
+                 "paddle_endpoints": (paddle_grpc, paddle_http),
+                 "paddle_infer_protocol": paddle_protocol,
+                 "auth_token": yolox_auth,
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ChartExtractorStage,
+         config=chart_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 7))),  # 14% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_infographic_extractor_stage(pipeline, default_cpu_count, stage_name="infographic_extractor"):
+     paddle_grpc, paddle_http, paddle_auth, paddle_protocol = get_nim_service("paddle")
+
+     infographic_content_extractor_config = InfographicExtractorSchema(
+         **{
+             "endpoint_config": {
+                 "paddle_endpoints": (paddle_grpc, paddle_http),
+                 "paddle_infer_protocol": paddle_protocol,
+                 "auth_token": paddle_auth,
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=InfographicExtractorStage,
+         config=infographic_content_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_image_extractor_stage(pipeline, default_cpu_count, stage_name="image_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+
+     image_extractor_config = ImageConfigSchema(
+         **{
+             "yolox_endpoints": (yolox_grpc, yolox_http),
+             "yolox_infer_protocol": yolox_protocol,
+             "auth_token": yolox_auth,  # All auth tokens are the same for the moment
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageExtractorStage,
+         config=image_extractor_config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_docx_extractor_stage(pipeline, default_cpu_count, stage_name="docx_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+
+     docx_extractor_config = {
+         "docx_extraction_config": {
+             "yolox_endpoints": (yolox_grpc, yolox_http),
+             "yolox_infer_protocol": yolox_protocol,
+             "auth_token": yolox_auth,
+         }
+     }
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=DocxExtractorStage,
+         config=DocxExtractorSchema(**docx_extractor_config),
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_pptx_extractor_stage(pipeline, default_cpu_count, stage_name="pptx_extractor"):
+     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
+
+     pptx_extractor_config = {
+         "pptx_extraction_config": {
+             "yolox_endpoints": (yolox_grpc, yolox_http),
+             "yolox_infer_protocol": yolox_protocol,
+             "auth_token": yolox_auth,
+         }
+     }
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=PPTXExtractorStage,
+         config=PPTXExtractorSchema(**pptx_extractor_config),
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_extractor"):
+     audio_grpc, audio_http, audio_auth, audio_infer_protocol, audio_function_id = get_audio_retrieval_service("audio")
+
+     audio_extractor_config = AudioExtractorSchema(
+         **{
+             "audio_extraction_config": {
+                 "audio_endpoints": (audio_grpc, audio_http),
+                 "audio_infer_protocol": audio_infer_protocol,
+                 "function_id": audio_function_id,
+                 "auth_token": audio_auth,
+                 # All auth tokens are the same for the moment
+             }
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=AudioExtractorStage,
+         config=audio_extractor_config,
+         min_replicas=0,
+         max_replicas=1,  # Audio extraction is a heavy IO bound operation with minimal CPU usage
+     )
+
+     return stage_name
+
+
+ def add_html_extractor_stage(pipeline, default_cpu_count, stage_name="html_extractor"):
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=HtmlExtractorStage,
+         config=HtmlExtractorSchema(),
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
+     _ = default_cpu_count  # Placeholder for future use
+     otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
+
+     otel_tracer_config = OpenTelemetryTracerSchema(
+         **{
+             "otel_endpoint": otel_endpoint,
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=OpenTelemetryTracerStage,
+         config=otel_tracer_config,
+         min_replicas=0,
+         max_replicas=2,
+     )
+
+     return stage_name
+
+
+ def add_image_dedup_stage(pipeline, default_cpu_count, stage_name="image_dedup"):
+     config = ImageDedupSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageDedupStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_image_filter_stage(pipeline, default_cpu_count, stage_name="image_filter"):
+     config = ImageFilterSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageFilterStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_text_splitter_stage(pipeline, default_cpu_count, stage_name="text_splitter"):
+     _ = default_cpu_count
+
+     config = TextSplitterSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=TextSplitterStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_caption"):
+     auth_token = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+
+     endpoint_url = os.environ.get("VLM_CAPTION_ENDPOINT", "localhost:5000")
+     model_name = os.environ.get("VLM_CAPTION_MODEL_NAME", "meta/llama-3.2-11b-vision-instruct")
+
+     config = ImageCaptionExtractionSchema(
+         **{
+             "api_key": auth_token,
+             "endpoint_url": endpoint_url,
+             "model_name": model_name,
+             "prompt": "Caption the content of this image:",
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageCaptionTransformStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_text_embedding_stage(pipeline, default_cpu_count, stage_name="text_embedding"):
+     api_key = os.environ.get(
+         "NVIDIA_BUILD_API_KEY",
+         "",
+     ) or os.environ.get(
+         "NGC_API_KEY",
+         "",
+     )
+     embedding_nim_endpoint = os.getenv("EMBEDDING_NIM_ENDPOINT", "http://embedding:8000/v1")
+     embedding_model = os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")
+
+     config = TextEmbeddingSchema(
+         **{
+             "api_key": api_key,
+             "embedding_nim_endpoint": embedding_nim_endpoint,
+             "embedding_model": embedding_model,
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=TextEmbeddingTransformStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+     )
+
+     return stage_name
+
+
+ def add_embedding_storage_stage(pipeline, default_cpu_count, stage_name="embedding_storage"):
+     config = EmbeddingStorageSchema()
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=EmbeddingStorageStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_image_storage_stage(pipeline, default_cpu_count, stage_name="image_storage"):
+     config = ImageStorageModuleSchema()
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=ImageStorageStage,
+         config=config,
+         min_replicas=0,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_default_drain_stage(pipeline, default_cpu_count, stage_name="pipeline_drain"):
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=DefaultDrainSink,
+         config=None,
+         min_replicas=1,
+         max_replicas=1,
+     )
+
+     return stage_name
+
+
+ def add_message_broker_response_stage(pipeline, default_cpu_count, stage_name="broker_response"):
+     task_broker_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
+     task_broker_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
+     client_type = os.environ.get("MESSAGE_CLIENT_TYPE", "redis")
+
+     sink_config = MessageBrokerTaskSinkConfig(
+         **{
+             "broker_client": {
+                 "host": task_broker_host,
+                 "port": task_broker_port,
+                 "client_type": client_type,
+             },
+         }
+     )
+
+     pipeline.add_stage(
+         name=stage_name,
+         stage_actor=MessageBrokerTaskSinkStage,
+         config=sink_config,
+         min_replicas=0,
+         max_replicas=2,
+     )
+
+     return stage_name
+
+
+ def add_source_stage(pipeline, default_cpu_count, source_name="pipeline_source"):
+     _ = default_cpu_count  # Placeholder for future use
+     task_broker_host = os.environ.get("MESSAGE_CLIENT_HOST", "localhost")
+     task_broker_port = os.environ.get("MESSAGE_CLIENT_PORT", "6379")
+
+     client_type = os.environ.get("MESSAGE_CLIENT_TYPE", "redis")
+     task_queue_name = os.environ.get("MESSAGE_CLIENT_QUEUE", "ingest_task_queue")
+
+     source_config = MessageBrokerTaskSourceConfig(
+         **{
+             "broker_client": {
+                 "host": task_broker_host,
+                 "port": task_broker_port,
+                 "client_type": client_type,
+             },
+             "task_queue": task_queue_name,
+             "poll_interval": "0.1",
+         }
+     )
+
+     pipeline.add_source(
+         name=source_name,
+         source_actor=MessageBrokerTaskSourceStage,
+         config=source_config,
+         min_replicas=1,
+         max_replicas=1,
+     )
+
+     if source_config.broker_client.client_type == "simple":
+         start_simple_message_broker(source_config.broker_client.model_dump())
+
+     return source_name
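
Each builder in this file follows the same convention: resolve service endpoints from environment variables (get_nim_service or get_audio_retrieval_service), register exactly one stage on the supplied pipeline object via pipeline.add_stage or pipeline.add_source, and return the stage name. The sketch below is editorial commentary, not part of the diff: the RayPipeline no-argument constructor and its importable name are assumptions based on the file listing, and os.cpu_count() merely stands in for the SystemResourceProbe-derived CPU budget; edge wiring between stages (handled in pipeline_builders.py, not shown here) is omitted.

    import os

    # Hypothetical endpoint values, purely for illustration; any <PREFIX>_GRPC_ENDPOINT /
    # <PREFIX>_HTTP_ENDPOINT pair works the same way through get_nim_service("<prefix>").
    os.environ["YOLOX_HTTP_ENDPOINT"] = "http://localhost:8000/v1/infer"
    os.environ["MESSAGE_CLIENT_TYPE"] = "simple"  # add_source_stage starts the simple in-process broker

    from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline  # assumed export
    from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
        add_default_drain_stage,
        add_metadata_injector_stage,
        add_pdf_extractor_stage,
        add_source_stage,
    )

    pipeline = RayPipeline()  # assumed no-argument constructor; see ray_pipeline.py in this release
    cpu_budget = os.cpu_count() or 1  # stand-in for the SystemResourceProbe-derived budget

    # Each call registers one stage and returns its name; connecting the stages is not shown here.
    source_name = add_source_stage(pipeline, cpu_budget)
    add_metadata_injector_stage(pipeline, cpu_budget)
    add_pdf_extractor_stage(pipeline, cpu_budget)
    drain_name = add_default_drain_stage(pipeline, cpu_budget)
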
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/orchestration/ray/util/system_tools/memory.py
@@ -0,0 +1,59 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import time
+ import uuid
+ import psutil
+ import ray
+
+
+ def estimate_actor_memory_overhead(
+     actor_class, iterations=1, stabilization_threshold=1 * 1024 * 1024, wait_time=2, actor_args=None, actor_kwargs=None
+ ):
+     """
+     Estimate the additional system memory overhead when launching a Ray actor of the given actor_class.
+
+     Parameters:
+         actor_class: A Ray remote actor class.
+         iterations (int): Number of measurement iterations.
+         stabilization_threshold (int): Maximum difference (in bytes) between min and max measurements to
+             consider results stable.
+         wait_time (float): Seconds to wait after spawning or killing an actor for memory to stabilize.
+         actor_args (list): Positional arguments to pass to the actor's remote() call.
+         actor_kwargs (dict): Keyword arguments to pass to the actor's remote() call.
+
+     Returns:
+         float: Estimated average overhead in bytes for replicating the actor.
+     """
+     actor_args = actor_args if actor_args is not None else []
+     actor_kwargs = actor_kwargs if actor_kwargs is not None else {}
+
+     measurements = []
+
+     iterations = 0  # TODO
+     for i in range(iterations):
+         # Record baseline system memory usage.
+         baseline = psutil.virtual_memory().used
+
+         # Spin up a new actor with provided arguments.
+         actor = actor_class.options(name=f"mem_estimator_{uuid.uuid4()}").remote(*actor_args, **actor_kwargs)
+         # Allow time for the actor to start.
+         time.sleep(wait_time)
+
+         # Measure memory after actor has started.
+         after_spawn = psutil.virtual_memory().used
+         overhead = after_spawn - baseline
+         measurements.append(overhead)
+
+         # Kill the actor.
+         ray.kill(actor, no_restart=True)
+         # Allow time for system memory to be released.
+         time.sleep(wait_time)
+
+     if measurements:
+         _ = max(measurements) - min(measurements)
+         _ = sum(measurements) / len(measurements)
+
+     return 1_500_000_000
+     # return estimated_overhead Need to come up with a better way to estiamte actor overhead.
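
Note that, as released, the measurement loop never runs (iterations is reset to 0 just before the loop), so the function always returns the fixed estimate of 1,500,000,000 bytes, as the trailing comment acknowledges. A minimal usage sketch follows; it is editorial, not part of the diff, and assumes this hunk is system_tools/memory.py (per the file listing) and that Ray is installed.

    import ray

    from nv_ingest.framework.orchestration.ray.util.system_tools.memory import estimate_actor_memory_overhead

    ray.init(ignore_reinit_error=True)

    @ray.remote
    class EchoActor:
        """Hypothetical actor used only to exercise the estimator."""

        def ping(self):
            return "pong"

    # With the loop disabled in this release, the call still returns the fixed 1.5 GB figure.
    overhead_bytes = estimate_actor_memory_overhead(EchoActor, iterations=3, wait_time=1)
    print(f"Estimated per-replica overhead: {overhead_bytes / 1e9:.2f} GB")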