nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  9. nv_ingest/framework/orchestration/execution/options.py +112 -0
  10. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
  12. nv_ingest/framework/orchestration/process/execution.py +495 -0
  13. nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
  14. nv_ingest/framework/orchestration/process/strategies.py +218 -0
  15. nv_ingest/framework/orchestration/process/termination.py +147 -0
  16. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
  17. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  18. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
  19. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  20. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
  21. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
  22. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
  23. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
  24. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
  25. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  26. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
  28. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  29. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
  30. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
  31. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  32. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  33. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
  34. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
  35. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
  36. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  37. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  38. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
  39. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
  40. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
  41. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  42. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  43. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
  44. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
  45. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
  46. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  47. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  48. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  49. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  50. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  51. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  52. nv_ingest/pipeline/__init__.py +3 -0
  53. nv_ingest/pipeline/config/__init__.py +3 -0
  54. nv_ingest/pipeline/config/loaders.py +229 -0
  55. nv_ingest/pipeline/config/replica_resolver.py +237 -0
  56. nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
  57. nv_ingest/pipeline/default_pipeline_impl.py +557 -0
  58. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  59. nv_ingest/pipeline/pipeline_schema.py +398 -0
  60. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
  61. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
  62. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  63. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  64. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
  65. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
  66. {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,495 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Low-level pipeline execution functions.
7
+
8
+ This module contains the core pipeline execution functions that are shared
9
+ between different execution strategies, extracted to avoid circular imports.
10
+ """
11
+
12
+ import logging
13
+ import multiprocessing
14
+ import os
15
+ import signal
16
+ import sys
17
+ import time
18
+ from ctypes import CDLL
19
+ from datetime import datetime
20
+ from typing import Union, Tuple, Optional, TextIO, Any
21
+ import json
22
+
23
+ import ray
24
+ from ray import LoggingConfig
25
+
26
+ from nv_ingest.framework.orchestration.process.dependent_services import start_simple_message_broker
27
+ from nv_ingest.framework.orchestration.process.termination import (
28
+ kill_pipeline_process_group as _kill_pipeline_process_group,
29
+ )
30
+ from nv_ingest.pipeline.ingest_pipeline import IngestPipelineBuilder
31
+ from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
32
+ from nv_ingest.pipeline.config.replica_resolver import resolve_static_replicas
33
+ from nv_ingest_api.util.string_processing.configuration import pretty_print_pipeline_config
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
def _safe_log(level: int, msg: str) -> None:
    """Emit ``msg`` at ``level`` without ever raising.

    The module logger is tried first. If that call fails for any reason
    (for example because handlers or streams were already closed while the
    interpreter is shutting down), the message is written to
    ``sys.__stderr__`` instead. Errors on the fallback path are swallowed
    as well, so this function is safe to call from shutdown hooks.
    """
    try:
        logger.log(level, msg)
    except Exception:
        # Logging is unusable; fall through to the raw stderr fallback.
        pass
    else:
        return

    try:
        # Prefer the original, un-captured stderr when it is available.
        raw_stderr = getattr(sys, "__stderr__", None)
        if raw_stderr:
            raw_stderr.write(msg + "\n")
            raw_stderr.flush()
    except Exception:
        # Last resort: stay silent rather than make shutdown noisy.
        pass
58
+
59
+
60
def str_to_bool(value: str) -> bool:
    """Interpret a string as a boolean flag.

    Surrounding whitespace and letter case are ignored. Only "1", "true",
    "yes" and "on" count as true; every other string is false.
    """
    truthy_tokens = ("1", "true", "yes", "on")
    normalized = value.strip().lower()
    return normalized in truthy_tokens
63
+
64
+
65
def redirect_os_fds(stdout: Optional[TextIO] = None, stderr: Optional[TextIO] = None):
    """
    Redirect OS-level stdout (fd=1) and stderr (fd=2) to the given file-like objects,
    or to /dev/null if not provided.

    Only the process-level descriptors are changed; ``sys.stdout`` and
    ``sys.stderr`` are not reassigned here. Descriptors this function opens
    on /dev/null are closed again once they have been duplicated onto fd 1/2,
    so repeated calls do not leak file descriptors. Descriptors that belong
    to the caller's streams are never closed.

    Parameters
    ----------
    stdout : Optional[TextIO]
        Stream to receive OS-level stdout. If None, redirected to /dev/null.
    stderr : Optional[TextIO]
        Stream to receive OS-level stderr. If None, redirected to /dev/null.
    """
    # Descriptors opened here (as opposed to borrowed from the caller).
    owned_fds = []

    def _target_fd(stream: Optional[TextIO]) -> int:
        if stream:
            return stream.fileno()
        null_fd = os.open(os.devnull, os.O_WRONLY)
        owned_fds.append(null_fd)
        return null_fd

    try:
        stdout_fd = _target_fd(stdout)
        stderr_fd = _target_fd(stderr)

        # Redirect OS-level file descriptors
        os.dup2(stdout_fd, 1)  # Redirect stdout (fd=1)
        os.dup2(stderr_fd, 2)  # Redirect stderr (fd=2)
    finally:
        for fd in owned_fds:
            # If fd 1/2 were closed beforehand, os.open may have returned one
            # of them directly; closing it would undo the redirection.
            if fd > 2:
                os.close(fd)
86
+
87
+
88
def set_pdeathsig(sig=signal.SIGKILL):
    """Request that ``sig`` be delivered to this process when its parent dies.

    Loads ``libc.so.6`` and calls ``prctl`` with option 1 (PR_SET_PDEATHSIG).
    The return value of ``prctl`` is not inspected, so a rejected request is
    not reported; a missing ``libc.so.6`` surfaces as the loader's exception.
    """
    pr_set_pdeathsig = 1  # PR_SET_PDEATHSIG
    c_library = CDLL("libc.so.6")
    c_library.prctl(pr_set_pdeathsig, sig)
92
+
93
+
94
def build_logging_config_from_env() -> LoggingConfig:
    """
    Build Ray LoggingConfig from environment variables.

    A package-level preset supplies defaults for every variable below; a
    variable that is already present in the environment always wins over the
    preset.

    Package-level preset:
    - INGEST_RAY_LOG_LEVEL: PRODUCTION, DEVELOPMENT, DEBUG. Default: DEVELOPMENT.
      An unrecognised value is reported and replaced by DEVELOPMENT.

    Individual environment variables (defaults shown are the DEVELOPMENT preset):
    - RAY_LOGGING_LEVEL: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL). Default: INFO
    - RAY_LOGGING_ENCODING: Log encoding format (TEXT, JSON). Default: TEXT
    - RAY_LOGGING_ADDITIONAL_ATTRS: Comma-separated list of additional standard logger attributes
    - RAY_DEDUP_LOGS: Enable/disable log deduplication (0/1). Default: 1 (enabled)
    - RAY_LOG_TO_DRIVER: Enable/disable logging to driver. Default: 0 (disabled) in every preset
    - RAY_LOGGING_ROTATE_BYTES: Maximum log file size before rotation (bytes). Default: 1GB
    - RAY_LOGGING_ROTATE_BACKUP_COUNT: Number of backup log files to keep. Default: 19
    - RAY_DISABLE_IMPORT_WARNING: Disable Ray import warnings (0/1). Default: 0
    - RAY_USAGE_STATS_ENABLED: Enable/disable usage stats collection (0/1). Default: 1

    Side effects
    ------------
    Writes the resolved values back into ``os.environ`` (preset defaults for
    unset variables, and normalised integers for the two rotation settings).

    Returns
    -------
    LoggingConfig
        Built from the validated level, encoding and additional attributes.
        Invalid level/encoding values fall back to INFO/TEXT with a warning.
    """
    # Define preset configurations
    presets = {
        "PRODUCTION": {
            "RAY_LOGGING_LEVEL": "ERROR",
            "RAY_LOGGING_ENCODING": "TEXT",
            "RAY_LOGGING_ADDITIONAL_ATTRS": "",
            "RAY_DEDUP_LOGS": "1",
            "RAY_LOG_TO_DRIVER": "0",  # false
            "RAY_LOGGING_ROTATE_BYTES": "1073741824",  # 1GB
            "RAY_LOGGING_ROTATE_BACKUP_COUNT": "9",  # 10GB total
            "RAY_DISABLE_IMPORT_WARNING": "1",
            "RAY_USAGE_STATS_ENABLED": "0",
        },
        "DEVELOPMENT": {
            "RAY_LOGGING_LEVEL": "INFO",
            "RAY_LOGGING_ENCODING": "TEXT",
            "RAY_LOGGING_ADDITIONAL_ATTRS": "",
            "RAY_DEDUP_LOGS": "1",
            "RAY_LOG_TO_DRIVER": "0",  # false
            "RAY_LOGGING_ROTATE_BYTES": "1073741824",  # 1GB
            "RAY_LOGGING_ROTATE_BACKUP_COUNT": "19",  # 20GB total
            "RAY_DISABLE_IMPORT_WARNING": "0",
            "RAY_USAGE_STATS_ENABLED": "1",
        },
        "DEBUG": {
            "RAY_LOGGING_LEVEL": "DEBUG",
            "RAY_LOGGING_ENCODING": "JSON",
            "RAY_LOGGING_ADDITIONAL_ATTRS": "name,funcName,lineno",
            "RAY_DEDUP_LOGS": "0",
            "RAY_LOG_TO_DRIVER": "0",  # false
            "RAY_LOGGING_ROTATE_BYTES": "536870912",  # 512MB
            "RAY_LOGGING_ROTATE_BACKUP_COUNT": "39",  # 20GB total
            "RAY_DISABLE_IMPORT_WARNING": "0",
            "RAY_USAGE_STATS_ENABLED": "1",
        },
    }

    # Resolve and validate the preset name
    preset_level = os.environ.get("INGEST_RAY_LOG_LEVEL", "DEVELOPMENT").upper()
    if preset_level not in presets:
        logger.warning(
            f"Invalid INGEST_RAY_LOG_LEVEL '{preset_level}', using DEVELOPMENT. "
            f"Valid presets: {list(presets)}"
        )
        preset_level = "DEVELOPMENT"

    # Apply preset defaults; explicitly set environment variables are kept.
    for key, default_value in presets[preset_level].items():
        os.environ.setdefault(key, default_value)

    logger.info(f"Applied Ray logging preset: {preset_level}")

    # Log level: validated against the names accepted here, else INFO.
    log_level = os.environ.get("RAY_LOGGING_LEVEL", "INFO").upper()
    valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    if log_level not in valid_levels:
        logger.warning(f"Invalid RAY_LOGGING_LEVEL '{log_level}', using INFO. Valid levels: {valid_levels}")
        log_level = "INFO"

    # Encoding: TEXT or JSON, else TEXT.
    encoding = os.environ.get("RAY_LOGGING_ENCODING", "TEXT").upper()
    valid_encodings = ["TEXT", "JSON"]
    if encoding not in valid_encodings:
        logger.warning(f"Invalid RAY_LOGGING_ENCODING '{encoding}', using TEXT. Valid encodings: {valid_encodings}")
        encoding = "TEXT"

    # Additional standard logger attributes: comma-separated, blanks dropped.
    additional_attrs_str = os.environ.get("RAY_LOGGING_ADDITIONAL_ATTRS", "")
    additional_log_standard_attrs = [attr.strip() for attr in additional_attrs_str.split(",") if attr.strip()]

    # Pass-through settings: make sure each is present in the environment
    # (they already are when the preset table covers them).
    for env_name, fallback in (
        ("RAY_DEDUP_LOGS", "1"),
        ("RAY_LOG_TO_DRIVER", "0"),
        ("RAY_DISABLE_IMPORT_WARNING", "0"),
        ("RAY_USAGE_STATS_ENABLED", "1"),
    ):
        os.environ.setdefault(env_name, fallback)

    # Rotation settings must parse as integers; otherwise reset to the default.
    for env_name, fallback, fallback_label in (
        ("RAY_LOGGING_ROTATE_BYTES", "1073741824", "1GB"),  # 1GB per file
        ("RAY_LOGGING_ROTATE_BACKUP_COUNT", "19", "19"),  # 19 backups (20GB max)
    ):
        raw_value = os.environ.get(env_name, fallback)
        try:
            os.environ[env_name] = str(int(raw_value))
        except ValueError:
            logger.warning(f"Invalid {env_name} '{raw_value}', using default ({fallback_label})")
            os.environ[env_name] = fallback

    # Create LoggingConfig with validated parameters
    logging_config = LoggingConfig(
        encoding=encoding,
        log_level=log_level,
        additional_log_standard_attrs=additional_log_standard_attrs,
    )

    logger.info(
        f"Ray logging configured: preset={preset_level}, level={log_level}, encoding={encoding}, "
        f"additional_attrs={additional_log_standard_attrs}, "
        f"dedup_logs={os.environ.get('RAY_DEDUP_LOGS', '1')}, "
        f"log_to_driver={os.environ.get('RAY_LOG_TO_DRIVER', '0')}, "
        f"rotate_bytes={os.environ.get('RAY_LOGGING_ROTATE_BYTES', '1073741824')}, "
        f"rotate_backup_count={os.environ.get('RAY_LOGGING_ROTATE_BACKUP_COUNT', '19')}"
    )

    return logging_config
247
+
248
+
249
def launch_pipeline(
    pipeline_config: PipelineConfigSchema,
    block: bool = True,
    disable_dynamic_scaling: Optional[bool] = None,
    dynamic_memory_threshold: Optional[float] = None,
) -> Tuple[Union[Any, None], Optional[float]]:
    """
    Launch a pipeline using the provided configuration.

    Initializes Ray when it is not already initialized, resolves static
    replicas, builds and starts the pipeline, and then either blocks until
    interrupted or hands the running pipeline back to the caller.

    Parameters
    ----------
    pipeline_config : PipelineConfigSchema
        Validated pipeline configuration to execute. Its
        ``pipeline.disable_dynamic_scaling`` field is set to True in place
        when the override below requests it.
    block : bool, optional
        Whether to block until the pipeline is interrupted, by default True.
    disable_dynamic_scaling : Optional[bool], optional
        When truthy, forces dynamic scaling off in ``pipeline_config``.
        A falsy value leaves the configuration untouched. By default None.
    dynamic_memory_threshold : Optional[float], optional
        Accepted for interface compatibility; this function does not
        currently read it. By default None.

    Returns
    -------
    Tuple[Union[Any, None], Optional[float]]
        Raw pipeline object (type elided to avoid circular import) and elapsed time. For blocking execution,
        returns (None, elapsed_time) after a KeyboardInterrupt-driven shutdown. For non-blocking,
        returns (pipeline, None).

    Raises
    ------
    Exception
        Any error raised while building or starting the pipeline is re-raised
        after the pipeline is stopped (best effort) and Ray is shut down.
    """
    logger.info("Starting pipeline setup")

    # Initialize Ray if not already initialized
    if not ray.is_initialized():
        # Build Ray logging configuration
        logging_config = build_logging_config_from_env()

        # Clear existing handlers from root logger before Ray adds its handler
        # This prevents duplicate logging caused by multiple handlers on the root logger
        root_logger = logging.getLogger()
        for handler in list(root_logger.handlers):
            root_logger.removeHandler(handler)
        logger.info("Cleared existing root logger handlers to prevent Ray logging duplicates")

        # NOTE(review): the dashboard binds to all interfaces and the spill
        # directories are fixed /tmp paths; confirm both are intended outside
        # of test environments.
        ray.init(
            namespace="nv_ingest_ray",
            ignore_reinit_error=True,
            dashboard_host="0.0.0.0",
            dashboard_port=8265,
            logging_config=logging_config,  # Ray will add its own StreamHandler
            _system_config={
                "local_fs_capacity_threshold": 0.9,
                "object_spilling_config": json.dumps(
                    {
                        "type": "filesystem",
                        "params": {
                            "directory_path": [
                                "/tmp/ray_spill_testing_0",
                                "/tmp/ray_spill_testing_1",
                                "/tmp/ray_spill_testing_2",
                                "/tmp/ray_spill_testing_3",
                            ],
                            "buffer_size": 100_000_000,
                        },
                    },
                ),
            },
        )

    # Handle disable_dynamic_scaling parameter override
    if disable_dynamic_scaling and not pipeline_config.pipeline.disable_dynamic_scaling:
        # Directly modify the pipeline config to disable dynamic scaling
        pipeline_config.pipeline.disable_dynamic_scaling = True
        logger.info("Dynamic scaling disabled via function parameter override")

    # Resolve static replicas
    pipeline_config = resolve_static_replicas(pipeline_config)

    # Pretty print the final pipeline configuration (after replica resolution)
    pretty_output = pretty_print_pipeline_config(pipeline_config, config_path=None)
    logger.info("\n" + pretty_output)

    # Set up the ingestion pipeline
    start_abs = datetime.now()
    ingest_pipeline = None
    try:
        ingest_pipeline = IngestPipelineBuilder(pipeline_config)
        ingest_pipeline.build()

        # Record setup time
        end_setup = start_run = datetime.now()
        setup_time = (end_setup - start_abs).total_seconds()
        logger.info(f"Pipeline setup complete in {setup_time:.2f} seconds")

        # Run the pipeline
        logger.debug("Running pipeline")
        ingest_pipeline.start()
    except Exception as e:
        # Ensure any partial startup is torn down
        logger.error(f"Pipeline startup failed, initiating cleanup: {e}", exc_info=True)
        if ingest_pipeline is not None:
            try:
                ingest_pipeline.stop()
            except Exception as stop_error:
                # Best effort: the startup error is the one worth surfacing,
                # but leave a trace of the secondary failure.
                logger.warning(f"Pipeline stop failed during startup cleanup: {stop_error}")
        if ray.is_initialized():
            ray.shutdown()
            logger.info("Ray shutdown complete after startup failure.")
        # Re-raise to surface failure to caller
        raise

    if not block:
        # Non-blocking - return the pipeline interface
        # Access the internal RayPipeline from IngestPipelineBuilder
        return ingest_pipeline._pipeline, None

    try:
        # Block indefinitely until a KeyboardInterrupt is received
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        logger.info("Interrupt received, shutting down pipeline.")
        try:
            ingest_pipeline.stop()
        finally:
            # Shut Ray down even when stopping the pipeline raises.
            ray.shutdown()
        logger.info("Ray shutdown complete.")
    except Exception as e:
        logger.error(f"Unexpected error during pipeline run: {e}", exc_info=True)
        try:
            ingest_pipeline.stop()
        finally:
            if ray.is_initialized():
                ray.shutdown()
        raise

    # Record execution times
    end_run = datetime.now()
    run_time = (end_run - start_run).total_seconds()
    total_elapsed = (end_run - start_abs).total_seconds()

    logger.info(f"Pipeline execution time: {run_time:.2f} seconds")
    logger.info(f"Total time elapsed: {total_elapsed:.2f} seconds")

    return None, total_elapsed
396
+
397
+
398
def run_pipeline_process(
    pipeline_config: PipelineConfigSchema,
    stdout: Optional[TextIO] = None,
    stderr: Optional[TextIO] = None,
) -> None:
    """
    Entry point for running a pipeline in a subprocess.

    This function is designed to be the target of a multiprocessing.Process,
    handling output redirection and process group management. In order, it:
    rebinds ``sys.stdout``/``sys.stderr`` when streams are given, requests a
    parent-death SIGKILL, moves the process into its own process group,
    installs SIGINT/SIGTERM handlers that shut Ray down and exit, optionally
    starts the simple message broker, and finally runs the pipeline in
    blocking mode.

    Parameters
    ----------
    pipeline_config : PipelineConfigSchema
        Pipeline configuration object.
    stdout : Optional[TextIO], optional
        Output stream for subprocess stdout, by default None.
    stderr : Optional[TextIO], optional
        Error stream for subprocess stderr, by default None.

    Raises
    ------
    Exception
        Any error raised by the pipeline launch is logged with its traceback
        and re-raised.
    """
    # Set up output redirection (Python-level streams only)
    if stdout:
        sys.stdout = stdout
    if stderr:
        sys.stderr = stderr

    # Ensure the subprocess is killed if the parent dies to avoid hangs
    try:
        set_pdeathsig(signal.SIGKILL)
    except Exception as e:
        logger.debug(f"set_pdeathsig not available or failed: {e}")

    # Create a new process group so we can terminate the entire subtree cleanly
    try:
        os.setpgrp()
    except Exception as e:
        logger.debug(f"os.setpgrp() not available or failed: {e}")

    # Install signal handlers for graceful shutdown in the subprocess
    def _handle_signal(signum, frame):
        try:
            _safe_log(logging.INFO, f"Received signal {signum}; shutting down Ray and exiting...")
            if ray.is_initialized():
                ray.shutdown()
        finally:
            # Exit immediately after best-effort cleanup; os._exit skips
            # interpreter teardown, so the outer ``finally`` does not run.
            os._exit(0)

    try:
        signal.signal(signal.SIGINT, _handle_signal)
        signal.signal(signal.SIGTERM, _handle_signal)
    except Exception as e:
        logger.debug(f"Signal handlers not set: {e}")

    # Test output redirection
    # NOTE(review): these probes are emitted on every run; confirm whether a
    # parent process depends on them before removing or demoting them.
    print("DEBUG: Direct print to stdout - should appear in parent process")
    sys.stderr.write("DEBUG: Direct write to stderr - should appear in parent process\n")

    # Test logging output
    logger.info("DEBUG: Logger info - may not appear if logging handlers not redirected")

    # If requested, start the simple broker inside this subprocess so it shares the process group
    broker_proc = None
    try:
        if os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1":
            try:
                # Only launch if the config requests it
                pipeline_section = getattr(pipeline_config, "pipeline", None)
                if pipeline_section and getattr(pipeline_section, "launch_simple_broker", False):
                    _safe_log(logging.INFO, "Starting SimpleMessageBroker inside subprocess")
                    broker_proc = start_simple_message_broker({})
            except Exception as e:
                _safe_log(logging.ERROR, f"Failed to start SimpleMessageBroker in subprocess: {e}")
                # Continue without broker; launch will fail fast if required

        # Launch the pipeline (blocking)
        launch_pipeline(pipeline_config, block=True)

    except Exception as e:
        # Include the traceback: failures raised before launch_pipeline's own
        # error handling (e.g. during Ray initialization) are logged nowhere else.
        logger.error(f"Subprocess pipeline execution failed: {e}", exc_info=True)
        raise
    finally:
        # Best-effort: if we created a broker here and the pipeline exits normally,
        # attempt a graceful terminate. In failure/termination paths the process group kill
        # from parent or signal handler will take care of it.
        if broker_proc is not None:
            try:
                if hasattr(broker_proc, "is_alive") and broker_proc.is_alive():
                    broker_proc.terminate()
            except Exception as terminate_error:
                # Still best-effort, but leave a trace instead of failing silently.
                _safe_log(logging.DEBUG, f"SimpleMessageBroker terminate failed: {terminate_error}")
490
+
491
+
492
def kill_pipeline_process_group(process: multiprocessing.Process) -> None:
    """Backward-compatible entry point for killing a pipeline process group.

    Emits a debug-level note through ``_safe_log`` and then hands ``process``
    to the ``kill_pipeline_process_group`` implementation imported from the
    ``process.termination`` module. Nothing is returned.
    """
    delegation_note = "Delegating kill_pipeline_process_group to process.termination module"
    _safe_log(logging.DEBUG, delegation_note)
    _kill_pipeline_process_group(process)