nv-ingest 2025.8.18.dev20250818__py3-none-any.whl → 2025.8.20.dev20250820__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/framework/orchestration/process/dependent_services.py +32 -10
- nv_ingest/framework/orchestration/process/execution.py +92 -94
- nv_ingest/framework/orchestration/process/lifecycle.py +98 -6
- nv_ingest/framework/orchestration/process/strategies.py +36 -4
- nv_ingest/framework/orchestration/process/termination.py +107 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +9 -15
- nv_ingest/pipeline/config/loaders.py +33 -2
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +514 -0
- nv_ingest/pipeline/default_pipeline_impl.py +54 -56
- {nv_ingest-2025.8.18.dev20250818.dist-info → nv_ingest-2025.8.20.dev20250820.dist-info}/METADATA +1 -1
- {nv_ingest-2025.8.18.dev20250818.dist-info → nv_ingest-2025.8.20.dev20250820.dist-info}/RECORD +14 -12
- {nv_ingest-2025.8.18.dev20250818.dist-info → nv_ingest-2025.8.20.dev20250820.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.18.dev20250818.dist-info → nv_ingest-2025.8.20.dev20250820.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.18.dev20250818.dist-info → nv_ingest-2025.8.20.dev20250820.dist-info}/top_level.txt +0 -0
|
@@ -10,6 +10,7 @@ that the pipeline requires, such as message brokers and other infrastructure.
|
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
12
|
import logging
|
|
13
|
+
import os
|
|
13
14
|
import multiprocessing
|
|
14
15
|
import socket
|
|
15
16
|
from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
|
|
@@ -35,21 +36,42 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
|
|
|
35
36
|
The process running the SimpleMessageBroker server.
|
|
36
37
|
"""
|
|
37
38
|
|
|
39
|
+
# Resolve host/port early for pre-flight checks
|
|
40
|
+
broker_params = broker_client.get("broker_params", {})
|
|
41
|
+
max_queue_size = broker_params.get("max_queue_size", 10000)
|
|
42
|
+
server_host = broker_client.get("host", "0.0.0.0")
|
|
43
|
+
server_port = broker_client.get("port", 7671)
|
|
44
|
+
|
|
45
|
+
# Pre-flight: if something is already listening on the target port, do not spawn another broker.
|
|
46
|
+
# This avoids noisy stack traces from a failing child process when tests/pipeline are run repeatedly.
|
|
47
|
+
def _is_port_open(host: str, port: int) -> bool:
|
|
48
|
+
check_host = "127.0.0.1" if host in ("0.0.0.0", "::") else host
|
|
49
|
+
try:
|
|
50
|
+
with socket.create_connection((check_host, port), timeout=0.5):
|
|
51
|
+
return True
|
|
52
|
+
except Exception:
|
|
53
|
+
return False
|
|
54
|
+
|
|
55
|
+
if _is_port_open(server_host, server_port):
|
|
56
|
+
logger.warning(
|
|
57
|
+
f"SimpleMessageBroker port already in use at {server_host}:{server_port}; "
|
|
58
|
+
f"continuing to spawn a broker process (tests expect a Process to be returned)"
|
|
59
|
+
)
|
|
60
|
+
|
|
38
61
|
def broker_server():
|
|
39
|
-
#
|
|
40
|
-
broker_params = broker_client.get("broker_params", {})
|
|
41
|
-
max_queue_size = broker_params.get("max_queue_size", 10000)
|
|
42
|
-
server_host = broker_client.get("host", "0.0.0.0")
|
|
43
|
-
server_port = broker_client.get("port", 7671)
|
|
44
|
-
# Optionally, set socket options here for reuse.
|
|
62
|
+
# Optionally, set socket options here for reuse (note: binding occurs in server __init__).
|
|
45
63
|
server = SimpleMessageBroker(server_host, server_port, max_queue_size)
|
|
46
|
-
|
|
47
|
-
|
|
64
|
+
try:
|
|
65
|
+
server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
|
66
|
+
except Exception:
|
|
67
|
+
pass
|
|
48
68
|
server.serve_forever()
|
|
49
69
|
|
|
50
70
|
p = multiprocessing.Process(target=broker_server)
|
|
51
|
-
|
|
71
|
+
# If we're launching from inside the pipeline subprocess, mark daemon so the
|
|
72
|
+
# broker dies automatically when the subprocess exits.
|
|
73
|
+
p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
|
|
52
74
|
p.start()
|
|
53
|
-
logger.info(f"Started SimpleMessageBroker server in separate process on port {
|
|
75
|
+
logger.info(f"Started SimpleMessageBroker server in separate process on port {server_port}")
|
|
54
76
|
|
|
55
77
|
return p
|
|
@@ -17,14 +17,15 @@ import sys
|
|
|
17
17
|
import time
|
|
18
18
|
from ctypes import CDLL
|
|
19
19
|
from datetime import datetime
|
|
20
|
-
from typing import Union, Tuple, Optional, TextIO
|
|
20
|
+
from typing import Union, Tuple, Optional, TextIO, Any
|
|
21
21
|
import json
|
|
22
22
|
|
|
23
23
|
import ray
|
|
24
24
|
from ray import LoggingConfig
|
|
25
25
|
|
|
26
|
-
from nv_ingest.framework.orchestration.
|
|
27
|
-
|
|
26
|
+
from nv_ingest.framework.orchestration.process.dependent_services import start_simple_message_broker
|
|
27
|
+
from nv_ingest.framework.orchestration.process.termination import (
|
|
28
|
+
kill_pipeline_process_group as _kill_pipeline_process_group,
|
|
28
29
|
)
|
|
29
30
|
from nv_ingest.pipeline.ingest_pipeline import IngestPipelineBuilder
|
|
30
31
|
from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
|
|
@@ -250,7 +251,7 @@ def launch_pipeline(
|
|
|
250
251
|
block: bool = True,
|
|
251
252
|
disable_dynamic_scaling: Optional[bool] = None,
|
|
252
253
|
dynamic_memory_threshold: Optional[float] = None,
|
|
253
|
-
) -> Tuple[Union[
|
|
254
|
+
) -> Tuple[Union[Any, None], Optional[float]]:
|
|
254
255
|
"""
|
|
255
256
|
Launch a pipeline using the provided configuration.
|
|
256
257
|
|
|
@@ -270,8 +271,8 @@ def launch_pipeline(
|
|
|
270
271
|
|
|
271
272
|
Returns
|
|
272
273
|
-------
|
|
273
|
-
Tuple[Union[
|
|
274
|
-
Raw
|
|
274
|
+
Tuple[Union[Any, None], Optional[float]]
|
|
275
|
+
Raw pipeline object (type elided to avoid circular import) and elapsed time. For blocking execution,
|
|
275
276
|
returns (None, elapsed_time). For non-blocking, returns (pipeline, None).
|
|
276
277
|
"""
|
|
277
278
|
logger.info("Starting pipeline setup")
|
|
@@ -328,17 +329,37 @@ def launch_pipeline(
|
|
|
328
329
|
|
|
329
330
|
# Set up the ingestion pipeline
|
|
330
331
|
start_abs = datetime.now()
|
|
331
|
-
ingest_pipeline =
|
|
332
|
-
|
|
332
|
+
ingest_pipeline = None
|
|
333
|
+
try:
|
|
334
|
+
ingest_pipeline = IngestPipelineBuilder(pipeline_config)
|
|
335
|
+
ingest_pipeline.build()
|
|
333
336
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
337
|
+
# Record setup time
|
|
338
|
+
end_setup = start_run = datetime.now()
|
|
339
|
+
setup_time = (end_setup - start_abs).total_seconds()
|
|
340
|
+
logger.info(f"Pipeline setup complete in {setup_time:.2f} seconds")
|
|
338
341
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
+
# Run the pipeline
|
|
343
|
+
logger.debug("Running pipeline")
|
|
344
|
+
ingest_pipeline.start()
|
|
345
|
+
except Exception as e:
|
|
346
|
+
# Ensure any partial startup is torn down
|
|
347
|
+
logger.error(f"Pipeline startup failed, initiating cleanup: {e}", exc_info=True)
|
|
348
|
+
try:
|
|
349
|
+
if ingest_pipeline is not None:
|
|
350
|
+
try:
|
|
351
|
+
ingest_pipeline.stop()
|
|
352
|
+
except Exception:
|
|
353
|
+
pass
|
|
354
|
+
finally:
|
|
355
|
+
try:
|
|
356
|
+
if ray.is_initialized():
|
|
357
|
+
ray.shutdown()
|
|
358
|
+
logger.info("Ray shutdown complete after startup failure.")
|
|
359
|
+
finally:
|
|
360
|
+
pass
|
|
361
|
+
# Re-raise to surface failure to caller
|
|
362
|
+
raise
|
|
342
363
|
|
|
343
364
|
if block:
|
|
344
365
|
try:
|
|
@@ -350,6 +371,14 @@ def launch_pipeline(
|
|
|
350
371
|
ingest_pipeline.stop()
|
|
351
372
|
ray.shutdown()
|
|
352
373
|
logger.info("Ray shutdown complete.")
|
|
374
|
+
except Exception as e:
|
|
375
|
+
logger.error(f"Unexpected error during pipeline run: {e}", exc_info=True)
|
|
376
|
+
try:
|
|
377
|
+
ingest_pipeline.stop()
|
|
378
|
+
finally:
|
|
379
|
+
if ray.is_initialized():
|
|
380
|
+
ray.shutdown()
|
|
381
|
+
raise
|
|
353
382
|
|
|
354
383
|
# Record execution times
|
|
355
384
|
end_run = datetime.now()
|
|
@@ -392,12 +421,34 @@ def run_pipeline_process(
|
|
|
392
421
|
if stderr:
|
|
393
422
|
sys.stderr = stderr
|
|
394
423
|
|
|
424
|
+
# Ensure the subprocess is killed if the parent dies to avoid hangs
|
|
425
|
+
try:
|
|
426
|
+
set_pdeathsig(signal.SIGKILL)
|
|
427
|
+
except Exception as e:
|
|
428
|
+
logger.debug(f"set_pdeathsig not available or failed: {e}")
|
|
429
|
+
|
|
395
430
|
# Create a new process group so we can terminate the entire subtree cleanly
|
|
396
431
|
try:
|
|
397
432
|
os.setpgrp()
|
|
398
433
|
except Exception as e:
|
|
399
434
|
logger.debug(f"os.setpgrp() not available or failed: {e}")
|
|
400
435
|
|
|
436
|
+
# Install signal handlers for graceful shutdown in the subprocess
|
|
437
|
+
def _handle_signal(signum, frame):
|
|
438
|
+
try:
|
|
439
|
+
_safe_log(logging.INFO, f"Received signal {signum}; shutting down Ray and exiting...")
|
|
440
|
+
if ray.is_initialized():
|
|
441
|
+
ray.shutdown()
|
|
442
|
+
finally:
|
|
443
|
+
# Exit immediately after best-effort cleanup
|
|
444
|
+
os._exit(0)
|
|
445
|
+
|
|
446
|
+
try:
|
|
447
|
+
signal.signal(signal.SIGINT, _handle_signal)
|
|
448
|
+
signal.signal(signal.SIGTERM, _handle_signal)
|
|
449
|
+
except Exception as e:
|
|
450
|
+
logger.debug(f"Signal handlers not set: {e}")
|
|
451
|
+
|
|
401
452
|
# Test output redirection
|
|
402
453
|
print("DEBUG: Direct print to stdout - should appear in parent process")
|
|
403
454
|
sys.stderr.write("DEBUG: Direct write to stderr - should appear in parent process\n")
|
|
@@ -405,93 +456,40 @@ def run_pipeline_process(
|
|
|
405
456
|
# Test logging output
|
|
406
457
|
logger.info("DEBUG: Logger info - may not appear if logging handlers not redirected")
|
|
407
458
|
|
|
459
|
+
# If requested, start the simple broker inside this subprocess so it shares the process group
|
|
460
|
+
broker_proc = None
|
|
408
461
|
try:
|
|
462
|
+
if os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1":
|
|
463
|
+
try:
|
|
464
|
+
# Only launch if the config requests it
|
|
465
|
+
if getattr(pipeline_config, "pipeline", None) and getattr(
|
|
466
|
+
pipeline_config.pipeline, "launch_simple_broker", False
|
|
467
|
+
):
|
|
468
|
+
_safe_log(logging.INFO, "Starting SimpleMessageBroker inside subprocess")
|
|
469
|
+
broker_proc = start_simple_message_broker({})
|
|
470
|
+
except Exception as e:
|
|
471
|
+
_safe_log(logging.ERROR, f"Failed to start SimpleMessageBroker in subprocess: {e}")
|
|
472
|
+
# Continue without broker; launch will fail fast if required
|
|
473
|
+
|
|
409
474
|
# Launch the pipeline (blocking)
|
|
410
475
|
launch_pipeline(pipeline_config, block=True)
|
|
411
476
|
|
|
412
477
|
except Exception as e:
|
|
413
478
|
logger.error(f"Subprocess pipeline execution failed: {e}")
|
|
414
479
|
raise
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
Note: Although the type annotation specifies a multiprocessing.Process for
|
|
422
|
-
compatibility with existing tests and public API, this function is robust
|
|
423
|
-
to also being passed a raw PID (int) at runtime.
|
|
424
|
-
|
|
425
|
-
Behavior:
|
|
426
|
-
- Send SIGTERM to the process group; if still alive after grace period, escalate to SIGKILL.
|
|
427
|
-
- If a Process object is provided, attempt to join() with timeouts.
|
|
428
|
-
- If only a PID is provided, skip joins and just signal the process group with grace/force.
|
|
429
|
-
|
|
430
|
-
Parameters
|
|
431
|
-
----------
|
|
432
|
-
process : multiprocessing.Process
|
|
433
|
-
Process handle (or a raw PID int) for the process whose process group should be terminated.
|
|
434
|
-
"""
|
|
435
|
-
# Resolve PID and optional Process handle
|
|
436
|
-
proc: Optional[object] = None
|
|
437
|
-
pid: Optional[int] = None
|
|
438
|
-
|
|
439
|
-
if isinstance(process, int):
|
|
440
|
-
pid = process
|
|
441
|
-
elif hasattr(process, "pid"):
|
|
442
|
-
# Duck-type any object that exposes a pid (e.g., multiprocessing.Process or Mock)
|
|
443
|
-
proc = process
|
|
444
|
-
try:
|
|
445
|
-
pid = int(getattr(proc, "pid"))
|
|
446
|
-
except Exception as e:
|
|
447
|
-
raise AttributeError(f"Invalid process-like object without usable pid: {e}")
|
|
448
|
-
else:
|
|
449
|
-
raise AttributeError(
|
|
450
|
-
"kill_pipeline_process_group expects a multiprocessing.Process or a PID int (process-like object with .pid)"
|
|
451
|
-
)
|
|
452
|
-
|
|
453
|
-
# If we have a Process handle and it's already dead, nothing to do
|
|
454
|
-
if proc is not None and hasattr(proc, "is_alive") and not proc.is_alive():
|
|
455
|
-
_safe_log(logging.DEBUG, "Process already terminated")
|
|
456
|
-
return
|
|
457
|
-
|
|
458
|
-
if pid is None:
|
|
459
|
-
# Defensive guard; should not happen
|
|
460
|
-
raise AttributeError("Unable to determine PID for process group termination")
|
|
461
|
-
|
|
462
|
-
_safe_log(logging.INFO, f"Terminating pipeline process group (PID: {pid})")
|
|
463
|
-
try:
|
|
464
|
-
# Send graceful termination to the entire process group
|
|
465
|
-
os.killpg(os.getpgid(pid), signal.SIGTERM)
|
|
466
|
-
|
|
467
|
-
# If we have a Process handle, give it a chance to exit cleanly
|
|
468
|
-
if proc is not None and hasattr(proc, "join"):
|
|
480
|
+
finally:
|
|
481
|
+
# Best-effort: if we created a broker here and the pipeline exits normally,
|
|
482
|
+
# attempt a graceful terminate. In failure/termination paths the process group kill
|
|
483
|
+
# from parent or signal handler will take care of it.
|
|
484
|
+
if broker_proc is not None:
|
|
469
485
|
try:
|
|
470
|
-
|
|
486
|
+
if hasattr(broker_proc, "is_alive") and broker_proc.is_alive():
|
|
487
|
+
broker_proc.terminate()
|
|
471
488
|
except Exception:
|
|
472
489
|
pass
|
|
473
|
-
still_alive = getattr(proc, "is_alive", lambda: True)()
|
|
474
|
-
else:
|
|
475
|
-
# Without a handle, provide a small grace period
|
|
476
|
-
time.sleep(2.0)
|
|
477
|
-
# Best-effort check: if getpgid fails, it's gone
|
|
478
|
-
try:
|
|
479
|
-
_ = os.getpgid(pid)
|
|
480
|
-
still_alive = True
|
|
481
|
-
except Exception:
|
|
482
|
-
still_alive = False
|
|
483
490
|
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
if proc is not None and hasattr(proc, "join"):
|
|
490
|
-
try:
|
|
491
|
-
proc.join(timeout=3.0)
|
|
492
|
-
except Exception:
|
|
493
|
-
pass
|
|
494
|
-
|
|
495
|
-
except (ProcessLookupError, OSError) as e:
|
|
496
|
-
# Process or group may already be gone
|
|
497
|
-
_safe_log(logging.DEBUG, f"Process group already terminated or not found: {e}")
|
|
491
|
+
|
|
492
|
+
def kill_pipeline_process_group(process: multiprocessing.Process) -> None:
    """Backward-compatible shim that delegates to process.termination implementation.

    The argument is forwarded unchanged to ``kill_pipeline_process_group`` from
    ``nv_ingest.framework.orchestration.process.termination`` (imported in this
    module under the private alias ``_kill_pipeline_process_group``).

    Parameters
    ----------
    process : multiprocessing.Process
        Handle passed through as-is to the delegated implementation.
    """
    # Emit a debug breadcrumb so delegated calls can be traced back to this shim.
    _safe_log(logging.DEBUG, "Delegating kill_pipeline_process_group to process.termination module")
    _kill_pipeline_process_group(process)
|
|
@@ -11,11 +11,16 @@ using the configured strategy pattern.
|
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
import logging
|
|
14
|
+
import atexit
|
|
15
|
+
import multiprocessing
|
|
16
|
+
import os
|
|
17
|
+
import signal
|
|
14
18
|
from typing import Optional
|
|
15
19
|
|
|
16
20
|
from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
|
|
17
21
|
from nv_ingest.framework.orchestration.execution.options import ExecutionOptions, ExecutionResult
|
|
18
22
|
from nv_ingest.framework.orchestration.process.strategies import ProcessExecutionStrategy
|
|
23
|
+
from nv_ingest.framework.orchestration.process.strategies import SubprocessStrategy
|
|
19
24
|
from nv_ingest.framework.orchestration.process.dependent_services import start_simple_message_broker
|
|
20
25
|
|
|
21
26
|
logger = logging.getLogger(__name__)
|
|
@@ -45,6 +50,8 @@ class PipelineLifecycleManager:
|
|
|
45
50
|
The strategy to use for pipeline execution.
|
|
46
51
|
"""
|
|
47
52
|
self.strategy = strategy
|
|
53
|
+
# Track broker process so we can terminate it during teardown
|
|
54
|
+
self._broker_process: Optional[multiprocessing.Process] = None
|
|
48
55
|
|
|
49
56
|
def start(self, config: PipelineConfigSchema, options: ExecutionOptions) -> ExecutionResult:
|
|
50
57
|
"""
|
|
@@ -74,8 +81,18 @@ class PipelineLifecycleManager:
|
|
|
74
81
|
"""
|
|
75
82
|
logger.info("Starting pipeline lifecycle")
|
|
76
83
|
|
|
84
|
+
# If running pipeline in a subprocess and broker is enabled, ensure the broker
|
|
85
|
+
# is launched in the child process group by signaling via environment variable
|
|
86
|
+
prev_env = None
|
|
87
|
+
set_env = False
|
|
88
|
+
if getattr(config, "pipeline", None) and getattr(config.pipeline, "launch_simple_broker", False):
|
|
89
|
+
if isinstance(self.strategy, SubprocessStrategy):
|
|
90
|
+
prev_env = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS")
|
|
91
|
+
os.environ["NV_INGEST_BROKER_IN_SUBPROCESS"] = "1"
|
|
92
|
+
set_env = True
|
|
93
|
+
|
|
77
94
|
try:
|
|
78
|
-
# Start message broker if configured
|
|
95
|
+
# Start message broker if configured (may defer to subprocess based on env)
|
|
79
96
|
self._setup_message_broker(config)
|
|
80
97
|
|
|
81
98
|
# Execute pipeline using the configured strategy
|
|
@@ -87,6 +104,15 @@ class PipelineLifecycleManager:
|
|
|
87
104
|
except Exception as e:
|
|
88
105
|
logger.error(f"Failed to start pipeline lifecycle: {e}")
|
|
89
106
|
raise RuntimeError(f"Pipeline startup failed: {e}") from e
|
|
107
|
+
finally:
|
|
108
|
+
if set_env:
|
|
109
|
+
if prev_env is None:
|
|
110
|
+
try:
|
|
111
|
+
del os.environ["NV_INGEST_BROKER_IN_SUBPROCESS"]
|
|
112
|
+
except KeyError:
|
|
113
|
+
pass
|
|
114
|
+
else:
|
|
115
|
+
os.environ["NV_INGEST_BROKER_IN_SUBPROCESS"] = prev_env
|
|
90
116
|
|
|
91
117
|
def _setup_message_broker(self, config: PipelineConfigSchema) -> None:
|
|
92
118
|
"""
|
|
@@ -98,8 +124,21 @@ class PipelineLifecycleManager:
|
|
|
98
124
|
Pipeline configuration containing broker settings.
|
|
99
125
|
"""
|
|
100
126
|
if config.pipeline.launch_simple_broker:
|
|
127
|
+
# If requested to launch broker inside the subprocess, skip here
|
|
128
|
+
if os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1":
|
|
129
|
+
logger.info("Deferring SimpleMessageBroker launch to subprocess")
|
|
130
|
+
return
|
|
101
131
|
logger.info("Starting simple message broker")
|
|
102
|
-
|
|
132
|
+
# Start the broker and retain a handle for cleanup.
|
|
133
|
+
# Use defaults (host=0.0.0.0, port=7671) as set by the broker implementation.
|
|
134
|
+
try:
|
|
135
|
+
self._broker_process = start_simple_message_broker({})
|
|
136
|
+
# Ensure cleanup at interpreter shutdown in case caller forgets
|
|
137
|
+
atexit.register(self._terminate_broker_atexit)
|
|
138
|
+
logger.info(f"SimpleMessageBroker started (pid={getattr(self._broker_process, 'pid', None)})")
|
|
139
|
+
except Exception as e:
|
|
140
|
+
logger.error(f"Failed to start SimpleMessageBroker: {e}")
|
|
141
|
+
raise
|
|
103
142
|
else:
|
|
104
143
|
logger.debug("Simple broker launch not required")
|
|
105
144
|
|
|
@@ -109,6 +148,8 @@ class PipelineLifecycleManager:
|
|
|
109
148
|
|
|
110
149
|
This method provides a hook for future pipeline stopping functionality.
|
|
111
150
|
Currently, pipeline stopping is handled by the individual interfaces.
|
|
151
|
+
Additionally, it ensures any dependent services (like the simple
|
|
152
|
+
message broker) are terminated to avoid lingering processes.
|
|
112
153
|
|
|
113
154
|
Parameters
|
|
114
155
|
----------
|
|
@@ -116,7 +157,58 @@ class PipelineLifecycleManager:
|
|
|
116
157
|
Identifier of the pipeline to stop. Currently unused.
|
|
117
158
|
"""
|
|
118
159
|
logger.info("Pipeline stop requested")
|
|
119
|
-
#
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
160
|
+
# Best-effort termination of broker if we started one
|
|
161
|
+
self._terminate_broker()
|
|
162
|
+
|
|
163
|
+
# --- Internal helpers ---
|
|
164
|
+
def _terminate_broker_atexit(self) -> None:
|
|
165
|
+
"""Atexit-safe broker termination.
|
|
166
|
+
|
|
167
|
+
Avoids raising exceptions during interpreter shutdown.
|
|
168
|
+
"""
|
|
169
|
+
try:
|
|
170
|
+
self._terminate_broker()
|
|
171
|
+
except Exception:
|
|
172
|
+
# Swallow errors at atexit to avoid noisy shutdowns
|
|
173
|
+
pass
|
|
174
|
+
|
|
175
|
+
def _terminate_broker(self) -> None:
|
|
176
|
+
"""Terminate the SimpleMessageBroker process if running."""
|
|
177
|
+
proc = self._broker_process
|
|
178
|
+
if not proc:
|
|
179
|
+
return
|
|
180
|
+
try:
|
|
181
|
+
if hasattr(proc, "is_alive") and not proc.is_alive():
|
|
182
|
+
return
|
|
183
|
+
except Exception:
|
|
184
|
+
# If querying state fails, continue with termination attempt
|
|
185
|
+
pass
|
|
186
|
+
|
|
187
|
+
pid = getattr(proc, "pid", None)
|
|
188
|
+
logger.info(f"Stopping SimpleMessageBroker (pid={pid})")
|
|
189
|
+
try:
|
|
190
|
+
# First, try graceful terminate
|
|
191
|
+
proc.terminate()
|
|
192
|
+
try:
|
|
193
|
+
proc.join(timeout=3.0)
|
|
194
|
+
except Exception:
|
|
195
|
+
pass
|
|
196
|
+
|
|
197
|
+
# If still alive, escalate to SIGKILL on the single process
|
|
198
|
+
still_alive = False
|
|
199
|
+
try:
|
|
200
|
+
still_alive = hasattr(proc, "is_alive") and proc.is_alive()
|
|
201
|
+
except Exception:
|
|
202
|
+
still_alive = True
|
|
203
|
+
if still_alive and pid is not None:
|
|
204
|
+
try:
|
|
205
|
+
os.kill(pid, signal.SIGKILL)
|
|
206
|
+
except Exception:
|
|
207
|
+
pass
|
|
208
|
+
try:
|
|
209
|
+
proc.join(timeout=2.0)
|
|
210
|
+
except Exception:
|
|
211
|
+
pass
|
|
212
|
+
finally:
|
|
213
|
+
# Clear handle to avoid repeated attempts
|
|
214
|
+
self._broker_process = None
|
|
@@ -11,6 +11,7 @@ Strategy pattern for clean separation of execution concerns.
|
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
import atexit
|
|
14
|
+
import os
|
|
14
15
|
import logging
|
|
15
16
|
import multiprocessing
|
|
16
17
|
import time
|
|
@@ -25,6 +26,8 @@ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
|
|
|
25
26
|
from nv_ingest.framework.orchestration.process.execution import (
|
|
26
27
|
launch_pipeline,
|
|
27
28
|
run_pipeline_process,
|
|
29
|
+
)
|
|
30
|
+
from nv_ingest.framework.orchestration.process.termination import (
|
|
28
31
|
kill_pipeline_process_group,
|
|
29
32
|
)
|
|
30
33
|
|
|
@@ -140,16 +143,45 @@ class SubprocessStrategy(ProcessExecutionStrategy):
|
|
|
140
143
|
daemon=False,
|
|
141
144
|
)
|
|
142
145
|
|
|
143
|
-
|
|
146
|
+
# Hint to the lifecycle manager to skip starting the broker in the parent
|
|
147
|
+
prev_val = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS")
|
|
148
|
+
os.environ["NV_INGEST_BROKER_IN_SUBPROCESS"] = "1"
|
|
149
|
+
try:
|
|
150
|
+
process.start()
|
|
151
|
+
finally:
|
|
152
|
+
# Restore original env to avoid affecting other code paths
|
|
153
|
+
if prev_val is None:
|
|
154
|
+
try:
|
|
155
|
+
del os.environ["NV_INGEST_BROKER_IN_SUBPROCESS"]
|
|
156
|
+
except KeyError:
|
|
157
|
+
pass
|
|
158
|
+
else:
|
|
159
|
+
os.environ["NV_INGEST_BROKER_IN_SUBPROCESS"] = prev_val
|
|
144
160
|
interface = RayPipelineSubprocessInterface(process)
|
|
145
161
|
|
|
146
162
|
if options.block:
|
|
147
|
-
# Block until subprocess completes
|
|
163
|
+
# Block until subprocess completes, handling Ctrl+C to ensure teardown
|
|
148
164
|
start_time = time.time()
|
|
149
165
|
logger.info("Waiting for subprocess pipeline to complete...")
|
|
150
|
-
|
|
151
|
-
|
|
166
|
+
try:
|
|
167
|
+
process.join()
|
|
168
|
+
except KeyboardInterrupt:
|
|
169
|
+
logger.info("KeyboardInterrupt in parent; terminating subprocess group...")
|
|
170
|
+
try:
|
|
171
|
+
pid = int(process.pid)
|
|
172
|
+
kill_pipeline_process_group(pid)
|
|
173
|
+
finally:
|
|
174
|
+
# Best-effort wait for process to exit
|
|
175
|
+
try:
|
|
176
|
+
process.join(timeout=5.0)
|
|
177
|
+
except Exception:
|
|
178
|
+
pass
|
|
179
|
+
finally:
|
|
180
|
+
logger.info("Pipeline subprocess completed or terminated.")
|
|
152
181
|
elapsed_time = time.time() - start_time
|
|
182
|
+
# If process ended with failure, surface it
|
|
183
|
+
if hasattr(process, "exitcode") and process.exitcode not in (0, None):
|
|
184
|
+
raise RuntimeError(f"Pipeline subprocess exited with code {process.exitcode}")
|
|
153
185
|
return ExecutionResult(interface=None, elapsed_time=elapsed_time)
|
|
154
186
|
else:
|
|
155
187
|
# Return interface for non-blocking execution
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Process termination utilities, isolated to avoid circular imports.
|
|
7
|
+
|
|
8
|
+
This module provides functions to terminate a process and its entire process
|
|
9
|
+
group safely, without depending on pipeline construction or Ray types.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import signal
|
|
15
|
+
import time
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _safe_log(level: int, msg: str) -> None:
|
|
22
|
+
"""Best-effort logging that won't crash during interpreter shutdown."""
|
|
23
|
+
try:
|
|
24
|
+
logger.log(level, msg)
|
|
25
|
+
except Exception:
|
|
26
|
+
try:
|
|
27
|
+
# Fallback to stderr if available
|
|
28
|
+
import sys
|
|
29
|
+
|
|
30
|
+
if hasattr(sys, "__stderr__") and sys.__stderr__:
|
|
31
|
+
sys.__stderr__.write(msg + "\n")
|
|
32
|
+
sys.__stderr__.flush()
|
|
33
|
+
except Exception:
|
|
34
|
+
pass
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def kill_pipeline_process_group(process) -> None:
    """
    Kill a process and its entire process group.

    Accepts either a multiprocessing.Process-like object exposing a ``pid`` attribute
    or a raw PID integer. Sends SIGTERM to the process group first, and escalates
    to SIGKILL if it does not terminate within a short grace period.

    Parameters
    ----------
    process : multiprocessing.Process | int
        Process handle (or a raw PID int) for the process whose process group should be terminated.

    Raises
    ------
    AttributeError
        If ``process`` is neither a PID int nor an object with a usable ``pid``,
        or if the resolved PID is not a positive integer.
    """
    proc: Optional[object] = None
    pid: Optional[int] = None

    # bool is a subclass of int; True/False must not be read as PID 1/0.
    if isinstance(process, int) and not isinstance(process, bool):
        pid = process
    elif not isinstance(process, bool) and hasattr(process, "pid"):
        proc = process
        try:
            pid = int(getattr(proc, "pid"))
        except Exception as e:
            raise AttributeError(f"Invalid process-like object without usable pid: {e}") from e
    else:
        raise AttributeError(
            "kill_pipeline_process_group expects a multiprocessing.Process or a PID int (process-like object with .pid)"
        )

    if proc is not None and hasattr(proc, "is_alive") and not proc.is_alive():
        _safe_log(logging.DEBUG, "Process already terminated")
        return

    if pid is None:
        raise AttributeError("Unable to determine PID for process group termination")

    # PID 0 makes os.getpgid() resolve to the *caller's* process group, so
    # signalling it would take down this process; negative values are not PIDs.
    if pid <= 0:
        raise AttributeError(f"Refusing to terminate process group for non-positive PID: {pid}")

    _safe_log(logging.INFO, f"Terminating pipeline process group (PID: {pid})")

    try:
        # Send graceful termination to the entire process group
        os.killpg(os.getpgid(pid), signal.SIGTERM)

        # If we have a Process handle, give it a chance to exit cleanly
        if proc is not None and hasattr(proc, "join"):
            try:
                proc.join(timeout=5.0)
            except Exception:
                pass
            # A handle without is_alive() is assumed to still be running.
            still_alive = getattr(proc, "is_alive", lambda: True)()
        else:
            # Without a handle, provide a small grace period
            time.sleep(2.0)
            # Best-effort liveness probe: if the group lookup fails, the process is gone.
            try:
                _ = os.getpgid(pid)
                still_alive = True
            except Exception:
                still_alive = False

        if still_alive:
            _safe_log(logging.WARNING, "Process group did not terminate gracefully, using SIGKILL")
            try:
                os.killpg(os.getpgid(pid), signal.SIGKILL)
            finally:
                # Reap the child even if the SIGKILL call itself raised.
                if proc is not None and hasattr(proc, "join"):
                    try:
                        proc.join(timeout=3.0)
                    except Exception:
                        pass

    except (ProcessLookupError, OSError) as e:
        _safe_log(logging.DEBUG, f"Process group already terminated or not found: {e}")
|