nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest/framework/orchestration/process/execution.py +495 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
- nv_ingest/pipeline/default_pipeline_impl.py +557 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Helper functions for pipeline execution configuration.
|
|
7
|
+
|
|
8
|
+
This module contains generic helper functions for converting individual parameters
|
|
9
|
+
into structured configuration objects, supporting the declarative execution architecture.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from typing import Optional, TextIO
|
|
13
|
+
|
|
14
|
+
from nv_ingest.framework.orchestration.execution.options import PipelineRuntimeOverrides, ExecutionOptions
|
|
15
|
+
from nv_ingest.framework.orchestration.process.strategies import ProcessExecutionStrategy, create_execution_strategy
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def create_runtime_overrides(
    disable_dynamic_scaling: Optional[bool], dynamic_memory_threshold: Optional[float]
) -> PipelineRuntimeOverrides:
    """
    Bundle individual override values into a PipelineRuntimeOverrides object.

    Both arguments are forwarded unchanged as keyword arguments to the
    PipelineRuntimeOverrides constructor; no defaulting or conversion is
    performed here.

    Parameters
    ----------
    disable_dynamic_scaling : Optional[bool]
        Dynamic scaling override value, or None for no override.
    dynamic_memory_threshold : Optional[float]
        Memory threshold override value, or None for no override.

    Returns
    -------
    PipelineRuntimeOverrides
        Structured override object holding the two values as given.
    """
    overrides = PipelineRuntimeOverrides(
        disable_dynamic_scaling=disable_dynamic_scaling,
        dynamic_memory_threshold=dynamic_memory_threshold,
    )
    return overrides
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def create_execution_options(block: bool, stdout: Optional[TextIO], stderr: Optional[TextIO]) -> ExecutionOptions:
    """
    Bundle individual execution parameters into an ExecutionOptions object.

    The three arguments are forwarded unchanged as keyword arguments to the
    ExecutionOptions constructor.

    Parameters
    ----------
    block : bool
        Whether to block until pipeline completion.
    stdout : Optional[TextIO]
        Output stream for subprocess redirection.
    stderr : Optional[TextIO]
        Error stream for subprocess redirection.

    Returns
    -------
    ExecutionOptions
        Structured options object holding the values as given.
    """
    options = ExecutionOptions(
        block=block,
        stdout=stdout,
        stderr=stderr,
    )
    return options
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def select_execution_strategy(run_in_subprocess: bool) -> ProcessExecutionStrategy:
    """
    Obtain the execution strategy matching the requested process mode.

    The choice itself is delegated to ``create_execution_strategy``; this
    function only forwards the flag by keyword.

    Parameters
    ----------
    run_in_subprocess : bool
        Whether to run in a subprocess.

    Returns
    -------
    ProcessExecutionStrategy
        Strategy instance returned by ``create_execution_strategy``.
    """
    strategy = create_execution_strategy(run_in_subprocess=run_in_subprocess)
    return strategy
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Data classes for pipeline execution configuration and options.
|
|
7
|
+
|
|
8
|
+
This module defines declarative data structures for configuring pipeline execution,
|
|
9
|
+
replacing imperative parameter passing with structured configuration objects.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Optional, TextIO, Union
|
|
14
|
+
|
|
15
|
+
from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
|
|
16
|
+
RayPipelineInterface,
|
|
17
|
+
RayPipelineSubprocessInterface,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class PipelineRuntimeOverrides:
    """
    Optional runtime overrides layered on top of a pipeline configuration.

    A field left as None means "no override supplied".

    Attributes
    ----------
    disable_dynamic_scaling : Optional[bool]
        Override for the pipeline config's disable_dynamic_scaling setting.
    dynamic_memory_threshold : Optional[float]
        Override for the memory threshold used in dynamic scaling decisions.
        When provided it must lie in the closed interval [0.0, 1.0].

    Raises
    ------
    ValueError
        On construction, if dynamic_memory_threshold is provided and is not
        within [0.0, 1.0].
    """

    disable_dynamic_scaling: Optional[bool] = None
    dynamic_memory_threshold: Optional[float] = None

    def __post_init__(self):
        """Reject a memory threshold that falls outside [0.0, 1.0]."""
        threshold = self.dynamic_memory_threshold
        if threshold is None:
            # Nothing supplied, nothing to validate.
            return

        # Keep the chained comparison: it is False for NaN, so NaN is rejected too.
        in_range = 0.0 <= threshold <= 1.0
        if not in_range:
            raise ValueError(f"dynamic_memory_threshold must be between 0.0 and 1.0, got {threshold}")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class ExecutionOptions:
    """
    Settings that govern how a pipeline run is carried out.

    Covers whether the caller waits for completion and where subprocess
    output is sent.

    Attributes
    ----------
    block : bool
        True (default) blocks until the pipeline completes; False returns
        immediately with a control interface.
    stdout : Optional[TextIO]
        Stream for subprocess stdout redirection. Only used when
        run_in_subprocess=True. If None, redirected to /dev/null.
    stderr : Optional[TextIO]
        Stream for subprocess stderr redirection. Only used when
        run_in_subprocess=True. If None, redirected to /dev/null.
    """

    # Wait for completion unless the caller opts out.
    block: bool = True
    # Subprocess output targets; None means no stream was supplied.
    stdout: Optional[TextIO] = None
    stderr: Optional[TextIO] = None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class ExecutionResult:
    """
    Outcome of a pipeline execution: a control interface and/or a duration.

    Also offers a conversion to the legacy single-value return format for
    backward compatibility.

    Attributes
    ----------
    interface : Union[RayPipelineInterface, RayPipelineSubprocessInterface, None]
        Pipeline control interface. None for blocking subprocess execution.
    elapsed_time : Optional[float]
        Total execution time in seconds. Only set for blocking execution.
    """

    interface: Union[RayPipelineInterface, RayPipelineSubprocessInterface, None]
    elapsed_time: Optional[float] = None

    def get_return_value(self) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
        """
        Collapse this result into the legacy single return value.

        Returns
        -------
        Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]
            - ``elapsed_time`` whenever it is set (blocking execution); this
              takes precedence even if an interface is also present.
            - Otherwise the pipeline interface (non-blocking execution).

        Raises
        ------
        RuntimeError
            If both ``elapsed_time`` and ``interface`` are None.
        """
        # A recorded duration wins over the interface.
        if self.elapsed_time is not None:
            return self.elapsed_time

        if self.interface is None:
            # This should not happen in normal execution
            raise RuntimeError("ExecutionResult has neither interface nor elapsed_time")

        return self.interface
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Dependent services management for pipeline orchestration.
|
|
7
|
+
|
|
8
|
+
This module contains utilities for starting and managing dependent services
|
|
9
|
+
that the pipeline requires, such as message brokers and other infrastructure.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import multiprocessing
|
|
15
|
+
import socket
|
|
16
|
+
from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _broker_server_target(host, port, max_queue_size):
    """
    Process entry point that constructs a SimpleMessageBroker and serves forever.

    Parameters
    ----------
    host :
        Host passed as the first argument to SimpleMessageBroker.
    port :
        Port passed as the second argument to SimpleMessageBroker.
    max_queue_size :
        Queue size limit passed as the third argument to SimpleMessageBroker.

    Notes
    -----
    Setting SO_REUSEADDR on the server socket is best-effort: a failure is
    logged at debug level and the broker is started anyway. This call does not
    return under normal operation because ``serve_forever()`` blocks.
    """
    server = SimpleMessageBroker(host, port, max_queue_size)
    try:
        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    except Exception:
        # Deliberately non-fatal, but leave a trace instead of swallowing silently.
        logger.debug("Could not set SO_REUSEADDR on SimpleMessageBroker socket", exc_info=True)
    server.serve_forever()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
    """
    Starts a SimpleMessageBroker server in a separate process.

    Parameters
    ----------
    broker_client : dict
        Broker configuration. Keys read by this function:
        - "host": address to bind the server to (default "0.0.0.0"),
        - "port": the port to bind the server to (default 7671),
        - "broker_params": optional mapping; "max_queue_size" is read from it
          (default 10000). A missing or None value is treated as empty.

    Returns
    -------
    multiprocessing.Process
        The already-started process running the SimpleMessageBroker server.

    Notes
    -----
    A process is always spawned and returned, even when the pre-flight check
    finds something already listening on the target port; that case only
    produces a warning.

    The process is a daemon only when the environment variable
    NV_INGEST_BROKER_IN_SUBPROCESS equals "1"; otherwise it is non-daemonic.
    """

    # Resolve host/port early for pre-flight checks.
    # `or {}` guards against an explicit `"broker_params": None` entry.
    broker_params = broker_client.get("broker_params") or {}
    max_queue_size = broker_params.get("max_queue_size", 10000)
    server_host = broker_client.get("host", "0.0.0.0")
    server_port = broker_client.get("port", 7671)

    # Pre-flight: detect whether something is already listening on the target port.
    # This is advisory only; it results in a warning and never prevents the spawn below.
    def _is_port_open(host: str, port: int) -> bool:
        # A wildcard bind address cannot be connected to; probe loopback instead.
        check_host = "127.0.0.1" if host in ("0.0.0.0", "::") else host
        try:
            with socket.create_connection((check_host, port), timeout=0.5):
                return True
        except Exception:
            # Any failure to connect is treated as "nothing listening".
            return False

    if _is_port_open(server_host, server_port):
        logger.warning(
            f"SimpleMessageBroker port already in use at {server_host}:{server_port}; "
            f"continuing to spawn a broker process (tests expect a Process to be returned)"
        )

    p = multiprocessing.Process(
        target=_broker_server_target,
        args=(server_host, server_port, max_queue_size),
        daemon=True,
    )
    # The assignment below is the effective daemon setting: it overwrites the
    # constructor's daemon=True. When launched from inside the pipeline subprocess
    # (NV_INGEST_BROKER_IN_SUBPROCESS == "1") the broker is a daemon so it dies
    # automatically when that subprocess exits; otherwise it is non-daemonic.
    p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
    p.start()
    logger.info(f"Started SimpleMessageBroker server in separate process on port {server_port}")

    return p
|