nv-ingest 2025.5.22.dev20250522__py3-none-any.whl → 2025.5.29.dev20250529__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/api/main.py +3 -1
- nv_ingest/api/v1/metrics.py +29 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +20 -3
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +146 -29
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +1 -1
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +33 -33
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +7 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +215 -9
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +15 -0
- {nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/METADATA +5 -3
- {nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/RECORD +15 -13
- {nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/WHEEL +1 -1
- {nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/top_level.txt +0 -0
nv_ingest/api/main.py
CHANGED
|
@@ -14,6 +14,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
|
14
14
|
|
|
15
15
|
from .v1.health import router as HealthApiRouter
|
|
16
16
|
from .v1.ingest import router as IngestApiRouter
|
|
17
|
+
from .v1.metrics import router as MetricsApiRouter
|
|
17
18
|
|
|
18
19
|
logger = logging.getLogger(__name__)
|
|
19
20
|
|
|
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
|
|
|
21
22
|
app = FastAPI(
|
|
22
23
|
title="NV-Ingest Microservice",
|
|
23
24
|
description="Service for ingesting heterogenous datatypes",
|
|
24
|
-
version="25.
|
|
25
|
+
version="25.4.2",
|
|
25
26
|
contact={
|
|
26
27
|
"name": "NVIDIA Corporation",
|
|
27
28
|
"url": "https://nvidia.com",
|
|
@@ -31,6 +32,7 @@ app = FastAPI(
|
|
|
31
32
|
|
|
32
33
|
app.include_router(IngestApiRouter, prefix="/v1")
|
|
33
34
|
app.include_router(HealthApiRouter, prefix="/v1/health")
|
|
35
|
+
app.include_router(MetricsApiRouter, prefix="/v1")
|
|
34
36
|
|
|
35
37
|
# Set up the tracer provider and add a processor for exporting traces
|
|
36
38
|
resource = Resource(attributes={"service.name": "nv-ingest"})
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from fastapi import APIRouter, Response, status
|
|
7
|
+
from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
|
|
8
|
+
|
|
9
|
+
router = APIRouter()
|
|
10
|
+
|
|
11
|
+
# logger = logging.getLogger("uvicorn")
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Prometheus metrics
|
|
15
|
+
REQUEST_COUNT = Counter("http_requests_total", "Total HTTP Requests", ["method", "endpoint"])
|
|
16
|
+
REQUEST_LATENCY = Histogram("http_request_duration_seconds", "Request latency", ["method", "endpoint"])
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@router.get(
|
|
20
|
+
"/metrics",
|
|
21
|
+
tags=["Health"],
|
|
22
|
+
summary="Provide prometheus formatted metrics for consumption",
|
|
23
|
+
description="""
|
|
24
|
+
Provide prometheus formatted metrics for consumption by a prometheus scraping server.
|
|
25
|
+
""",
|
|
26
|
+
status_code=status.HTTP_200_OK,
|
|
27
|
+
)
|
|
28
|
+
def metrics():
|
|
29
|
+
return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
|
|
@@ -74,9 +74,26 @@ class PipelineTopology:
|
|
|
74
74
|
self._start_cleanup_thread() # Start background cleanup on init
|
|
75
75
|
|
|
76
76
|
def __del__(self):
|
|
77
|
-
"""Ensure cleanup thread is stopped
|
|
78
|
-
logger.debug("PipelineTopology destructor called
|
|
79
|
-
|
|
77
|
+
"""Ensure cleanup thread is stopped and internal actor references are released."""
|
|
78
|
+
logger.debug("PipelineTopology destructor called. Cleaning up thread and actor references.")
|
|
79
|
+
|
|
80
|
+
# Stop the background cleanup thread
|
|
81
|
+
try:
|
|
82
|
+
self._stop_cleanup_thread()
|
|
83
|
+
except Exception as e:
|
|
84
|
+
logger.warning(f"Error stopping cleanup thread during __del__: {e}")
|
|
85
|
+
|
|
86
|
+
# Clear references to actor handles and shutdown futures
|
|
87
|
+
try:
|
|
88
|
+
self._stage_actors.clear()
|
|
89
|
+
self._edge_queues.clear()
|
|
90
|
+
self._scaling_state.clear()
|
|
91
|
+
self._stage_memory_overhead.clear()
|
|
92
|
+
self._pending_removal_actors.clear()
|
|
93
|
+
self._stages.clear()
|
|
94
|
+
self._connections.clear()
|
|
95
|
+
except Exception as e:
|
|
96
|
+
logger.warning(f"Error clearing internal state during __del__: {e}")
|
|
80
97
|
|
|
81
98
|
# --- Lock Context Manager ---
|
|
82
99
|
@contextlib.contextmanager
|
|
@@ -2,7 +2,11 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
import multiprocessing
|
|
6
|
+
import os
|
|
7
|
+
import signal
|
|
5
8
|
import threading
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
6
10
|
from collections import defaultdict
|
|
7
11
|
from dataclasses import dataclass
|
|
8
12
|
|
|
@@ -24,6 +28,35 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import P
|
|
|
24
28
|
logger = logging.getLogger(__name__)
|
|
25
29
|
|
|
26
30
|
|
|
31
|
+
class PipelineInterface(ABC):
|
|
32
|
+
"""
|
|
33
|
+
Abstract base class for pipeline implementations.
|
|
34
|
+
|
|
35
|
+
Any concrete pipeline must implement start and stop methods.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
@abstractmethod
|
|
39
|
+
def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
|
|
40
|
+
"""
|
|
41
|
+
Start the pipeline.
|
|
42
|
+
|
|
43
|
+
Parameters
|
|
44
|
+
----------
|
|
45
|
+
monitor_poll_interval : float
|
|
46
|
+
Interval in seconds for monitoring poll (default: 5.0).
|
|
47
|
+
scaling_poll_interval : float
|
|
48
|
+
Interval in seconds for scaling decisions (default: 30.0).
|
|
49
|
+
"""
|
|
50
|
+
pass
|
|
51
|
+
|
|
52
|
+
@abstractmethod
|
|
53
|
+
def stop(self) -> None:
|
|
54
|
+
"""
|
|
55
|
+
Stop the pipeline and perform any necessary cleanup.
|
|
56
|
+
"""
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
|
|
27
60
|
# --- Configuration Objects ---
|
|
28
61
|
|
|
29
62
|
|
|
@@ -62,7 +95,90 @@ class StatsConfig:
|
|
|
62
95
|
queue_timeout_seconds: float = 2.0
|
|
63
96
|
|
|
64
97
|
|
|
65
|
-
class
|
|
98
|
+
class RayPipelineSubprocessInterface(PipelineInterface):
|
|
99
|
+
"""
|
|
100
|
+
Pipeline interface implementation for a subprocess-based Ray pipeline.
|
|
101
|
+
"""
|
|
102
|
+
|
|
103
|
+
def __init__(self, process: multiprocessing.Process):
|
|
104
|
+
"""
|
|
105
|
+
Parameters
|
|
106
|
+
----------
|
|
107
|
+
process : multiprocessing.Process
|
|
108
|
+
A handle to the running subprocess.
|
|
109
|
+
"""
|
|
110
|
+
self._process: multiprocessing.Process = process
|
|
111
|
+
|
|
112
|
+
def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
|
|
113
|
+
"""
|
|
114
|
+
Start is not supported because the subprocess is assumed to already be running.
|
|
115
|
+
"""
|
|
116
|
+
pass
|
|
117
|
+
|
|
118
|
+
def stop(self) -> None:
|
|
119
|
+
"""
|
|
120
|
+
Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
|
|
121
|
+
"""
|
|
122
|
+
if not self._process.is_alive():
|
|
123
|
+
return
|
|
124
|
+
|
|
125
|
+
try:
|
|
126
|
+
self._process.terminate()
|
|
127
|
+
self._process.join(timeout=5.0)
|
|
128
|
+
except Exception as e:
|
|
129
|
+
logger.warning(f"Failed to terminate process cleanly: {e}")
|
|
130
|
+
|
|
131
|
+
if self._process.is_alive():
|
|
132
|
+
try:
|
|
133
|
+
pgid = os.getpgid(self._process.pid)
|
|
134
|
+
os.killpg(pgid, signal.SIGKILL)
|
|
135
|
+
except Exception as e:
|
|
136
|
+
logger.error(f"Failed to force-kill process group: {e}")
|
|
137
|
+
self._process.join(timeout=3.0)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class RayPipelineInterface(PipelineInterface):
|
|
141
|
+
"""
|
|
142
|
+
Pipeline interface for an in-process RayPipeline instance.
|
|
143
|
+
"""
|
|
144
|
+
|
|
145
|
+
def __init__(self, pipeline: "RayPipeline"):
|
|
146
|
+
"""
|
|
147
|
+
Parameters
|
|
148
|
+
----------
|
|
149
|
+
pipeline : RayPipeline
|
|
150
|
+
The instantiated pipeline to control.
|
|
151
|
+
"""
|
|
152
|
+
self._pipeline = pipeline
|
|
153
|
+
|
|
154
|
+
def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
|
|
155
|
+
"""
|
|
156
|
+
Starts the RayPipeline.
|
|
157
|
+
|
|
158
|
+
Parameters
|
|
159
|
+
----------
|
|
160
|
+
monitor_poll_interval : float
|
|
161
|
+
Unused here; provided for interface compatibility.
|
|
162
|
+
scaling_poll_interval : float
|
|
163
|
+
Unused here; provided for interface compatibility.
|
|
164
|
+
"""
|
|
165
|
+
self._pipeline.start(monitor_poll_interval, scaling_poll_interval)
|
|
166
|
+
|
|
167
|
+
def stop(self) -> None:
|
|
168
|
+
"""
|
|
169
|
+
Stops the RayPipeline and shuts down Ray.
|
|
170
|
+
"""
|
|
171
|
+
self._pipeline.stop()
|
|
172
|
+
|
|
173
|
+
try:
|
|
174
|
+
import ray
|
|
175
|
+
|
|
176
|
+
ray.shutdown()
|
|
177
|
+
except Exception:
|
|
178
|
+
pass
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class RayPipeline(PipelineInterface):
|
|
66
182
|
"""
|
|
67
183
|
A structured pipeline supporting dynamic scaling and queue flushing.
|
|
68
184
|
Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
|
|
@@ -151,10 +267,17 @@ class RayPipeline:
|
|
|
151
267
|
actor_timeout=self.stats_config.actor_timeout_seconds,
|
|
152
268
|
queue_timeout=self.stats_config.queue_timeout_seconds,
|
|
153
269
|
)
|
|
270
|
+
|
|
154
271
|
logger.info("RayStatsCollector initialized using StatsConfig.")
|
|
155
272
|
|
|
156
273
|
# --- Accessor Methods for Stats Collector (and internal use) ---
|
|
157
274
|
|
|
275
|
+
def __del__(self):
|
|
276
|
+
try:
|
|
277
|
+
self.stop()
|
|
278
|
+
except Exception as e:
|
|
279
|
+
logger.error(f"Exception during RayPipeline cleanup: {e}")
|
|
280
|
+
|
|
158
281
|
def get_stages_info(self) -> List[StageInfo]:
|
|
159
282
|
"""Returns a snapshot of the current stage information."""
|
|
160
283
|
return self.topology.get_stages_info()
|
|
@@ -516,7 +639,9 @@ class RayPipeline:
|
|
|
516
639
|
"""
|
|
517
640
|
current_count = len(current_replicas)
|
|
518
641
|
num_to_remove = current_count - target_count
|
|
519
|
-
logger.
|
|
642
|
+
logger.debug(
|
|
643
|
+
f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
|
|
644
|
+
)
|
|
520
645
|
|
|
521
646
|
# Basic validation
|
|
522
647
|
if num_to_remove <= 0:
|
|
@@ -564,7 +689,7 @@ class RayPipeline:
|
|
|
564
689
|
logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
|
|
565
690
|
|
|
566
691
|
total_attempted = len(actors_to_remove)
|
|
567
|
-
logger.
|
|
692
|
+
logger.debug(
|
|
568
693
|
f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
|
|
569
694
|
f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
|
|
570
695
|
)
|
|
@@ -647,9 +772,6 @@ class RayPipeline:
|
|
|
647
772
|
# Activity check
|
|
648
773
|
is_quiet = global_in_flight <= self.quiet_period_threshold
|
|
649
774
|
|
|
650
|
-
if is_quiet:
|
|
651
|
-
logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")
|
|
652
|
-
|
|
653
775
|
return is_quiet
|
|
654
776
|
|
|
655
777
|
def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
|
|
@@ -670,7 +792,6 @@ class RayPipeline:
|
|
|
670
792
|
return False
|
|
671
793
|
|
|
672
794
|
# --- Trigger immediate stats collection via the collector instance ---
|
|
673
|
-
drain_stats = {}
|
|
674
795
|
drain_success = False
|
|
675
796
|
collection_error = None
|
|
676
797
|
|
|
@@ -689,19 +810,18 @@ class RayPipeline:
|
|
|
689
810
|
if not collection_error
|
|
690
811
|
else f"Collection Error: {type(collection_error).__name__}"
|
|
691
812
|
)
|
|
692
|
-
logger.
|
|
693
|
-
f"[
|
|
813
|
+
logger.debug(
|
|
814
|
+
f"[Drain] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
|
|
694
815
|
)
|
|
695
816
|
last_in_flight = global_in_flight
|
|
696
817
|
|
|
697
818
|
# --- Check for successful drain ---
|
|
698
819
|
# Requires BOTH in-flight=0 AND the collection reporting it was successful
|
|
699
820
|
if global_in_flight == 0 and drain_success and not collection_error:
|
|
700
|
-
logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
|
|
701
821
|
return True
|
|
702
822
|
elif global_in_flight == 0: # Saw zero, but collection wasn't fully successful
|
|
703
823
|
logger.warning(
|
|
704
|
-
"[
|
|
824
|
+
"[Drain] In-Flight reached 0, but stats collection had errors/timeouts."
|
|
705
825
|
" Cannot confirm drain yet."
|
|
706
826
|
)
|
|
707
827
|
|
|
@@ -719,7 +839,6 @@ class RayPipeline:
|
|
|
719
839
|
|
|
720
840
|
# Set flushing state in topology
|
|
721
841
|
self.topology.set_flushing(True)
|
|
722
|
-
logger.info("--- Starting Queue Flush ---")
|
|
723
842
|
overall_success = False
|
|
724
843
|
source_actors_paused = []
|
|
725
844
|
pause_refs = []
|
|
@@ -734,7 +853,7 @@ class RayPipeline:
|
|
|
734
853
|
current_connections = self.topology.get_connections()
|
|
735
854
|
|
|
736
855
|
# --- 1. Pause Source Stages (using snapshots) ---
|
|
737
|
-
logger.
|
|
856
|
+
logger.debug("Pausing source stages...")
|
|
738
857
|
pause_timeout = 60.0
|
|
739
858
|
for stage in current_stages:
|
|
740
859
|
if stage.is_source:
|
|
@@ -747,22 +866,22 @@ class RayPipeline:
|
|
|
747
866
|
except Exception as e:
|
|
748
867
|
logger.error(f"Failed sending pause to {actor}: {e}")
|
|
749
868
|
if pause_refs:
|
|
750
|
-
logger.
|
|
869
|
+
logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
|
|
751
870
|
try:
|
|
752
871
|
ray.get(pause_refs, timeout=pause_timeout)
|
|
753
|
-
logger.
|
|
872
|
+
logger.debug(f"{len(pause_refs)} sources acknowledged pause.")
|
|
754
873
|
except GetTimeoutError:
|
|
755
874
|
logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
|
|
756
875
|
except Exception as e:
|
|
757
876
|
logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")
|
|
758
877
|
|
|
759
878
|
# --- 2. Wait for Drain ---
|
|
760
|
-
logger.
|
|
879
|
+
logger.debug("Waiting for pipeline to drain...")
|
|
761
880
|
if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
|
|
762
881
|
raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")
|
|
763
882
|
|
|
764
883
|
# --- 3. Create New Queues (using snapshot) ---
|
|
765
|
-
logger.
|
|
884
|
+
logger.debug("Creating new replacement queues...")
|
|
766
885
|
new_edge_queues_map = {}
|
|
767
886
|
for queue_name, (_, queue_size) in current_edge_queues.items():
|
|
768
887
|
try:
|
|
@@ -775,7 +894,7 @@ class RayPipeline:
|
|
|
775
894
|
raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e
|
|
776
895
|
|
|
777
896
|
# --- 4. Re-wire Actors to New Queues (using snapshots) ---
|
|
778
|
-
logger.
|
|
897
|
+
logger.debug("Re-wiring actors to new queues...")
|
|
779
898
|
wiring_refs = []
|
|
780
899
|
wiring_timeout = 120.0
|
|
781
900
|
for from_stage_name, conns in current_connections.items():
|
|
@@ -811,7 +930,7 @@ class RayPipeline:
|
|
|
811
930
|
raise RuntimeError("Actor re-wiring failed.") from e
|
|
812
931
|
|
|
813
932
|
# --- 5. Update Topology State (Commit Point) ---
|
|
814
|
-
logger.
|
|
933
|
+
logger.debug("Committing new queues to pipeline topology.")
|
|
815
934
|
self.topology.set_edge_queues(new_edge_queues_map) # Commit the change
|
|
816
935
|
overall_success = True
|
|
817
936
|
|
|
@@ -822,7 +941,7 @@ class RayPipeline:
|
|
|
822
941
|
finally:
|
|
823
942
|
# --- 6. Resume Source Stages (Always attempt) ---
|
|
824
943
|
if source_actors_paused:
|
|
825
|
-
logger.
|
|
944
|
+
logger.debug(f"Attempting to resume {len(source_actors_paused)} source actors...")
|
|
826
945
|
resume_timeout = 30.0
|
|
827
946
|
resume_refs = []
|
|
828
947
|
for actor in source_actors_paused:
|
|
@@ -831,10 +950,10 @@ class RayPipeline:
|
|
|
831
950
|
except Exception as e:
|
|
832
951
|
logger.error(f"Failed sending resume to {actor}: {e}")
|
|
833
952
|
if resume_refs:
|
|
834
|
-
logger.
|
|
953
|
+
logger.debug(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
|
|
835
954
|
try:
|
|
836
955
|
ray.get(resume_refs, timeout=resume_timeout)
|
|
837
|
-
logger.
|
|
956
|
+
logger.debug(f"{len(resume_refs)} sources resumed.")
|
|
838
957
|
except GetTimeoutError:
|
|
839
958
|
logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
|
|
840
959
|
except Exception as e:
|
|
@@ -843,9 +962,6 @@ class RayPipeline:
|
|
|
843
962
|
# Update flush timestamp only on success
|
|
844
963
|
if overall_success:
|
|
845
964
|
self._last_queue_flush_time = time.time()
|
|
846
|
-
logger.info("--- Queue Flush Completed Successfully ---")
|
|
847
|
-
else:
|
|
848
|
-
logger.error("--- Queue Flush Failed ---")
|
|
849
965
|
|
|
850
966
|
# Reset flushing state in topology
|
|
851
967
|
self.topology.set_flushing(False)
|
|
@@ -977,7 +1093,7 @@ class RayPipeline:
|
|
|
977
1093
|
|
|
978
1094
|
if target_replica_count != current_count:
|
|
979
1095
|
stages_needing_action.append((stage_name, target_replica_count))
|
|
980
|
-
logger.
|
|
1096
|
+
logger.debug(
|
|
981
1097
|
f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
|
|
982
1098
|
f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
|
|
983
1099
|
)
|
|
@@ -1019,7 +1135,7 @@ class RayPipeline:
|
|
|
1019
1135
|
completed = sum(1 for r in action_results.values() if r["status"] == "completed")
|
|
1020
1136
|
errors = sum(1 for r in action_results.values() if r["status"] == "error")
|
|
1021
1137
|
timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
|
|
1022
|
-
logger.
|
|
1138
|
+
logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
|
|
1023
1139
|
|
|
1024
1140
|
def _perform_scaling_and_maintenance(self) -> None:
|
|
1025
1141
|
"""Orchestrates scaling/maintenance using topology and stats collector."""
|
|
@@ -1050,9 +1166,9 @@ class RayPipeline:
|
|
|
1050
1166
|
logger.debug("--- Performing Scaling & Maintenance Cycle ---")
|
|
1051
1167
|
|
|
1052
1168
|
if self._is_pipeline_quiet():
|
|
1053
|
-
logger.info("Pipeline quiet, initiating queue flush.")
|
|
1169
|
+
logger.info("[Drain] Pipeline quiet, initiating queue flush.")
|
|
1054
1170
|
flush_success = self._execute_queue_flush()
|
|
1055
|
-
logger.info(f"Automatic queue flush completed. Success: {flush_success}")
|
|
1171
|
+
logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
|
|
1056
1172
|
return
|
|
1057
1173
|
|
|
1058
1174
|
# Fast return check if stopping occurred while flushing or checking flush status
|
|
@@ -1201,5 +1317,6 @@ class RayPipeline:
|
|
|
1201
1317
|
|
|
1202
1318
|
# Clear runtime state in topology
|
|
1203
1319
|
self.topology.clear_runtime_state()
|
|
1320
|
+
del self.topology
|
|
1204
1321
|
|
|
1205
1322
|
logger.info("Pipeline stopped.")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
import ray
|
|
9
|
+
|
|
10
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
12
|
+
from nv_ingest_api.internal.extract.html.html_extractor import extract_markdown_from_html_internal
|
|
13
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
|
+
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
|
|
16
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
|
+
nv_ingest_node_failure_try_except,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@ray.remote
|
|
24
|
+
class HtmlExtractorStage(RayActorStage):
|
|
25
|
+
"""
|
|
26
|
+
A Ray actor stage that extracts text in markdown format from html content.
|
|
27
|
+
|
|
28
|
+
It expects an IngestControlMessage containing a DataFrame with html content. It then:
|
|
29
|
+
1. Removes the "html_content_extract" task from the message.
|
|
30
|
+
2. Calls the html extraction logic (via extract_markdown_from_html_internal) using a validated configuration.
|
|
31
|
+
3. Updates the message payload with the extracted text DataFrame.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
def __init__(self, config: HtmlExtractorSchema) -> None:
|
|
35
|
+
super().__init__(config, log_to_stdout=False)
|
|
36
|
+
try:
|
|
37
|
+
self.validated_config = config
|
|
38
|
+
self._logger.info("HtmlExtractorStage configuration validated successfully.")
|
|
39
|
+
except Exception as e:
|
|
40
|
+
self._logger.exception(f"Error validating Html Extractor config: {e}")
|
|
41
|
+
raise
|
|
42
|
+
|
|
43
|
+
@traceable("html_extractor")
|
|
44
|
+
@filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
|
|
45
|
+
@nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
|
|
46
|
+
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
|
+
"""
|
|
48
|
+
Process the control message by extracting content from html.
|
|
49
|
+
|
|
50
|
+
Parameters
|
|
51
|
+
----------
|
|
52
|
+
control_message : IngestControlMessage
|
|
53
|
+
The message containing a DataFrame payload with html content.
|
|
54
|
+
|
|
55
|
+
Returns
|
|
56
|
+
-------
|
|
57
|
+
IngestControlMessage
|
|
58
|
+
The updated message with extracted content.
|
|
59
|
+
"""
|
|
60
|
+
self._logger.debug("HtmlExtractorStage.on_data: Starting html extraction process.")
|
|
61
|
+
|
|
62
|
+
# Extract the DataFrame payload.
|
|
63
|
+
df_ledger = control_message.payload()
|
|
64
|
+
self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
|
|
65
|
+
|
|
66
|
+
# Remove the "html_content_extract" task from the message to obtain task-specific configuration.
|
|
67
|
+
task_config = remove_task_by_type(control_message, "extract")
|
|
68
|
+
self._logger.debug("Extracted task config: %s", task_config)
|
|
69
|
+
|
|
70
|
+
# Perform html content extraction.
|
|
71
|
+
new_df, extraction_info = extract_markdown_from_html_internal(
|
|
72
|
+
df_extraction_ledger=df_ledger,
|
|
73
|
+
task_config=task_config,
|
|
74
|
+
extraction_config=self.validated_config,
|
|
75
|
+
execution_trace_log=None,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Update the message payload with the extracted text DataFrame.
|
|
79
|
+
control_message.payload(new_df)
|
|
80
|
+
control_message.set_metadata("html_extraction_info", extraction_info)
|
|
81
|
+
|
|
82
|
+
return control_message
|
|
@@ -495,7 +495,7 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
|
|
|
495
495
|
server.serve_forever()
|
|
496
496
|
|
|
497
497
|
p = multiprocessing.Process(target=broker_server)
|
|
498
|
-
p.daemon =
|
|
498
|
+
p.daemon = False
|
|
499
499
|
p.start()
|
|
500
500
|
logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
|
|
501
501
|
|
|
@@ -490,7 +490,7 @@ class ResourceConstraintManager:
|
|
|
490
490
|
final_proposals_this_step = {}
|
|
491
491
|
|
|
492
492
|
if not room_to_scale_up_to_global_caps:
|
|
493
|
-
logger.
|
|
493
|
+
logger.debug(
|
|
494
494
|
"[ConstraintMgr-Proportional] Global scaling beyond effective minimums is RESTRICTED "
|
|
495
495
|
"as SumOfEffectiveMins likely meets/exceeds a global Core/MaxReplica cap. "
|
|
496
496
|
"Proposed increases from initial current values will be nullified."
|
|
@@ -502,7 +502,7 @@ class ResourceConstraintManager:
|
|
|
502
502
|
if val_from_prior_phases > original_current_replicas:
|
|
503
503
|
final_proposals_this_step[name] = original_current_replicas
|
|
504
504
|
if val_from_prior_phases != original_current_replicas:
|
|
505
|
-
logger.
|
|
505
|
+
logger.debug(
|
|
506
506
|
f"[ConstraintMgr-{name}] Proportional: Scaling restricted. "
|
|
507
507
|
f"Nullified proposed increase from {original_current_replicas} to {val_from_prior_phases}. "
|
|
508
508
|
f"Setting to {original_current_replicas}."
|
|
@@ -618,7 +618,7 @@ class ResourceConstraintManager:
|
|
|
618
618
|
|
|
619
619
|
# Apply reduction to the deltas
|
|
620
620
|
if reduction_factor <= 0.001: # Epsilon for float
|
|
621
|
-
logger.
|
|
621
|
+
logger.debug(
|
|
622
622
|
f"[ConstraintMgr-Proportional] Scale-up beyond effective minimums fully constrained by global limits. "
|
|
623
623
|
f"Reasons: {'; '.join(limiting_reasons) if limiting_reasons else 'None'}. "
|
|
624
624
|
f"Final ReductionFactor={reduction_factor:.3f}."
|
|
@@ -637,7 +637,7 @@ class ResourceConstraintManager:
|
|
|
637
637
|
)
|
|
638
638
|
|
|
639
639
|
elif reduction_factor < 1.0:
|
|
640
|
-
logger.
|
|
640
|
+
logger.debug(
|
|
641
641
|
f"[ConstraintMgr-Proportional] Reducing requested scale-up (beyond effective_mins) by "
|
|
642
642
|
f"factor {reduction_factor:.3f}. "
|
|
643
643
|
f"Limiting Factors: {'; '.join(limiting_reasons)}."
|
|
@@ -654,7 +654,7 @@ class ResourceConstraintManager:
|
|
|
654
654
|
f"-> FinalVal={final_value_for_stage}"
|
|
655
655
|
)
|
|
656
656
|
else: # reduction_factor is ~1.0, meaning full requested increase (above effective_mins) is allowed
|
|
657
|
-
logger.
|
|
657
|
+
logger.debug(
|
|
658
658
|
"[ConstraintMgr-Proportional] Full requested scale-up (beyond effective_mins) "
|
|
659
659
|
"is permissible by global limits."
|
|
660
660
|
)
|
|
@@ -713,7 +713,7 @@ class ResourceConstraintManager:
|
|
|
713
713
|
target = max(1, min_r)
|
|
714
714
|
final_target = min(target, max_r)
|
|
715
715
|
if final_target > 0:
|
|
716
|
-
logger.
|
|
716
|
+
logger.debug(
|
|
717
717
|
f"[ConstraintMgr-{name}] Forcing minimum {final_target} replica due to global wake-up."
|
|
718
718
|
)
|
|
719
719
|
final_adjustments[name] = final_target
|
|
@@ -740,19 +740,19 @@ class ResourceConstraintManager:
|
|
|
740
740
|
num_queue_actors = num_edges
|
|
741
741
|
total_ray_components_for_info = final_stage_replicas_total + num_queue_actors
|
|
742
742
|
|
|
743
|
-
logger.
|
|
743
|
+
logger.debug("[ConstraintMgr] --- Final Decision & Constraint Summary ---")
|
|
744
744
|
|
|
745
745
|
# --- I. Overall Pipeline State ---
|
|
746
|
-
logger.
|
|
747
|
-
logger.
|
|
748
|
-
logger.
|
|
746
|
+
logger.debug(f"[ConstraintMgr] Pipeline Activity: {global_in_flight} tasks in-flight.")
|
|
747
|
+
logger.debug(f"[ConstraintMgr] Effective Min Replicas (Sum): {sum_of_effective_mins}")
|
|
748
|
+
logger.debug(
|
|
749
749
|
f"[ConstraintMgr] └─ Global Scaling Beyond Mins Permitted? {can_globally_scale_beyond_effective_mins}"
|
|
750
750
|
)
|
|
751
751
|
|
|
752
752
|
# --- II. Final Component Counts ---
|
|
753
|
-
logger.
|
|
754
|
-
logger.
|
|
755
|
-
logger.
|
|
753
|
+
logger.debug(f"[ConstraintMgr] Final Stage Replicas: {final_stage_replicas_total} (Target for caps)")
|
|
754
|
+
logger.debug(f"[ConstraintMgr] Queue/Edge Actors : {num_queue_actors} (Informational)")
|
|
755
|
+
logger.debug(f"[ConstraintMgr] Total Ray Components: {total_ray_components_for_info} (Informational)")
|
|
756
756
|
|
|
757
757
|
# --- III. Resource Limits & Projected Usage (for Stages) ---
|
|
758
758
|
# Configured Limits
|
|
@@ -762,18 +762,18 @@ class ResourceConstraintManager:
|
|
|
762
762
|
)
|
|
763
763
|
eff_mem_limit_str = f"{self.effective_memory_limit_mb:.1f}MB"
|
|
764
764
|
|
|
765
|
-
logger.
|
|
766
|
-
logger.
|
|
767
|
-
logger.
|
|
765
|
+
logger.debug("[ConstraintMgr] Global Limits (Stages):")
|
|
766
|
+
logger.debug(f"[ConstraintMgr] ├─ MaxTotalReplicas : {max_r_cfg_str}")
|
|
767
|
+
logger.debug(
|
|
768
768
|
f"[ConstraintMgr] ├─ CoreBasedRepLimit : {core_based_limit_str} "
|
|
769
769
|
f"(System EffCores: {self.available_cores if self.available_cores is not None else 'N/A'})"
|
|
770
770
|
)
|
|
771
|
-
logger.
|
|
771
|
+
logger.debug(f"[ConstraintMgr] └─ EffectiveMemLimit : {eff_mem_limit_str} ")
|
|
772
772
|
|
|
773
773
|
# Usage vs Limits
|
|
774
|
-
logger.
|
|
775
|
-
logger.
|
|
776
|
-
logger.
|
|
774
|
+
logger.debug("[ConstraintMgr] Projected Usage (Stages):")
|
|
775
|
+
logger.debug(f"[ConstraintMgr] ├─ Replicas : {final_stage_replicas_total}")
|
|
776
|
+
logger.debug(
|
|
777
777
|
f"[ConstraintMgr] └─ Memory : {projected_final_memory_mb:.1f}MB "
|
|
778
778
|
f"(Current: {current_global_memory_usage_mb:.1f}MB)"
|
|
779
779
|
)
|
|
@@ -815,20 +815,20 @@ class ResourceConstraintManager:
|
|
|
815
815
|
)
|
|
816
816
|
unexpected_breaches_details.append(f"MemoryLimit: {status_mem}")
|
|
817
817
|
|
|
818
|
-
logger.
|
|
819
|
-
logger.
|
|
820
|
-
logger.
|
|
821
|
-
logger.
|
|
818
|
+
logger.debug("[ConstraintMgr] Limit Adherence (Stages):")
|
|
819
|
+
logger.debug(f"[ConstraintMgr] ├─ MaxTotalReplicas : {status_max_r}")
|
|
820
|
+
logger.debug(f"[ConstraintMgr] ├─ CoreBasedRepLimit : {status_core_r}")
|
|
821
|
+
logger.debug(f"[ConstraintMgr] └─ EffectiveMemLimit : {status_mem}")
|
|
822
822
|
|
|
823
823
|
if unexpected_breaches_details:
|
|
824
|
-
logger.
|
|
824
|
+
logger.debug(f"[ConstraintMgr] └─ UNEXPECTED BREACHES: {'; '.join(unexpected_breaches_details)}")
|
|
825
825
|
else:
|
|
826
|
-
logger.
|
|
826
|
+
logger.debug("[ConstraintMgr] └─ All hard caps (beyond tolerated minimums/wake-up) appear respected.")
|
|
827
827
|
|
|
828
828
|
# --- V. Final Decisions Per Stage ---
|
|
829
|
-
logger.
|
|
829
|
+
logger.debug("[ConstraintMgr] Final Decisions (Per Stage):")
|
|
830
830
|
if not final_adjustments:
|
|
831
|
-
logger.
|
|
831
|
+
logger.debug("[ConstraintMgr] └─ No stages to adjust.")
|
|
832
832
|
else:
|
|
833
833
|
# Determine max stage name length for alignment
|
|
834
834
|
max_name_len = 0
|
|
@@ -843,12 +843,12 @@ class ResourceConstraintManager:
|
|
|
843
843
|
eff_min_str = f"(EffMin: {min_replicas if orig_prop else 'N/A'})"
|
|
844
844
|
|
|
845
845
|
# Basic alignment, can be improved with more sophisticated padding
|
|
846
|
-
logger.
|
|
846
|
+
logger.debug(
|
|
847
847
|
f"[ConstraintMgr] └─ {stage_name:<{max_name_len}} : "
|
|
848
848
|
f"{count:<3} {pid_proposed_str} {current_str} {eff_min_str}"
|
|
849
849
|
)
|
|
850
850
|
|
|
851
|
-
logger.
|
|
851
|
+
logger.debug("[ConstraintMgr] --- Constraint Summary END ---")
|
|
852
852
|
|
|
853
853
|
# --- Public Method ---
|
|
854
854
|
|
|
@@ -863,7 +863,7 @@ class ResourceConstraintManager:
|
|
|
863
863
|
Applies all configured constraints to initial replica proposals.
|
|
864
864
|
(Docstring from previous version is fine)
|
|
865
865
|
"""
|
|
866
|
-
logger.
|
|
866
|
+
logger.debug(
|
|
867
867
|
f"[ConstraintMgr] --- Applying Constraints START --- "
|
|
868
868
|
f"GlobalInFlight={global_in_flight}, "
|
|
869
869
|
f"CurrentGlobalMemMB={current_global_memory_usage_mb}, "
|
|
@@ -904,7 +904,7 @@ class ResourceConstraintManager:
|
|
|
904
904
|
current_effective_mins[name] = eff_min
|
|
905
905
|
sum_of_effective_mins += eff_min
|
|
906
906
|
|
|
907
|
-
logger.
|
|
907
|
+
logger.debug(
|
|
908
908
|
f"[ConstraintMgr] Calculated Effective Minimums: TotalSum={sum_of_effective_mins}. "
|
|
909
909
|
# f"IndividualMins: {current_effective_mins}" # Can be verbose
|
|
910
910
|
)
|
|
@@ -985,5 +985,5 @@ class ResourceConstraintManager:
|
|
|
985
985
|
can_globally_scale_up_stages, # Pass this for context in logging
|
|
986
986
|
)
|
|
987
987
|
|
|
988
|
-
logger.
|
|
988
|
+
logger.debug("[ConstraintMgr] --- Applying Constraints END ---")
|
|
989
989
|
return final_adjustments
|
|
@@ -19,6 +19,7 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
|
|
|
19
19
|
add_image_extractor_stage,
|
|
20
20
|
add_docx_extractor_stage,
|
|
21
21
|
add_audio_extractor_stage,
|
|
22
|
+
add_html_extractor_stage,
|
|
22
23
|
add_image_dedup_stage,
|
|
23
24
|
add_image_filter_stage,
|
|
24
25
|
add_table_extractor_stage,
|
|
@@ -53,7 +54,7 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
|
|
|
53
54
|
export_config_to_env(ingest_config)
|
|
54
55
|
|
|
55
56
|
current_level = logging.getLogger().getEffectiveLevel()
|
|
56
|
-
ray.init(
|
|
57
|
+
ray_context = ray.init(
|
|
57
58
|
namespace="nv_ingest_ray",
|
|
58
59
|
logging_level=current_level,
|
|
59
60
|
ignore_reinit_error=True,
|
|
@@ -103,6 +104,7 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
|
|
|
103
104
|
docx_extractor_stage_id = add_docx_extractor_stage(pipeline, default_cpu_count)
|
|
104
105
|
pptx_extractor_stage_id = add_pptx_extractor_stage(pipeline, default_cpu_count)
|
|
105
106
|
audio_extractor_stage_id = add_audio_extractor_stage(pipeline, default_cpu_count)
|
|
107
|
+
html_extractor_stage_id = add_html_extractor_stage(pipeline, default_cpu_count)
|
|
106
108
|
########################################################################################################
|
|
107
109
|
|
|
108
110
|
########################################################################################################
|
|
@@ -159,7 +161,8 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
|
|
|
159
161
|
pipeline.make_edge(audio_extractor_stage_id, docx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
|
|
160
162
|
pipeline.make_edge(docx_extractor_stage_id, pptx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
|
|
161
163
|
pipeline.make_edge(pptx_extractor_stage_id, image_extractor_stage_id, queue_size=ingest_edge_buffer_size)
|
|
162
|
-
pipeline.make_edge(image_extractor_stage_id,
|
|
164
|
+
pipeline.make_edge(image_extractor_stage_id, html_extractor_stage_id, queue_size=ingest_edge_buffer_size)
|
|
165
|
+
pipeline.make_edge(html_extractor_stage_id, infographic_extraction_stage_id, queue_size=ingest_edge_buffer_size)
|
|
163
166
|
|
|
164
167
|
###### Primitive Extractors ########
|
|
165
168
|
pipeline.make_edge(infographic_extraction_stage_id, table_extraction_stage_id, queue_size=ingest_edge_buffer_size)
|
|
@@ -193,3 +196,5 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
|
|
|
193
196
|
# pipe.add_edge(sink_stage, otel_tracer_stage)
|
|
194
197
|
|
|
195
198
|
# pipe.add_edge(otel_tracer_stage, completed_job_counter_stage)
|
|
199
|
+
|
|
200
|
+
return ray_context
|
|
@@ -2,16 +2,26 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
+
import atexit
|
|
5
6
|
import logging
|
|
7
|
+
import multiprocessing
|
|
6
8
|
import os
|
|
9
|
+
import signal
|
|
10
|
+
import sys
|
|
7
11
|
import time
|
|
12
|
+
from ctypes import CDLL, c_int
|
|
8
13
|
from datetime import datetime
|
|
9
|
-
from typing import Union, Tuple
|
|
14
|
+
from typing import Union, Tuple, Optional, TextIO
|
|
10
15
|
|
|
11
16
|
import ray
|
|
12
17
|
from pydantic import BaseModel, ConfigDict
|
|
13
18
|
|
|
14
|
-
from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import
|
|
19
|
+
from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
|
|
20
|
+
RayPipeline,
|
|
21
|
+
ScalingConfig,
|
|
22
|
+
RayPipelineSubprocessInterface,
|
|
23
|
+
RayPipelineInterface,
|
|
24
|
+
)
|
|
15
25
|
from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline
|
|
16
26
|
|
|
17
27
|
logger = logging.getLogger(__name__)
|
|
@@ -33,6 +43,8 @@ class PipelineCreationSchema(BaseModel):
|
|
|
33
43
|
including endpoints, API keys, and processing options.
|
|
34
44
|
"""
|
|
35
45
|
|
|
46
|
+
arrow_default_memory_pool: str = os.getenv("ARROW_DEFAULT_MEMORY_POOL", "system")
|
|
47
|
+
|
|
36
48
|
# Audio processing settings
|
|
37
49
|
audio_grpc_endpoint: str = os.getenv("AUDIO_GRPC_ENDPOINT", "grpc.nvcf.nvidia.com:443")
|
|
38
50
|
audio_function_id: str = os.getenv("AUDIO_FUNCTION_ID", "1598d209-5e27-4d3c-8079-4751568b1081")
|
|
@@ -100,6 +112,112 @@ class PipelineCreationSchema(BaseModel):
|
|
|
100
112
|
model_config = ConfigDict(extra="forbid")
|
|
101
113
|
|
|
102
114
|
|
|
115
|
+
def redirect_os_fds(stdout: Optional[TextIO] = None, stderr: Optional[TextIO] = None):
|
|
116
|
+
"""
|
|
117
|
+
Redirect OS-level stdout (fd=1) and stderr (fd=2) to the given file-like objects,
|
|
118
|
+
or to /dev/null if not provided.
|
|
119
|
+
|
|
120
|
+
Parameters
|
|
121
|
+
----------
|
|
122
|
+
stdout : Optional[TextIO]
|
|
123
|
+
Stream to receive OS-level stdout. If None, redirected to /dev/null.
|
|
124
|
+
stderr : Optional[TextIO]
|
|
125
|
+
Stream to receive OS-level stderr. If None, redirected to /dev/null.
|
|
126
|
+
"""
|
|
127
|
+
devnull_fd = os.open(os.devnull, os.O_WRONLY)
|
|
128
|
+
|
|
129
|
+
if stdout is not None:
|
|
130
|
+
os.dup2(stdout.fileno(), 1)
|
|
131
|
+
else:
|
|
132
|
+
os.dup2(devnull_fd, 1)
|
|
133
|
+
|
|
134
|
+
if stderr is not None:
|
|
135
|
+
os.dup2(stderr.fileno(), 2)
|
|
136
|
+
else:
|
|
137
|
+
os.dup2(devnull_fd, 2)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def set_pdeathsig(sig=signal.SIGKILL):
|
|
141
|
+
libc = CDLL("libc.so.6")
|
|
142
|
+
PR_SET_PDEATHSIG = 1
|
|
143
|
+
libc.prctl(PR_SET_PDEATHSIG, c_int(sig))
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def kill_pipeline_process_group(pid: int):
|
|
147
|
+
"""
|
|
148
|
+
Kill the process group associated with the given PID, if it exists and is alive.
|
|
149
|
+
|
|
150
|
+
Parameters
|
|
151
|
+
----------
|
|
152
|
+
pid : int
|
|
153
|
+
The PID of the process whose group should be killed.
|
|
154
|
+
"""
|
|
155
|
+
try:
|
|
156
|
+
# Get the process group ID
|
|
157
|
+
pgid = os.getpgid(pid)
|
|
158
|
+
|
|
159
|
+
# Check if the group is still alive by sending signal 0
|
|
160
|
+
os.killpg(pgid, 0) # Does not kill, just checks if it's alive
|
|
161
|
+
|
|
162
|
+
# If no exception, the group is alive — kill it
|
|
163
|
+
os.killpg(pgid, signal.SIGKILL)
|
|
164
|
+
print(f"Killed subprocess group {pgid}")
|
|
165
|
+
|
|
166
|
+
except ProcessLookupError:
|
|
167
|
+
print(f"Process group for PID {pid} no longer exists.")
|
|
168
|
+
except PermissionError:
|
|
169
|
+
print(f"Permission denied to kill process group for PID {pid}.")
|
|
170
|
+
except Exception as e:
|
|
171
|
+
print(f"Failed to kill subprocess group: {e}")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _run_pipeline_process(
|
|
175
|
+
ingest_config: PipelineCreationSchema,
|
|
176
|
+
disable_dynamic_scaling: Optional[bool],
|
|
177
|
+
dynamic_memory_threshold: Optional[float],
|
|
178
|
+
raw_stdout: Optional[TextIO] = None,
|
|
179
|
+
raw_stderr: Optional[TextIO] = None,
|
|
180
|
+
):
|
|
181
|
+
"""
|
|
182
|
+
Subprocess entrypoint to launch the pipeline. Redirects all output to the provided
|
|
183
|
+
file-like streams or /dev/null if not specified.
|
|
184
|
+
|
|
185
|
+
Parameters
|
|
186
|
+
----------
|
|
187
|
+
ingest_config : PipelineCreationSchema
|
|
188
|
+
Validated pipeline configuration.
|
|
189
|
+
disable_dynamic_scaling : Optional[bool]
|
|
190
|
+
Whether to disable dynamic scaling.
|
|
191
|
+
dynamic_memory_threshold : Optional[float]
|
|
192
|
+
Threshold for triggering scaling.
|
|
193
|
+
raw_stdout : Optional[TextIO]
|
|
194
|
+
Destination for stdout. Defaults to /dev/null.
|
|
195
|
+
raw_stderr : Optional[TextIO]
|
|
196
|
+
Destination for stderr. Defaults to /dev/null.
|
|
197
|
+
"""
|
|
198
|
+
# Set the death signal for the subprocess
|
|
199
|
+
set_pdeathsig()
|
|
200
|
+
os.setsid() # Creates new process group so it can be SIGKILLed as a group
|
|
201
|
+
|
|
202
|
+
# Redirect OS-level file descriptors
|
|
203
|
+
redirect_os_fds(stdout=raw_stdout, stderr=raw_stderr)
|
|
204
|
+
|
|
205
|
+
# Redirect Python-level sys.stdout/sys.stderr
|
|
206
|
+
sys.stdout = raw_stdout or open(os.devnull, "w")
|
|
207
|
+
sys.stderr = raw_stderr or open(os.devnull, "w")
|
|
208
|
+
|
|
209
|
+
try:
|
|
210
|
+
_launch_pipeline(
|
|
211
|
+
ingest_config,
|
|
212
|
+
block=True,
|
|
213
|
+
disable_dynamic_scaling=disable_dynamic_scaling,
|
|
214
|
+
dynamic_memory_threshold=dynamic_memory_threshold,
|
|
215
|
+
)
|
|
216
|
+
except Exception as e:
|
|
217
|
+
sys.__stderr__.write(f"Subprocess pipeline run failed: {e}\n")
|
|
218
|
+
raise
|
|
219
|
+
|
|
220
|
+
|
|
103
221
|
def _launch_pipeline(
|
|
104
222
|
ingest_config: PipelineCreationSchema,
|
|
105
223
|
block: bool,
|
|
@@ -122,7 +240,7 @@ def _launch_pipeline(
|
|
|
122
240
|
start_abs = datetime.now()
|
|
123
241
|
|
|
124
242
|
# Set up the ingestion pipeline
|
|
125
|
-
setup_ingestion_pipeline(pipeline, ingest_config.model_dump())
|
|
243
|
+
_ = setup_ingestion_pipeline(pipeline, ingest_config.model_dump())
|
|
126
244
|
|
|
127
245
|
# Record setup time
|
|
128
246
|
end_setup = start_run = datetime.now()
|
|
@@ -159,12 +277,100 @@ def _launch_pipeline(
|
|
|
159
277
|
def run_pipeline(
|
|
160
278
|
ingest_config: PipelineCreationSchema,
|
|
161
279
|
block: bool = True,
|
|
162
|
-
disable_dynamic_scaling: bool = None,
|
|
163
|
-
dynamic_memory_threshold: float = None,
|
|
164
|
-
|
|
165
|
-
|
|
280
|
+
disable_dynamic_scaling: Optional[bool] = None,
|
|
281
|
+
dynamic_memory_threshold: Optional[float] = None,
|
|
282
|
+
run_in_subprocess: bool = False,
|
|
283
|
+
stdout: Optional[TextIO] = None,
|
|
284
|
+
stderr: Optional[TextIO] = None,
|
|
285
|
+
) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
|
|
286
|
+
"""
|
|
287
|
+
Launch and manage a pipeline, optionally in a subprocess.
|
|
288
|
+
|
|
289
|
+
This function is the primary entry point for executing a Ray pipeline,
|
|
290
|
+
either within the current process or in a separate Python subprocess.
|
|
291
|
+
It supports synchronous blocking execution or non-blocking lifecycle management,
|
|
292
|
+
and allows redirection of output to specified file-like objects.
|
|
293
|
+
|
|
294
|
+
Parameters
|
|
295
|
+
----------
|
|
296
|
+
ingest_config : PipelineCreationSchema
|
|
297
|
+
The validated configuration object used to construct and launch the pipeline.
|
|
298
|
+
block : bool, default=True
|
|
299
|
+
If True, blocks until the pipeline completes.
|
|
300
|
+
If False, returns an interface to control the pipeline externally.
|
|
301
|
+
disable_dynamic_scaling : Optional[bool], default=None
|
|
302
|
+
If True, disables dynamic memory scaling. Overrides global configuration if set.
|
|
303
|
+
If None, uses the default or globally defined behavior.
|
|
304
|
+
dynamic_memory_threshold : Optional[float], default=None
|
|
305
|
+
The memory usage threshold (as a float between 0 and 1) that triggers autoscaling,
|
|
306
|
+
if dynamic scaling is enabled. Defaults to the globally configured value if None.
|
|
307
|
+
run_in_subprocess : bool, default=False
|
|
308
|
+
If True, launches the pipeline in a separate Python subprocess using `multiprocessing.Process`.
|
|
309
|
+
If False, runs the pipeline in the current process.
|
|
310
|
+
stdout : Optional[TextIO], default=None
|
|
311
|
+
Optional file-like stream to which subprocess stdout should be redirected.
|
|
312
|
+
If None, stdout is redirected to /dev/null.
|
|
313
|
+
stderr : Optional[TextIO], default=None
|
|
314
|
+
Optional file-like stream to which subprocess stderr should be redirected.
|
|
315
|
+
If None, stderr is redirected to /dev/null.
|
|
316
|
+
|
|
317
|
+
Returns
|
|
318
|
+
-------
|
|
319
|
+
Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]
|
|
320
|
+
- If run in-process with `block=True`: returns elapsed time in seconds (float).
|
|
321
|
+
- If run in-process with `block=False`: returns a `RayPipelineInterface`.
|
|
322
|
+
- If run in subprocess with `block=False`: returns a `RayPipelineSubprocessInterface`.
|
|
323
|
+
- If run in subprocess with `block=True`: returns 0.0.
|
|
324
|
+
|
|
325
|
+
Raises
|
|
326
|
+
------
|
|
327
|
+
RuntimeError
|
|
328
|
+
If the subprocess fails to start or exits with an error.
|
|
329
|
+
Exception
|
|
330
|
+
Any other exceptions raised during pipeline launch or configuration.
|
|
331
|
+
"""
|
|
332
|
+
if run_in_subprocess:
|
|
333
|
+
logger.info("Launching pipeline in Python subprocess using multiprocessing.")
|
|
334
|
+
|
|
335
|
+
ctx = multiprocessing.get_context("fork")
|
|
336
|
+
process = ctx.Process(
|
|
337
|
+
target=_run_pipeline_process,
|
|
338
|
+
args=(
|
|
339
|
+
ingest_config,
|
|
340
|
+
disable_dynamic_scaling,
|
|
341
|
+
dynamic_memory_threshold,
|
|
342
|
+
stdout, # raw_stdout
|
|
343
|
+
stderr, # raw_stderr
|
|
344
|
+
),
|
|
345
|
+
daemon=False,
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
process.start()
|
|
349
|
+
|
|
350
|
+
interface = RayPipelineSubprocessInterface(process)
|
|
351
|
+
|
|
352
|
+
if block:
|
|
353
|
+
start_time = time.time()
|
|
354
|
+
logger.info("Waiting for subprocess pipeline to complete...")
|
|
355
|
+
process.join()
|
|
356
|
+
logger.info("Pipeline subprocess completed.")
|
|
357
|
+
return time.time() - start_time
|
|
358
|
+
else:
|
|
359
|
+
logger.info(f"Pipeline subprocess started (PID={process.pid})")
|
|
360
|
+
atexit.register(lambda: kill_pipeline_process_group(process.pid))
|
|
361
|
+
|
|
362
|
+
return interface
|
|
363
|
+
|
|
364
|
+
# Run inline
|
|
365
|
+
pipeline, total_elapsed = _launch_pipeline(
|
|
366
|
+
ingest_config,
|
|
367
|
+
block=block,
|
|
368
|
+
disable_dynamic_scaling=disable_dynamic_scaling,
|
|
369
|
+
dynamic_memory_threshold=dynamic_memory_threshold,
|
|
370
|
+
)
|
|
166
371
|
|
|
167
372
|
if block:
|
|
168
373
|
logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
|
|
169
|
-
|
|
170
|
-
|
|
374
|
+
return total_elapsed
|
|
375
|
+
else:
|
|
376
|
+
return RayPipelineInterface(pipeline)
|
|
@@ -23,6 +23,7 @@ from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extract
|
|
|
23
23
|
from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
|
|
24
24
|
from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
|
|
25
25
|
from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
|
|
26
|
+
from nv_ingest.framework.orchestration.ray.stages.extractors.html_extractor import HtmlExtractorStage
|
|
26
27
|
|
|
27
28
|
from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
|
|
28
29
|
from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
|
|
@@ -49,6 +50,7 @@ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageCon
|
|
|
49
50
|
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
|
|
50
51
|
from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
|
|
51
52
|
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
|
|
53
|
+
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
|
|
52
54
|
from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
|
|
53
55
|
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
|
|
54
56
|
from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
|
|
@@ -383,6 +385,19 @@ def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_ext
|
|
|
383
385
|
return stage_name
|
|
384
386
|
|
|
385
387
|
|
|
388
|
+
def add_html_extractor_stage(pipeline, default_cpu_count, stage_name="html_extractor"):
|
|
389
|
+
|
|
390
|
+
pipeline.add_stage(
|
|
391
|
+
name=stage_name,
|
|
392
|
+
stage_actor=HtmlExtractorStage,
|
|
393
|
+
config=HtmlExtractorSchema(),
|
|
394
|
+
min_replicas=0,
|
|
395
|
+
max_replicas=int(max(1, (default_cpu_count // 14))), # 7% of available CPU cores
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
return stage_name
|
|
399
|
+
|
|
400
|
+
|
|
386
401
|
def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
|
|
387
402
|
_ = default_cpu_count # Placeholder for future use
|
|
388
403
|
otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
|
{nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest
|
|
3
|
-
Version: 2025.5.
|
|
3
|
+
Version: 2025.5.29.dev20250529
|
|
4
4
|
Summary: Python module for multimodal document ingestion
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -225,7 +225,7 @@ Requires-Dist: httpx>=0.28.1
|
|
|
225
225
|
Requires-Dist: isodate>=0.7.2
|
|
226
226
|
Requires-Dist: langdetect>=1.0.9
|
|
227
227
|
Requires-Dist: minio>=7.2.12
|
|
228
|
-
Requires-Dist: openai>=1.
|
|
228
|
+
Requires-Dist: openai>=1.82.0
|
|
229
229
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
230
230
|
Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
|
|
231
231
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
@@ -239,6 +239,7 @@ Requires-Dist: build>=1.2.2
|
|
|
239
239
|
Requires-Dist: python-docx>=1.1.2
|
|
240
240
|
Requires-Dist: python-dotenv>=1.0.1
|
|
241
241
|
Requires-Dist: python-pptx>=1.0.2
|
|
242
|
+
Requires-Dist: prometheus-client
|
|
242
243
|
Requires-Dist: torch==2.4.1
|
|
243
244
|
Requires-Dist: ray[all]>=2.37.0
|
|
244
245
|
Requires-Dist: redis>=5.2.1
|
|
@@ -255,9 +256,10 @@ Requires-Dist: uvicorn
|
|
|
255
256
|
Requires-Dist: pip
|
|
256
257
|
Requires-Dist: llama-index-embeddings-nvidia
|
|
257
258
|
Requires-Dist: opencv-python
|
|
258
|
-
Requires-Dist: pymilvus>=2.5.
|
|
259
|
+
Requires-Dist: pymilvus>=2.5.10
|
|
259
260
|
Requires-Dist: pymilvus[bulk_writer,model]
|
|
260
261
|
Requires-Dist: tritonclient
|
|
261
262
|
Requires-Dist: nvidia-riva-client>=2.18.0
|
|
262
263
|
Requires-Dist: unstructured-client
|
|
264
|
+
Requires-Dist: markitdown
|
|
263
265
|
Dynamic: license-file
|
{nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/RECORD
RENAMED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
nv_ingest/__init__.py,sha256=vJLPeuxiIHqbxXPJSu9qe3MS-GPavbOUExyRq83DxxM,895
|
|
2
2
|
nv_ingest/version.py,sha256=Y9gMjlV_tnRSE3JbmS1rWIfVppM974_g0k30MRF3IQM,1352
|
|
3
3
|
nv_ingest/api/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
4
|
-
nv_ingest/api/main.py,sha256=
|
|
4
|
+
nv_ingest/api/main.py,sha256=XE-p4lJp1E7CCDOB8ENtYFrf63Dtq2bzQiGxpRfL2LA,1603
|
|
5
5
|
nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
6
6
|
nv_ingest/api/v1/health.py,sha256=zqu-isMRjh4NveS4XWh5FaAZGPIlBVxpCOg3Uu8nUHQ,4746
|
|
7
7
|
nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
|
|
8
|
+
nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
|
|
8
9
|
nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
9
10
|
nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
10
11
|
nv_ingest/framework/orchestration/ray/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
@@ -19,14 +20,15 @@ nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py,sha25
|
|
|
19
20
|
nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
20
21
|
nv_ingest/framework/orchestration/ray/primitives/dataclasses.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
22
|
nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py,sha256=L8ENPiF-lxqhIXVEQwQD5CCqQMb710ynj5D_Y4ixGhs,11077
|
|
22
|
-
nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=
|
|
23
|
-
nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=
|
|
23
|
+
nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=gc9gZNqPmnP76M-u8sQXyJd5aTSlyY_0CjLYNa-zvzk,29106
|
|
24
|
+
nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=BEBLjkYFXIH396EUQcfuxhrWlIMs9i6z7YfeeqJ5cZg,59579
|
|
24
25
|
nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=yPIvOhxY42P-gf5dLkcPkfvfwL_I-ay0C8k5eNaU-VA,15811
|
|
25
26
|
nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
26
27
|
nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
27
28
|
nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=KV4hvY0NTGG8CjZviTgcFLQzaH8WJJGkkb9PFYbROww,3417
|
|
28
29
|
nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py,sha256=tydluNNXfZYSo-0eqqafB59icF3SaeLXWcMrZ6OzlyQ,3998
|
|
29
30
|
nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py,sha256=tSa3Z4vK6sYJ6RBNMa7_FiuOwUaDUl0rTJ6agGbI5y0,3426
|
|
31
|
+
nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py,sha256=fyr0oXokhuaGQrNu5rKyH_qNMD12AS1xPDxKgA26YHE,3426
|
|
30
32
|
nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py,sha256=c-qlLGSizLOgKqH7wl_c8dGOVKYxLtXhZEHLXil4Jc4,3734
|
|
31
33
|
nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py,sha256=dmgvzGMxVX81g7TpZO1ACnRh7sdtpc7YX5KK2QW26U4,2565
|
|
32
34
|
nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=BUVuYOCGyPdPpacVhL5rnvA56hydnBip7tPaWTXaT1c,4650
|
|
@@ -46,7 +48,7 @@ nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py,sha256=wQSlVx3T14
|
|
|
46
48
|
nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py,sha256=0SQHJlFuXlP16YRWduX1fMKgjhUd7UhDAWQ8XZh4_0I,1471
|
|
47
49
|
nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py,sha256=enylryvcPmzirpOjCahqYJbNSLsNvv1KpMnOzGqNZQQ,11509
|
|
48
50
|
nv_ingest/framework/orchestration/ray/stages/sources/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
49
|
-
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=
|
|
51
|
+
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=srDsgp8ExMHZNI76ch3iX7S0drMXmQ3NkWC_udnwqmo,20286
|
|
50
52
|
nv_ingest/framework/orchestration/ray/stages/storage/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
51
53
|
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=6NkwQzseAnaj0Ptpr3oKvab2EnJdMwTjI2p4dS_HzsI,3901
|
|
52
54
|
nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py,sha256=SMLHQElZkKldnjy0_VHIKS65DBAAtOhwhdoaFe1yb9I,3337
|
|
@@ -62,10 +64,10 @@ nv_ingest/framework/orchestration/ray/stages/utility/__init__.py,sha256=wQSlVx3T
|
|
|
62
64
|
nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py,sha256=MB27CkoNeuirN6CUHgjsC5Wh958NF7m_N7HE4VKfx3k,2264
|
|
63
65
|
nv_ingest/framework/orchestration/ray/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
64
66
|
nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
65
|
-
nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py,sha256=
|
|
66
|
-
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py,sha256=
|
|
67
|
-
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=
|
|
68
|
-
nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=
|
|
67
|
+
nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py,sha256=AWyCFPP41vp1NOkO2urqm7vh-sTGKypJxwhdq8HxK6Q,50681
|
|
68
|
+
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py,sha256=jMYnVe_0rb1OIO9mlB4LH3uXtgaXBbUG-rDPx6fe6J8,10456
|
|
69
|
+
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=3aSYSxyunm-eKUYErDArQTHXSoNKlNJMUr9o5Ui6VTk,14037
|
|
70
|
+
nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=_MPUbOVTo9CjkBdDA--mcpu2plQ9qFY_TCBXbfpbB_A,21477
|
|
69
71
|
nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
70
72
|
nv_ingest/framework/orchestration/ray/util/system_tools/memory.py,sha256=ICqY0LLB3hFTZk03iX5yffMSKFH2q_aQomtDVzS_mKw,2228
|
|
71
73
|
nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py,sha256=2oHZdO_3L1LGuzpyNmZBDh19n0E-APAaHk4MEwBwSHs,12895
|
|
@@ -93,8 +95,8 @@ nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-
|
|
|
93
95
|
nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
|
|
94
96
|
nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
95
97
|
nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
|
|
96
|
-
nv_ingest-2025.5.
|
|
97
|
-
nv_ingest-2025.5.
|
|
98
|
-
nv_ingest-2025.5.
|
|
99
|
-
nv_ingest-2025.5.
|
|
100
|
-
nv_ingest-2025.5.
|
|
98
|
+
nv_ingest-2025.5.29.dev20250529.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
99
|
+
nv_ingest-2025.5.29.dev20250529.dist-info/METADATA,sha256=zMIjMLHJLCUg8DdH5oZUIyIK4BVkIZx6U7iQfE9TdxM,15142
|
|
100
|
+
nv_ingest-2025.5.29.dev20250529.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
101
|
+
nv_ingest-2025.5.29.dev20250529.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
|
|
102
|
+
nv_ingest-2025.5.29.dev20250529.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|