nv-ingest 2025.5.21.dev20250521__py3-none-any.whl → 2025.5.22.dev20250522__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +87 -69
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +1 -1
- {nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/METADATA +2 -2
- {nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/RECORD +7 -7
- {nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py
CHANGED
@@ -91,6 +91,8 @@ class RayPipeline:
         # --- State ---
         # self.scaling_state: Dict[str, str] = {}
         self.prev_global_memory_usage: Optional[int] = None
+        self._state_lock: threading.Lock = threading.Lock()
+        self._stopping = False
 
         # --- Build Time Config & State ---
         # Use scaling_config for these
@@ -711,8 +713,8 @@ class RayPipeline:
 
     def _execute_queue_flush(self) -> bool:
         """Executes queue flush, using topology for state and structure."""
-        if self.topology.get_is_flushing():  # Check topology state
-            logger.warning("Queue flush requested but already in progress. Ignoring.")
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology state
+            logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
             return False
 
         # Set flushing state in topology
@@ -853,8 +855,9 @@ class RayPipeline:
     def request_queue_flush(self, force: bool = False) -> None:
         """Requests a queue flush, checking topology state."""
         logger.info(f"Manual queue flush requested (force={force}).")
-
-
+
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology
+            logger.warning("Flush already in progress or pipeline is stopping.")
             return
         if force or self._is_pipeline_quiet():
             # Consider running _execute_queue_flush in a separate thread
@@ -1020,65 +1023,76 @@ class RayPipeline:
 
     def _perform_scaling_and_maintenance(self) -> None:
         """Orchestrates scaling/maintenance using topology and stats collector."""
-
+
+        if self._stopping:
+            logger.debug("Pipeline is stopping. Skipping scaling cycle.")
+            return
 
         if not self.dynamic_memory_scaling:
             logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
             return
 
-        cycle_start_time = time.time()
-
-        # Check flushing state via topology
         if self.topology.get_is_flushing():
             logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
            return
 
-
+        got_lock = self._state_lock.acquire(timeout=0.1)
+        if not got_lock:
+            logger.debug("Could not acquire lock for maintenance; skipping cycle.")
+            return
+
+        cycle_start_time = time.time()
         try:
+            if self._stopping:
+                logger.debug("Pipeline began stopping after acquiring lock. Skipping maintenance logic.")
+                return
+
+            logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
             if self._is_pipeline_quiet():
                 logger.info("Pipeline quiet, initiating queue flush.")
-                flush_success = self._execute_queue_flush()
+                flush_success = self._execute_queue_flush()
                 logger.info(f"Automatic queue flush completed. Success: {flush_success}")
-                return
-        except Exception as e:
-            logger.error(f"Error during quiet check or flush: {e}. Skipping cycle.", exc_info=True)
-            return
+                return
 
-
-
-
-            )
+            # Fast return check if stopping occurred while flushing or checking flush status
+            if self._stopping:
+                return
 
-
-
-            if not current_stage_stats or not stats_were_successful or last_update_age > max_stats_age_for_scaling:
-                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
-                logger.warning(
-                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+            current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
+                self.stats_collector.get_latest_stats()
             )
-                return
 
-
-
-
-
-
+            last_update_age = time.time() - last_update_time
+            max_age = max(15.0, self._stats_collection_interval_seconds)
+            if not current_stage_stats or not stats_were_successful or last_update_age > max_age:
+                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
+                logger.warning(
+                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+                )
+                return
 
-
-
+            current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
+            if not current_stage_metrics:
+                logger.error("[Scaling] Failed to gather metrics. Skipping.")
+                return
 
-
-
-
-
+            current_global_memory_mb = self._get_current_global_memory()
+            final_adjustments = self._calculate_scaling_adjustments(
+                current_stage_metrics, global_in_flight, current_global_memory_mb
+            )
+            self.prev_global_memory_usage = current_global_memory_mb
+            self._apply_scaling_actions(final_adjustments)
 
-
-
+            logger.debug(
+                f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---"
+            )
 
-
-
+        except Exception as e:  # noqa
+            logger.error("Exception during maintenance cycle", exc_info=True)
 
-
+        finally:
+            self._state_lock.release()
 
     # --- Lifecycle Methods for Monitoring/Scaling Threads ---
     def _scaling_loop(self, interval: float) -> None:
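The core of this hunk is a guarded maintenance cycle: a `_stopping` flag checked before and after a non-blocking `_state_lock.acquire(timeout=0.1)`, with the lock always released in `finally`. The standalone sketch below (not nv-ingest code; the `Worker` class and its method names are illustrative only) shows the same pattern in isolation.

import threading
import time


class Worker:
    """Illustrative stand-in for a pipeline that runs periodic maintenance."""

    def __init__(self) -> None:
        self._state_lock = threading.Lock()
        self._stopping = False

    def maintenance_cycle(self) -> None:
        if self._stopping:
            return  # cheap pre-check before touching the lock
        got_lock = self._state_lock.acquire(timeout=0.1)
        if not got_lock:
            return  # another thread (e.g. stop()) holds the lock; skip this cycle
        try:
            if self._stopping:
                return  # stop() may have started while we waited for the lock
            time.sleep(0.01)  # placeholder for real scaling/maintenance work
        finally:
            self._state_lock.release()  # guaranteed release, even on early return

    def stop(self) -> None:
        if self._stopping:
            return  # double-stop guard
        self._stopping = True
        with self._state_lock:
            pass  # teardown runs only after any in-flight cycle finishes


if __name__ == "__main__":
    w = Worker()
    w.maintenance_cycle()
    w.stop()

Taking the lock with a short timeout (rather than blocking) means a maintenance tick can simply skip a cycle instead of stalling behind a shutdown that is already holding the lock.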
@@ -1149,39 +1163,43 @@ class RayPipeline:
         """Stops background threads and actors (via topology)."""
         logger.info("Stopping pipeline...")
 
+        if self._stopping:
+            return
+        self._stopping = True
+
         # 1. Stop background threads first
-        self.
-
+        with self._state_lock:
+            self._stop_scaling()
+            self.stats_collector.stop()
 
-
-
-
-        actors_to_kill = []
+            # 2. Stop actors (using topology)
+            logger.debug("Stopping all stage actors...")
+            stop_refs_map: Dict[ray.ObjectRef, Any] = {}
 
-
-
+            # Get actors snapshot from topology
+            current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
 
-
-
+            for stage_name, actors in current_actors.items():
+                for actor in actors:
+                    try:
+                        stop_refs_map[actor.stop.remote()] = actor
+                    except Exception as e:
+                        logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")
+
+            if stop_refs_map:
+                stop_refs = list(stop_refs_map.keys())
+                logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
                 try:
-
+                    ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
+                    if not_ready:
+                        logger.warning(
+                            f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
+                        )
+                    logger.info(f"{len(ready)} actors stopped via stop().")
                 except Exception as e:
-                    logger.
+                    logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
 
-
-
-            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
-            try:
-                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
-                if not_ready:
-                    logger.warning(f"Timeout waiting for {len(not_ready)} actors to stop. Will kill.")
-                    actors_to_kill.extend(stop_refs_map.get(ref) for ref in not_ready if stop_refs_map.get(ref))
-                logger.info(f"{len(ready)} actors stopped via stop().")
-            except Exception as e:
-                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
-                actors_to_kill.extend(a for a in stop_refs_map.values() if a not in actors_to_kill)  # Add all on error
-
-        # Clear runtime state in topology
-        self.topology.clear_runtime_state()
+            # Clear runtime state in topology
+            self.topology.clear_runtime_state()
 
-
+        logger.info("Pipeline stopped.")
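The rewritten stop path asks every stage actor to shut down via `stop()` and then waits on the returned object refs with `ray.wait(..., timeout=60.0)`, logging stragglers rather than force-killing them. Below is a minimal standalone sketch of that shutdown pattern, assuming a toy `Stage` actor (illustrative only, not the nv-ingest stage API).

import ray


@ray.remote
class Stage:
    """Toy actor standing in for a pipeline stage."""

    def stop(self) -> bool:
        # a real stage would drain queues / flush state here
        return True


if __name__ == "__main__":
    ray.init()
    actors = [Stage.remote() for _ in range(3)]

    # Fire stop() on every actor, keeping the ObjectRefs so we can wait on all of them.
    stop_refs = [a.stop.remote() for a in actors]

    # Wait for every stop() to finish, but bound the wait with a timeout.
    ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
    if not_ready:
        print(f"Timeout waiting for {len(not_ready)} actors to stop; letting Ray clean up.")
    print(f"{len(ready)} actors stopped gracefully.")
    ray.shutdown()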
nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py
CHANGED
@@ -285,7 +285,7 @@ class RayStatsCollector:
                 q_size_val = queue_actor.qsize()
                 queue_sizes[q_name] = int(q_size_val)
             except Exception as e:
-                logger.
+                logger.warning(f"[StatsCollectNow] Failed to get queue size for '{q_name}': {e}", exc_info=True)
                 queue_sizes[q_name] = 0
                 overall_success = False
 
{nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.5.21.dev20250521
+Version: 2025.5.22.dev20250522
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -231,7 +231,7 @@ Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
 Requires-Dist: pydantic>2.0.0
 Requires-Dist: pydantic-settings>2.0.0
-Requires-Dist: pypdfium2
+Requires-Dist: pypdfium2==4.30.1
 Requires-Dist: pytest>=8.0.2
 Requires-Dist: pytest-mock>=3.14.0
 Requires-Dist: pytest-cov>=6.0.0
{nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/RECORD
RENAMED
@@ -20,8 +20,8 @@ nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14Zg
 nv_ingest/framework/orchestration/ray/primitives/dataclasses.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py,sha256=L8ENPiF-lxqhIXVEQwQD5CCqQMb710ynj5D_Y4ixGhs,11077
 nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=MKwerM3saKAdXZDHXFb4nGSnnwr7rUcOZlDo5JxV45o,28441
-nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=
-nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=
+nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=CWRxtSCTLe4S42Asv2NCA1hDEoKeblQdCEOmSKOGS0U,56500
+nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=yPIvOhxY42P-gf5dLkcPkfvfwL_I-ay0C8k5eNaU-VA,15811
 nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=KV4hvY0NTGG8CjZviTgcFLQzaH8WJJGkkb9PFYbROww,3417
@@ -93,8 +93,8 @@ nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
 nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
-nv_ingest-2025.5.
-nv_ingest-2025.5.
-nv_ingest-2025.5.
-nv_ingest-2025.5.
-nv_ingest-2025.5.
+nv_ingest-2025.5.22.dev20250522.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.5.22.dev20250522.dist-info/METADATA,sha256=LUbvIScRcL85fCyLAHdBYw9M3MmBHRLOJAVf0ri3ZMc,15082
+nv_ingest-2025.5.22.dev20250522.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+nv_ingest-2025.5.22.dev20250522.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.5.22.dev20250522.dist-info/RECORD,,
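For reference, each RECORD entry above follows the wheel convention "path,sha256=<urlsafe base64 digest, no padding>,<size in bytes>". A small sketch of how such a line is computed (the path and file contents here are made up for illustration):

import base64
import hashlib


def record_line(path: str, data: bytes) -> str:
    # sha256 digest, urlsafe base64-encoded with the trailing '=' padding stripped
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"


if __name__ == "__main__":
    print(record_line("nv_ingest/example.py", b"print('hello')\n"))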
{nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/WHEEL
RENAMED
File without changes

{nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/licenses/LICENSE
RENAMED
File without changes

{nv_ingest-2025.5.21.dev20250521.dist-info → nv_ingest-2025.5.22.dev20250522.dist-info}/top_level.txt
RENAMED
File without changes