nv-ingest 2025.5.21.dev20250521__py3-none-any.whl → 2025.5.22.dev20250522__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -91,6 +91,8 @@ class RayPipeline:
         # --- State ---
         # self.scaling_state: Dict[str, str] = {}
         self.prev_global_memory_usage: Optional[int] = None
+        self._state_lock: threading.Lock = threading.Lock()
+        self._stopping = False
 
         # --- Build Time Config & State ---
         # Use scaling_config for these
@@ -711,8 +713,8 @@ class RayPipeline:
 
     def _execute_queue_flush(self) -> bool:
         """Executes queue flush, using topology for state and structure."""
-        if self.topology.get_is_flushing():  # Check topology state
-            logger.warning("Queue flush requested but already in progress. Ignoring.")
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology state
+            logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
            return False
 
         # Set flushing state in topology
@@ -853,8 +855,9 @@ class RayPipeline:
     def request_queue_flush(self, force: bool = False) -> None:
         """Requests a queue flush, checking topology state."""
         logger.info(f"Manual queue flush requested (force={force}).")
-        if self.topology.get_is_flushing():  # Check topology
-            logger.warning("Flush already in progress.")
+
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology
+            logger.warning("Flush already in progress or pipeline is stopping.")
            return
         if force or self._is_pipeline_quiet():
             # Consider running _execute_queue_flush in a separate thread
@@ -1020,65 +1023,76 @@ class RayPipeline:
 
     def _perform_scaling_and_maintenance(self) -> None:
         """Orchestrates scaling/maintenance using topology and stats collector."""
-        logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
+        if self._stopping:
+            logger.debug("Pipeline is stopping. Skipping scaling cycle.")
+            return
 
         if not self.dynamic_memory_scaling:
             logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
             return
 
-        cycle_start_time = time.time()
-
-        # Check flushing state via topology
         if self.topology.get_is_flushing():
             logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
             return
 
-        # --- Check for quietness for flushing (uses topology state via helper) ---
+        got_lock = self._state_lock.acquire(timeout=0.1)
+        if not got_lock:
+            logger.debug("Could not acquire lock for maintenance; skipping cycle.")
+            return
+
+        cycle_start_time = time.time()
         try:
+            if self._stopping:
+                logger.debug("Pipeline began stopping after acquiring lock. Skipping maintenance logic.")
+                return
+
+            logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
             if self._is_pipeline_quiet():
                 logger.info("Pipeline quiet, initiating queue flush.")
-                flush_success = self._execute_queue_flush()  # Uses topology internally
+                flush_success = self._execute_queue_flush()
                 logger.info(f"Automatic queue flush completed. Success: {flush_success}")
-                return  # Skip scaling if flush occurred
-        except Exception as e:
-            logger.error(f"Error during quiet check or flush: {e}. Skipping cycle.", exc_info=True)
-            return
+                return
 
-        # --- Get & Validate Stats ---
-        current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
-            self.stats_collector.get_latest_stats()
-        )
+            # Fast return check if stopping occurred while flushing or checking flush status
+            if self._stopping:
+                return
 
-        last_update_age = time.time() - last_update_time
-        max_stats_age_for_scaling = max(15.0, self._stats_collection_interval_seconds)
-        if not current_stage_stats or not stats_were_successful or last_update_age > max_stats_age_for_scaling:
-            status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
-            logger.warning(
-                f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+            current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
+                self.stats_collector.get_latest_stats()
             )
-            return
 
-        # --- Gather Metrics (uses topology via helper) ---
-        current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
-        if not current_stage_metrics:
-            logger.error("[Scaling] Failed gather metrics. Skipping.")
-            return
+            last_update_age = time.time() - last_update_time
+            max_age = max(15.0, self._stats_collection_interval_seconds)
+            if not current_stage_stats or not stats_were_successful or last_update_age > max_age:
+                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
+                logger.warning(
+                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+                )
+                return
 
-        # --- Get Memory Usage ---
-        current_global_memory_mb = self._get_current_global_memory()
+            current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
+            if not current_stage_metrics:
+                logger.error("[Scaling] Failed to gather metrics. Skipping.")
+                return
 
-        # --- Calculate Scaling Adjustments (uses topology via helper) ---
-        final_adjustments = self._calculate_scaling_adjustments(
-            current_stage_metrics, global_in_flight, current_global_memory_mb
-        )
+            current_global_memory_mb = self._get_current_global_memory()
+            final_adjustments = self._calculate_scaling_adjustments(
+                current_stage_metrics, global_in_flight, current_global_memory_mb
+            )
+            self.prev_global_memory_usage = current_global_memory_mb
+            self._apply_scaling_actions(final_adjustments)
 
-        # --- Update Memory Usage *After* Decision ---
-        self.prev_global_memory_usage = current_global_memory_mb
+            logger.debug(
+                f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---"
+            )
 
-        # --- Apply Scaling Actions (uses topology via helper) ---
-        self._apply_scaling_actions(final_adjustments)
+        except Exception as e:  # noqa
+            logger.error("Exception during maintenance cycle", exc_info=True)
 
-        logger.debug(f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---")
+        finally:
+            self._state_lock.release()
 
     # --- Lifecycle Methods for Monitoring/Scaling Threads ---
     def _scaling_loop(self, interval: float) -> None:
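
Note on the hunk above: the scaling/maintenance cycle in ray_pipeline.py is now serialized against shutdown. It returns early once _stopping is set, acquires _state_lock with a short timeout so it never blocks behind stop(), re-checks _stopping after acquiring the lock, and always releases the lock in a finally block. The snippet below is a minimal, self-contained sketch of that lock-plus-flag pattern only; the names MaintenanceGuardSketch, perform_maintenance, and _do_maintenance_work are illustrative stand-ins, not nv-ingest APIs, and the real pipeline work is stubbed out.

import threading
import time


class MaintenanceGuardSketch:
    """Illustration of the lock-plus-flag guard; pipeline internals are stubbed out."""

    def __init__(self) -> None:
        self._state_lock = threading.Lock()
        self._stopping = False

    def _do_maintenance_work(self) -> None:
        time.sleep(0.05)  # placeholder for the scaling / flush logic

    def perform_maintenance(self) -> None:
        # Cheap pre-check: skip the cycle entirely once shutdown has begun.
        if self._stopping:
            return
        # Short-timeout acquire keeps the maintenance thread from stalling
        # if stop() currently holds the lock.
        if not self._state_lock.acquire(timeout=0.1):
            return
        try:
            # Re-check after acquiring: stop() may have flipped the flag
            # while this thread was waiting on the lock.
            if self._stopping:
                return
            self._do_maintenance_work()
        finally:
            self._state_lock.release()

    def stop(self) -> None:
        if self._stopping:  # idempotent: a second call is a no-op
            return
        self._stopping = True  # new cycles bail out at the pre-check
        with self._state_lock:  # wait for any in-flight cycle to drain
            pass  # actor/thread teardown would happen here


if __name__ == "__main__":
    guard = MaintenanceGuardSketch()
    guard.perform_maintenance()
    guard.stop()

The post-acquire re-check matters because stop() can flip _stopping between the cheap pre-check and the moment the lock is actually obtained.
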
@@ -1149,39 +1163,43 @@ class RayPipeline:
         """Stops background threads and actors (via topology)."""
         logger.info("Stopping pipeline...")
 
+        if self._stopping:
+            return
+        self._stopping = True
+
         # 1. Stop background threads first
-        self._stop_scaling()
-        self.stats_collector.stop()
+        with self._state_lock:
+            self._stop_scaling()
+            self.stats_collector.stop()
 
-        # 2. Stop actors (using topology)
-        logger.debug("Stopping all stage actors...")
-        stop_refs_map: Dict[ray.ObjectRef, Any] = {}
-        actors_to_kill = []
+            # 2. Stop actors (using topology)
+            logger.debug("Stopping all stage actors...")
+            stop_refs_map: Dict[ray.ObjectRef, Any] = {}
 
-        # Get actors snapshot from topology
-        current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
+            # Get actors snapshot from topology
+            current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
 
-        for stage_name, actors in current_actors.items():
-            for actor in actors:
+            for stage_name, actors in current_actors.items():
+                for actor in actors:
+                    try:
+                        stop_refs_map[actor.stop.remote()] = actor
+                    except Exception as e:
+                        logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")
+
+            if stop_refs_map:
+                stop_refs = list(stop_refs_map.keys())
+                logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
                 try:
-                    stop_refs_map[actor.stop.remote()] = actor
+                    ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
+                    if not_ready:
+                        logger.warning(
+                            f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
+                        )
+                    logger.info(f"{len(ready)} actors stopped via stop().")
                 except Exception as e:
-                    logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Will kill.")
+                    logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
 
-        if stop_refs_map:
-            stop_refs = list(stop_refs_map.keys())
-            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
-            try:
-                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
-                if not_ready:
-                    logger.warning(f"Timeout waiting for {len(not_ready)} actors to stop. Will kill.")
-                    actors_to_kill.extend(stop_refs_map.get(ref) for ref in not_ready if stop_refs_map.get(ref))
-                logger.info(f"{len(ready)} actors stopped via stop().")
-            except Exception as e:
-                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
-                actors_to_kill.extend(a for a in stop_refs_map.values() if a not in actors_to_kill)  # Add all on error
-
-        # Clear runtime state in topology
-        self.topology.clear_runtime_state()
+            # Clear runtime state in topology
+            self.topology.clear_runtime_state()
 
-        logger.info("Pipeline stopped.")
+            logger.info("Pipeline stopped.")
@@ -285,7 +285,7 @@ class RayStatsCollector:
                 q_size_val = queue_actor.qsize()
                 queue_sizes[q_name] = int(q_size_val)
             except Exception as e:
-                logger.error(f"[StatsCollectNow] Failed to get queue size for '{q_name}': {e}", exc_info=True)
+                logger.warning(f"[StatsCollectNow] Failed to get queue size for '{q_name}': {e}", exc_info=True)
                 queue_sizes[q_name] = 0
                 overall_success = False
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.5.21.dev20250521
+Version: 2025.5.22.dev20250522
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -231,7 +231,7 @@ Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
 Requires-Dist: pydantic>2.0.0
 Requires-Dist: pydantic-settings>2.0.0
-Requires-Dist: pypdfium2>=4.30.0
+Requires-Dist: pypdfium2==4.30.1
 Requires-Dist: pytest>=8.0.2
 Requires-Dist: pytest-mock>=3.14.0
 Requires-Dist: pytest-cov>=6.0.0
@@ -20,8 +20,8 @@ nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14Zg
 nv_ingest/framework/orchestration/ray/primitives/dataclasses.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py,sha256=L8ENPiF-lxqhIXVEQwQD5CCqQMb710ynj5D_Y4ixGhs,11077
 nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=MKwerM3saKAdXZDHXFb4nGSnnwr7rUcOZlDo5JxV45o,28441
-nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=Qk67mp7tMwtnFOWnrCc7_TXdXqVAKwJPCUp5_5VQye8,56099
-nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=2gU853TkIB_wo9Nl3AMPDCZMeZF_iSJWO0va8vxzN-Y,15809
+nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=CWRxtSCTLe4S42Asv2NCA1hDEoKeblQdCEOmSKOGS0U,56500
+nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=yPIvOhxY42P-gf5dLkcPkfvfwL_I-ay0C8k5eNaU-VA,15811
 nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=KV4hvY0NTGG8CjZviTgcFLQzaH8WJJGkkb9PFYbROww,3417
@@ -93,8 +93,8 @@ nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
 nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
-nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.5.21.dev20250521.dist-info/METADATA,sha256=SB6sK4dXyR3py6ywwz5h1Elfd5gbKPYA97cQLkzmOSA,15082
-nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
-nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.5.21.dev20250521.dist-info/RECORD,,
+nv_ingest-2025.5.22.dev20250522.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.5.22.dev20250522.dist-info/METADATA,sha256=LUbvIScRcL85fCyLAHdBYw9M3MmBHRLOJAVf0ri3ZMc,15082
+nv_ingest-2025.5.22.dev20250522.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+nv_ingest-2025.5.22.dev20250522.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.5.22.dev20250522.dist-info/RECORD,,