nv-ingest 2025.5.21.dev20250521__py3-none-any.whl → 2025.5.29.dev20250529__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

@@ -0,0 +1,82 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ import ray
9
+
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+ from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest_api.internal.extract.html.html_extractor import extract_markdown_from_html_internal
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
16
+ from nv_ingest_api.util.exception_handlers.decorators import (
17
+ nv_ingest_node_failure_try_except,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@ray.remote
class HtmlExtractorStage(RayActorStage):
    """
    A Ray actor stage that extracts text in markdown format from html content.

    It expects an IngestControlMessage containing a DataFrame with html content. It then:
      1. Removes the "extract" task from the message to obtain the task-specific configuration.
      2. Calls the html extraction logic (via extract_markdown_from_html_internal) using the stage configuration.
      3. Updates the message payload with the extracted text DataFrame and records the
         extraction info under the "html_extraction_info" metadata key.
    """

    def __init__(self, config: HtmlExtractorSchema) -> None:
        """
        Store the stage configuration.

        Parameters
        ----------
        config : HtmlExtractorSchema
            Configuration forwarded to the base stage and later passed to the extractor
            as ``extraction_config``.
        """
        super().__init__(config, log_to_stdout=False)
        # ``config`` is annotated as an HtmlExtractorSchema instance, so no validation
        # happens here; a plain attribute assignment needs no error handler.
        self.validated_config = config
        self._logger.info("HtmlExtractorStage configuration validated successfully.")

    # NOTE(review): the decorators below come from other modules; presumably they add tracing,
    # restrict processing to messages carrying an "extract" task with document_type "html",
    # and annotate failures instead of raising (raise_on_failure=False) -- confirm against
    # their definitions.
    @traceable("html_extractor")
    @filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
    @nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Process the control message by extracting content from html.

        Parameters
        ----------
        control_message : IngestControlMessage
            The message containing a DataFrame payload with html content.

        Returns
        -------
        IngestControlMessage
            The same message object, with its payload replaced by the extracted content
            and "html_extraction_info" metadata set.
        """
        self._logger.debug("HtmlExtractorStage.on_data: Starting html extraction process.")

        # Extract the DataFrame payload.
        df_ledger = control_message.payload()
        self._logger.debug("Extracted payload with %d rows.", len(df_ledger))

        # Remove the "extract" task from the message to obtain task-specific configuration.
        task_config = remove_task_by_type(control_message, "extract")
        self._logger.debug("Extracted task config: %s", task_config)

        # Perform html content extraction.
        new_df, extraction_info = extract_markdown_from_html_internal(
            df_extraction_ledger=df_ledger,
            task_config=task_config,
            extraction_config=self.validated_config,
            execution_trace_log=None,
        )

        # Update the message payload with the extracted text DataFrame.
        control_message.payload(new_df)
        control_message.set_metadata("html_extraction_info", extraction_info)

        return control_message
@@ -495,7 +495,7 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
495
495
  server.serve_forever()
496
496
 
497
497
  p = multiprocessing.Process(target=broker_server)
498
- p.daemon = True
498
+ p.daemon = False
499
499
  p.start()
500
500
  logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
501
501
 
@@ -490,7 +490,7 @@ class ResourceConstraintManager:
490
490
  final_proposals_this_step = {}
491
491
 
492
492
  if not room_to_scale_up_to_global_caps:
493
- logger.info(
493
+ logger.debug(
494
494
  "[ConstraintMgr-Proportional] Global scaling beyond effective minimums is RESTRICTED "
495
495
  "as SumOfEffectiveMins likely meets/exceeds a global Core/MaxReplica cap. "
496
496
  "Proposed increases from initial current values will be nullified."
@@ -502,7 +502,7 @@ class ResourceConstraintManager:
502
502
  if val_from_prior_phases > original_current_replicas:
503
503
  final_proposals_this_step[name] = original_current_replicas
504
504
  if val_from_prior_phases != original_current_replicas:
505
- logger.info(
505
+ logger.debug(
506
506
  f"[ConstraintMgr-{name}] Proportional: Scaling restricted. "
507
507
  f"Nullified proposed increase from {original_current_replicas} to {val_from_prior_phases}. "
508
508
  f"Setting to {original_current_replicas}."
@@ -618,7 +618,7 @@ class ResourceConstraintManager:
618
618
 
619
619
  # Apply reduction to the deltas
620
620
  if reduction_factor <= 0.001: # Epsilon for float
621
- logger.info(
621
+ logger.debug(
622
622
  f"[ConstraintMgr-Proportional] Scale-up beyond effective minimums fully constrained by global limits. "
623
623
  f"Reasons: {'; '.join(limiting_reasons) if limiting_reasons else 'None'}. "
624
624
  f"Final ReductionFactor={reduction_factor:.3f}."
@@ -637,7 +637,7 @@ class ResourceConstraintManager:
637
637
  )
638
638
 
639
639
  elif reduction_factor < 1.0:
640
- logger.info(
640
+ logger.debug(
641
641
  f"[ConstraintMgr-Proportional] Reducing requested scale-up (beyond effective_mins) by "
642
642
  f"factor {reduction_factor:.3f}. "
643
643
  f"Limiting Factors: {'; '.join(limiting_reasons)}."
@@ -654,7 +654,7 @@ class ResourceConstraintManager:
654
654
  f"-> FinalVal={final_value_for_stage}"
655
655
  )
656
656
  else: # reduction_factor is ~1.0, meaning full requested increase (above effective_mins) is allowed
657
- logger.info(
657
+ logger.debug(
658
658
  "[ConstraintMgr-Proportional] Full requested scale-up (beyond effective_mins) "
659
659
  "is permissible by global limits."
660
660
  )
@@ -713,7 +713,7 @@ class ResourceConstraintManager:
713
713
  target = max(1, min_r)
714
714
  final_target = min(target, max_r)
715
715
  if final_target > 0:
716
- logger.info(
716
+ logger.debug(
717
717
  f"[ConstraintMgr-{name}] Forcing minimum {final_target} replica due to global wake-up."
718
718
  )
719
719
  final_adjustments[name] = final_target
@@ -740,19 +740,19 @@ class ResourceConstraintManager:
740
740
  num_queue_actors = num_edges
741
741
  total_ray_components_for_info = final_stage_replicas_total + num_queue_actors
742
742
 
743
- logger.info("[ConstraintMgr] --- Final Decision & Constraint Summary ---")
743
+ logger.debug("[ConstraintMgr] --- Final Decision & Constraint Summary ---")
744
744
 
745
745
  # --- I. Overall Pipeline State ---
746
- logger.info(f"[ConstraintMgr] Pipeline Activity: {global_in_flight} tasks in-flight.")
747
- logger.info(f"[ConstraintMgr] Effective Min Replicas (Sum): {sum_of_effective_mins}")
748
- logger.info(
746
+ logger.debug(f"[ConstraintMgr] Pipeline Activity: {global_in_flight} tasks in-flight.")
747
+ logger.debug(f"[ConstraintMgr] Effective Min Replicas (Sum): {sum_of_effective_mins}")
748
+ logger.debug(
749
749
  f"[ConstraintMgr] └─ Global Scaling Beyond Mins Permitted? {can_globally_scale_beyond_effective_mins}"
750
750
  )
751
751
 
752
752
  # --- II. Final Component Counts ---
753
- logger.info(f"[ConstraintMgr] Final Stage Replicas: {final_stage_replicas_total} (Target for caps)")
754
- logger.info(f"[ConstraintMgr] Queue/Edge Actors : {num_queue_actors} (Informational)")
755
- logger.info(f"[ConstraintMgr] Total Ray Components: {total_ray_components_for_info} (Informational)")
753
+ logger.debug(f"[ConstraintMgr] Final Stage Replicas: {final_stage_replicas_total} (Target for caps)")
754
+ logger.debug(f"[ConstraintMgr] Queue/Edge Actors : {num_queue_actors} (Informational)")
755
+ logger.debug(f"[ConstraintMgr] Total Ray Components: {total_ray_components_for_info} (Informational)")
756
756
 
757
757
  # --- III. Resource Limits & Projected Usage (for Stages) ---
758
758
  # Configured Limits
@@ -762,18 +762,18 @@ class ResourceConstraintManager:
762
762
  )
763
763
  eff_mem_limit_str = f"{self.effective_memory_limit_mb:.1f}MB"
764
764
 
765
- logger.info("[ConstraintMgr] Global Limits (Stages):")
766
- logger.info(f"[ConstraintMgr] ├─ MaxTotalReplicas : {max_r_cfg_str}")
767
- logger.info(
765
+ logger.debug("[ConstraintMgr] Global Limits (Stages):")
766
+ logger.debug(f"[ConstraintMgr] ├─ MaxTotalReplicas : {max_r_cfg_str}")
767
+ logger.debug(
768
768
  f"[ConstraintMgr] ├─ CoreBasedRepLimit : {core_based_limit_str} "
769
769
  f"(System EffCores: {self.available_cores if self.available_cores is not None else 'N/A'})"
770
770
  )
771
- logger.info(f"[ConstraintMgr] └─ EffectiveMemLimit : {eff_mem_limit_str} ")
771
+ logger.debug(f"[ConstraintMgr] └─ EffectiveMemLimit : {eff_mem_limit_str} ")
772
772
 
773
773
  # Usage vs Limits
774
- logger.info("[ConstraintMgr] Projected Usage (Stages):")
775
- logger.info(f"[ConstraintMgr] ├─ Replicas : {final_stage_replicas_total}")
776
- logger.info(
774
+ logger.debug("[ConstraintMgr] Projected Usage (Stages):")
775
+ logger.debug(f"[ConstraintMgr] ├─ Replicas : {final_stage_replicas_total}")
776
+ logger.debug(
777
777
  f"[ConstraintMgr] └─ Memory : {projected_final_memory_mb:.1f}MB "
778
778
  f"(Current: {current_global_memory_usage_mb:.1f}MB)"
779
779
  )
@@ -815,20 +815,20 @@ class ResourceConstraintManager:
815
815
  )
816
816
  unexpected_breaches_details.append(f"MemoryLimit: {status_mem}")
817
817
 
818
- logger.info("[ConstraintMgr] Limit Adherence (Stages):")
819
- logger.info(f"[ConstraintMgr] ├─ MaxTotalReplicas : {status_max_r}")
820
- logger.info(f"[ConstraintMgr] ├─ CoreBasedRepLimit : {status_core_r}")
821
- logger.info(f"[ConstraintMgr] └─ EffectiveMemLimit : {status_mem}")
818
+ logger.debug("[ConstraintMgr] Limit Adherence (Stages):")
819
+ logger.debug(f"[ConstraintMgr] ├─ MaxTotalReplicas : {status_max_r}")
820
+ logger.debug(f"[ConstraintMgr] ├─ CoreBasedRepLimit : {status_core_r}")
821
+ logger.debug(f"[ConstraintMgr] └─ EffectiveMemLimit : {status_mem}")
822
822
 
823
823
  if unexpected_breaches_details:
824
- logger.warning(f"[ConstraintMgr] └─ UNEXPECTED BREACHES: {'; '.join(unexpected_breaches_details)}")
824
+ logger.debug(f"[ConstraintMgr] └─ UNEXPECTED BREACHES: {'; '.join(unexpected_breaches_details)}")
825
825
  else:
826
- logger.info("[ConstraintMgr] └─ All hard caps (beyond tolerated minimums/wake-up) appear respected.")
826
+ logger.debug("[ConstraintMgr] └─ All hard caps (beyond tolerated minimums/wake-up) appear respected.")
827
827
 
828
828
  # --- V. Final Decisions Per Stage ---
829
- logger.info("[ConstraintMgr] Final Decisions (Per Stage):")
829
+ logger.debug("[ConstraintMgr] Final Decisions (Per Stage):")
830
830
  if not final_adjustments:
831
- logger.info("[ConstraintMgr] └─ No stages to adjust.")
831
+ logger.debug("[ConstraintMgr] └─ No stages to adjust.")
832
832
  else:
833
833
  # Determine max stage name length for alignment
834
834
  max_name_len = 0
@@ -843,12 +843,12 @@ class ResourceConstraintManager:
843
843
  eff_min_str = f"(EffMin: {min_replicas if orig_prop else 'N/A'})"
844
844
 
845
845
  # Basic alignment, can be improved with more sophisticated padding
846
- logger.info(
846
+ logger.debug(
847
847
  f"[ConstraintMgr] └─ {stage_name:<{max_name_len}} : "
848
848
  f"{count:<3} {pid_proposed_str} {current_str} {eff_min_str}"
849
849
  )
850
850
 
851
- logger.info("[ConstraintMgr] --- Constraint Summary END ---")
851
+ logger.debug("[ConstraintMgr] --- Constraint Summary END ---")
852
852
 
853
853
  # --- Public Method ---
854
854
 
@@ -863,7 +863,7 @@ class ResourceConstraintManager:
863
863
  Applies all configured constraints to initial replica proposals.
864
864
  (Docstring from previous version is fine)
865
865
  """
866
- logger.info(
866
+ logger.debug(
867
867
  f"[ConstraintMgr] --- Applying Constraints START --- "
868
868
  f"GlobalInFlight={global_in_flight}, "
869
869
  f"CurrentGlobalMemMB={current_global_memory_usage_mb}, "
@@ -904,7 +904,7 @@ class ResourceConstraintManager:
904
904
  current_effective_mins[name] = eff_min
905
905
  sum_of_effective_mins += eff_min
906
906
 
907
- logger.info(
907
+ logger.debug(
908
908
  f"[ConstraintMgr] Calculated Effective Minimums: TotalSum={sum_of_effective_mins}. "
909
909
  # f"IndividualMins: {current_effective_mins}" # Can be verbose
910
910
  )
@@ -985,5 +985,5 @@ class ResourceConstraintManager:
985
985
  can_globally_scale_up_stages, # Pass this for context in logging
986
986
  )
987
987
 
988
- logger.info("[ConstraintMgr] --- Applying Constraints END ---")
988
+ logger.debug("[ConstraintMgr] --- Applying Constraints END ---")
989
989
  return final_adjustments
@@ -19,6 +19,7 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
19
19
  add_image_extractor_stage,
20
20
  add_docx_extractor_stage,
21
21
  add_audio_extractor_stage,
22
+ add_html_extractor_stage,
22
23
  add_image_dedup_stage,
23
24
  add_image_filter_stage,
24
25
  add_table_extractor_stage,
@@ -53,7 +54,7 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
53
54
  export_config_to_env(ingest_config)
54
55
 
55
56
  current_level = logging.getLogger().getEffectiveLevel()
56
- ray.init(
57
+ ray_context = ray.init(
57
58
  namespace="nv_ingest_ray",
58
59
  logging_level=current_level,
59
60
  ignore_reinit_error=True,
@@ -103,6 +104,7 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
103
104
  docx_extractor_stage_id = add_docx_extractor_stage(pipeline, default_cpu_count)
104
105
  pptx_extractor_stage_id = add_pptx_extractor_stage(pipeline, default_cpu_count)
105
106
  audio_extractor_stage_id = add_audio_extractor_stage(pipeline, default_cpu_count)
107
+ html_extractor_stage_id = add_html_extractor_stage(pipeline, default_cpu_count)
106
108
  ########################################################################################################
107
109
 
108
110
  ########################################################################################################
@@ -159,7 +161,8 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
159
161
  pipeline.make_edge(audio_extractor_stage_id, docx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
160
162
  pipeline.make_edge(docx_extractor_stage_id, pptx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
161
163
  pipeline.make_edge(pptx_extractor_stage_id, image_extractor_stage_id, queue_size=ingest_edge_buffer_size)
162
- pipeline.make_edge(image_extractor_stage_id, infographic_extraction_stage_id, queue_size=ingest_edge_buffer_size)
164
+ pipeline.make_edge(image_extractor_stage_id, html_extractor_stage_id, queue_size=ingest_edge_buffer_size)
165
+ pipeline.make_edge(html_extractor_stage_id, infographic_extraction_stage_id, queue_size=ingest_edge_buffer_size)
163
166
 
164
167
  ###### Primitive Extractors ########
165
168
  pipeline.make_edge(infographic_extraction_stage_id, table_extraction_stage_id, queue_size=ingest_edge_buffer_size)
@@ -193,3 +196,5 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
193
196
  # pipe.add_edge(sink_stage, otel_tracer_stage)
194
197
 
195
198
  # pipe.add_edge(otel_tracer_stage, completed_job_counter_stage)
199
+
200
+ return ray_context
@@ -2,16 +2,26 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ import atexit
5
6
  import logging
7
+ import multiprocessing
6
8
  import os
9
+ import signal
10
+ import sys
7
11
  import time
12
+ from ctypes import CDLL, c_int
8
13
  from datetime import datetime
9
- from typing import Union, Tuple
14
+ from typing import Union, Tuple, Optional, TextIO
10
15
 
11
16
  import ray
12
17
  from pydantic import BaseModel, ConfigDict
13
18
 
14
- from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline, ScalingConfig
19
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
20
+ RayPipeline,
21
+ ScalingConfig,
22
+ RayPipelineSubprocessInterface,
23
+ RayPipelineInterface,
24
+ )
15
25
  from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline
16
26
 
17
27
  logger = logging.getLogger(__name__)
@@ -33,6 +43,8 @@ class PipelineCreationSchema(BaseModel):
33
43
  including endpoints, API keys, and processing options.
34
44
  """
35
45
 
46
+ arrow_default_memory_pool: str = os.getenv("ARROW_DEFAULT_MEMORY_POOL", "system")
47
+
36
48
  # Audio processing settings
37
49
  audio_grpc_endpoint: str = os.getenv("AUDIO_GRPC_ENDPOINT", "grpc.nvcf.nvidia.com:443")
38
50
  audio_function_id: str = os.getenv("AUDIO_FUNCTION_ID", "1598d209-5e27-4d3c-8079-4751568b1081")
@@ -100,6 +112,112 @@ class PipelineCreationSchema(BaseModel):
100
112
  model_config = ConfigDict(extra="forbid")
101
113
 
102
114
 
115
def redirect_os_fds(stdout: Optional[TextIO] = None, stderr: Optional[TextIO] = None):
    """
    Redirect OS-level stdout (fd=1) and stderr (fd=2) to the given file-like objects,
    or to /dev/null if not provided.

    The temporary /dev/null descriptor is closed once it has been duplicated onto the
    target descriptors, so repeated calls do not leak file descriptors.

    Parameters
    ----------
    stdout : Optional[TextIO]
        Stream to receive OS-level stdout. Must expose ``fileno()``.
        If None, redirected to /dev/null.
    stderr : Optional[TextIO]
        Stream to receive OS-level stderr. Must expose ``fileno()``.
        If None, redirected to /dev/null.
    """
    # Only open /dev/null when at least one stream actually needs it.
    needs_devnull = stdout is None or stderr is None
    devnull_fd = os.open(os.devnull, os.O_WRONLY) if needs_devnull else None

    try:
        for target_fd, stream in ((1, stdout), (2, stderr)):
            source_fd = stream.fileno() if stream is not None else devnull_fd
            os.dup2(source_fd, target_fd)
    finally:
        # dup2 made independent copies on fd 1/2, so the scratch descriptor can go.
        # If fd 1 or 2 was closed beforehand, os.open may have returned that very
        # number; closing it would undo the redirection, hence the "> 2" guard.
        if devnull_fd is not None and devnull_fd > 2:
            os.close(devnull_fd)
138
+
139
+
140
def set_pdeathsig(sig=signal.SIGKILL):
    """
    Register a parent-death signal for the calling process via ``prctl``.

    Loads "libc.so.6" through ctypes and calls ``prctl(PR_SET_PDEATHSIG, sig)``.
    The return value of ``prctl`` is not inspected.

    Parameters
    ----------
    sig : int, default=signal.SIGKILL
        Signal number handed to ``prctl`` as the option argument.
    """
    # 1 is the option number the original code names PR_SET_PDEATHSIG.
    pdeathsig_option = 1
    c_library = CDLL("libc.so.6")
    c_library.prctl(pdeathsig_option, c_int(sig))
144
+
145
+
146
def kill_pipeline_process_group(pid: int):
    """
    Kill the process group associated with the given PID, if it exists and is alive.

    If the target process still shares the caller's process group (for example because
    it has not yet moved itself into a new session), only the target PID is killed:
    signalling the whole group would SIGKILL the caller as well.

    All failures are reported with ``print`` and swallowed; this function never raises.

    Parameters
    ----------
    pid : int
        The PID of the process whose group should be killed.
    """
    try:
        # Get the process group ID
        pgid = os.getpgid(pid)

        if pgid == os.getpgrp():
            # Same group as us: killing the group would take this process down too.
            os.kill(pid, signal.SIGKILL)
            print(f"PID {pid} shares the caller's process group {pgid}; killed only PID {pid}.")
            return

        # killpg raises ProcessLookupError itself when the group is gone, so no
        # separate signal-0 liveness probe is needed.
        os.killpg(pgid, signal.SIGKILL)
        print(f"Killed subprocess group {pgid}")

    except ProcessLookupError:
        print(f"Process group for PID {pid} no longer exists.")
    except PermissionError:
        print(f"Permission denied to kill process group for PID {pid}.")
    except Exception as e:
        print(f"Failed to kill subprocess group: {e}")
172
+
173
+
174
def _run_pipeline_process(
    ingest_config: PipelineCreationSchema,
    disable_dynamic_scaling: Optional[bool],
    dynamic_memory_threshold: Optional[float],
    raw_stdout: Optional[TextIO] = None,
    raw_stderr: Optional[TextIO] = None,
):
    """
    Entry point executed inside the pipeline subprocess.

    Prepares the process (parent-death signal, new session, output redirection) and
    then runs the pipeline in blocking mode.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        Validated pipeline configuration.
    disable_dynamic_scaling : Optional[bool]
        Forwarded unchanged to ``_launch_pipeline``.
    dynamic_memory_threshold : Optional[float]
        Forwarded unchanged to ``_launch_pipeline``.
    raw_stdout : Optional[TextIO]
        Destination for stdout. Defaults to /dev/null.
    raw_stderr : Optional[TextIO]
        Destination for stderr. Defaults to /dev/null.

    Raises
    ------
    Exception
        Anything raised by ``_launch_pipeline`` is reported on ``sys.__stderr__`` and re-raised.
    """
    # Process setup: death signal first, then a fresh session so the whole
    # process group can later be SIGKILLed as a unit.
    set_pdeathsig()
    os.setsid()

    # OS-level descriptors (fd 1 / fd 2) ...
    redirect_os_fds(stdout=raw_stdout, stderr=raw_stderr)

    # ... and the Python-level stream objects.
    sys.stdout = raw_stdout if raw_stdout else open(os.devnull, "w")
    sys.stderr = raw_stderr if raw_stderr else open(os.devnull, "w")

    launch_options = {
        "block": True,
        "disable_dynamic_scaling": disable_dynamic_scaling,
        "dynamic_memory_threshold": dynamic_memory_threshold,
    }
    try:
        _launch_pipeline(ingest_config, **launch_options)
    except Exception as exc:
        sys.__stderr__.write(f"Subprocess pipeline run failed: {exc}\n")
        raise
219
+
220
+
103
221
  def _launch_pipeline(
104
222
  ingest_config: PipelineCreationSchema,
105
223
  block: bool,
@@ -122,7 +240,7 @@ def _launch_pipeline(
122
240
  start_abs = datetime.now()
123
241
 
124
242
  # Set up the ingestion pipeline
125
- setup_ingestion_pipeline(pipeline, ingest_config.model_dump())
243
+ _ = setup_ingestion_pipeline(pipeline, ingest_config.model_dump())
126
244
 
127
245
  # Record setup time
128
246
  end_setup = start_run = datetime.now()
@@ -159,12 +277,100 @@ def _launch_pipeline(
159
277
def run_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool = True,
    disable_dynamic_scaling: Optional[bool] = None,
    dynamic_memory_threshold: Optional[float] = None,
    run_in_subprocess: bool = False,
    stdout: Optional[TextIO] = None,
    stderr: Optional[TextIO] = None,
) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
    """
    Launch and manage a pipeline, optionally in a subprocess.

    This function is the primary entry point for executing a Ray pipeline,
    either within the current process or in a separate Python subprocess.
    It supports synchronous blocking execution or non-blocking lifecycle management,
    and allows redirection of subprocess output to specified file-like objects.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        The validated configuration object used to construct and launch the pipeline.
    block : bool, default=True
        If True, blocks until the pipeline completes.
        If False, returns an interface to control the pipeline externally.
    disable_dynamic_scaling : Optional[bool], default=None
        Forwarded to the pipeline launcher. If None, the launcher's default applies.
    dynamic_memory_threshold : Optional[float], default=None
        Forwarded to the pipeline launcher. If None, the launcher's default applies.
    run_in_subprocess : bool, default=False
        If True, launches the pipeline in a separate Python subprocess using a
        "fork" `multiprocessing` context.
        If False, runs the pipeline in the current process.
    stdout : Optional[TextIO], default=None
        Optional file-like stream to which subprocess stdout should be redirected.
        If None, subprocess stdout is redirected to /dev/null. Ignored when running in-process.
    stderr : Optional[TextIO], default=None
        Optional file-like stream to which subprocess stderr should be redirected.
        If None, subprocess stderr is redirected to /dev/null. Ignored when running in-process.

    Returns
    -------
    Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]
        - If run in-process with `block=True`: returns elapsed time in seconds (float).
        - If run in-process with `block=False`: returns a `RayPipelineInterface`.
        - If run in subprocess with `block=False`: returns a `RayPipelineSubprocessInterface`.
        - If run in subprocess with `block=True`: returns the wall-clock seconds spent
          waiting for the subprocess (float).

    Raises
    ------
    RuntimeError
        If the subprocess is run with `block=True` and exits with a non-zero exit code.
    Exception
        Any other exceptions raised during pipeline launch or configuration.
    """
    if run_in_subprocess:
        logger.info("Launching pipeline in Python subprocess using multiprocessing.")

        ctx = multiprocessing.get_context("fork")
        process = ctx.Process(
            target=_run_pipeline_process,
            args=(
                ingest_config,
                disable_dynamic_scaling,
                dynamic_memory_threshold,
                stdout,  # raw_stdout
                stderr,  # raw_stderr
            ),
            daemon=False,
        )

        process.start()

        if block:
            start_time = time.time()
            logger.info("Waiting for subprocess pipeline to complete...")
            process.join()
            elapsed = time.time() - start_time

            # A failed child must not be reported as a successful run.
            if process.exitcode != 0:
                raise RuntimeError(f"Pipeline subprocess exited with code {process.exitcode}.")

            logger.info("Pipeline subprocess completed.")
            return elapsed

        logger.info(f"Pipeline subprocess started (PID={process.pid})")
        # Make sure the subprocess group does not outlive this interpreter.
        atexit.register(kill_pipeline_process_group, process.pid)

        return RayPipelineSubprocessInterface(process)

    # Run inline
    pipeline, total_elapsed = _launch_pipeline(
        ingest_config,
        block=block,
        disable_dynamic_scaling=disable_dynamic_scaling,
        dynamic_memory_threshold=dynamic_memory_threshold,
    )

    if block:
        logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
        return total_elapsed

    return RayPipelineInterface(pipeline)
@@ -23,6 +23,7 @@ from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extract
23
23
  from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
24
24
  from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
25
25
  from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
26
+ from nv_ingest.framework.orchestration.ray.stages.extractors.html_extractor import HtmlExtractorStage
26
27
 
27
28
  from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
28
29
  from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
@@ -49,6 +50,7 @@ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageCon
49
50
  from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
50
51
  from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
51
52
  from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
53
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
52
54
  from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
53
55
  from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
54
56
  from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
@@ -383,6 +385,19 @@ def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_ext
383
385
  return stage_name
384
386
 
385
387
 
388
def add_html_extractor_stage(pipeline, default_cpu_count, stage_name="html_extractor"):
    """
    Register the HTML extractor stage on ``pipeline`` and return its name.

    The stage is added with a default ``HtmlExtractorSchema`` configuration,
    a minimum of zero replicas, and a replica cap derived from ``default_cpu_count``.

    Parameters
    ----------
    pipeline
        Object exposing ``add_stage``; the stage is registered on it.
    default_cpu_count
        CPU count used to size the replica cap.
    stage_name : str, default="html_extractor"
        Name under which the stage is registered.

    Returns
    -------
    str
        ``stage_name``, unchanged.
    """
    stage_config = HtmlExtractorSchema()

    # One replica per 14 cores (about 7% of available CPU cores), never fewer than one.
    replica_cap = int(max(1, (default_cpu_count // 14)))

    pipeline.add_stage(
        name=stage_name,
        stage_actor=HtmlExtractorStage,
        config=stage_config,
        min_replicas=0,
        max_replicas=replica_cap,
    )

    return stage_name
399
+
400
+
386
401
  def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
387
402
  _ = default_cpu_count # Placeholder for future use
388
403
  otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.5.21.dev20250521
3
+ Version: 2025.5.29.dev20250529
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -225,13 +225,13 @@ Requires-Dist: httpx>=0.28.1
225
225
  Requires-Dist: isodate>=0.7.2
226
226
  Requires-Dist: langdetect>=1.0.9
227
227
  Requires-Dist: minio>=7.2.12
228
- Requires-Dist: openai>=1.57.1
228
+ Requires-Dist: openai>=1.82.0
229
229
  Requires-Dist: opentelemetry-api>=1.27.0
230
230
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
231
  Requires-Dist: opentelemetry-sdk>=1.27.0
232
232
  Requires-Dist: pydantic>2.0.0
233
233
  Requires-Dist: pydantic-settings>2.0.0
234
- Requires-Dist: pypdfium2>=4.30.0
234
+ Requires-Dist: pypdfium2==4.30.1
235
235
  Requires-Dist: pytest>=8.0.2
236
236
  Requires-Dist: pytest-mock>=3.14.0
237
237
  Requires-Dist: pytest-cov>=6.0.0
@@ -239,6 +239,7 @@ Requires-Dist: build>=1.2.2
239
239
  Requires-Dist: python-docx>=1.1.2
240
240
  Requires-Dist: python-dotenv>=1.0.1
241
241
  Requires-Dist: python-pptx>=1.0.2
242
+ Requires-Dist: prometheus-client
242
243
  Requires-Dist: torch==2.4.1
243
244
  Requires-Dist: ray[all]>=2.37.0
244
245
  Requires-Dist: redis>=5.2.1
@@ -255,9 +256,10 @@ Requires-Dist: uvicorn
255
256
  Requires-Dist: pip
256
257
  Requires-Dist: llama-index-embeddings-nvidia
257
258
  Requires-Dist: opencv-python
258
- Requires-Dist: pymilvus>=2.5.0
259
+ Requires-Dist: pymilvus>=2.5.10
259
260
  Requires-Dist: pymilvus[bulk_writer,model]
260
261
  Requires-Dist: tritonclient
261
262
  Requires-Dist: nvidia-riva-client>=2.18.0
262
263
  Requires-Dist: unstructured-client
264
+ Requires-Dist: markitdown
263
265
  Dynamic: license-file