nv-ingest 2025.8.16.dev20250816__py3-none-any.whl → 2025.11.21.dev20251121__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +43 -14
- nv_ingest/framework/orchestration/process/execution.py +92 -94
- nv_ingest/framework/orchestration/process/lifecycle.py +98 -6
- nv_ingest/framework/orchestration/process/strategies.py +41 -5
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +9 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +2 -3
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +46 -9
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +2 -1
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +5 -1
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +5 -1
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +4 -3
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/config/loaders.py +33 -2
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +514 -0
- nv_ingest/pipeline/default_pipeline_impl.py +111 -88
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/METADATA +4 -3
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/RECORD +38 -31
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Process termination utilities, isolated to avoid circular imports.
|
|
7
|
+
|
|
8
|
+
This module provides functions to terminate a process and its entire process
|
|
9
|
+
group safely, without depending on pipeline construction or Ray types.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import signal
|
|
15
|
+
import time
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _safe_log(level: int, msg: str) -> None:
    """Best-effort logging that won't emit handler tracebacks on closed streams.

    Temporarily sets ``logging.raiseExceptions`` to ``False`` so the logging
    module does not print "--- Logging error ---" to stderr if a handler's
    stream is already closed (common during process teardown). If no handler
    is reachable from the module logger, or the logging call itself fails, the
    message is written to ``sys.__stderr__`` instead, when available.

    Parameters
    ----------
    level : int
        A ``logging`` level constant such as ``logging.INFO``.
    msg : str
        The already-formatted message to emit.

    Notes
    -----
    This function never raises: every failure is deliberately swallowed so it
    stays safe to call while the process is being torn down.
    ``logging.raiseExceptions`` is a module-level flag, so it is switched off
    for the whole logging module (not just this logger) for the duration of
    the call and restored afterwards.
    """
    # True is the logging module's default for raiseExceptions; seeding the
    # variable here guarantees the ``finally`` clause always has a value.
    prev = True
    try:
        import logging as _logging

        prev = getattr(_logging, "raiseExceptions", True)
        # Suppress handler errors being printed to stderr
        _logging.raiseExceptions = False

        # hasHandlers() also looks at ancestor loggers. Checking only
        # ``logger.handlers`` misses handlers installed on the root/parent
        # logger (the usual configuration) and would send every message,
        # DEBUG included, straight to stderr.
        if logger.hasHandlers():
            logger.log(level, msg)
            return
    except Exception:
        # Intentionally ignore and try stderr fallback
        pass
    finally:
        try:
            import logging as _logging  # re-import safe even if earlier failed

            _logging.raiseExceptions = prev
        except Exception:
            pass

    # Fallback to stderr if available
    try:
        import sys

        if hasattr(sys, "__stderr__") and sys.__stderr__:
            sys.__stderr__.write(msg + "\n")
            sys.__stderr__.flush()
    except Exception:
        pass
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def kill_pipeline_process_group(process) -> None:
    """
    Kill a process and its entire process group.

    Accepts either a multiprocessing.Process-like object exposing a ``pid`` attribute
    or a raw PID integer. Sends SIGTERM to the process group first, and escalates
    to SIGKILL if it does not terminate within a short grace period.

    Parameters
    ----------
    process : multiprocessing.Process | int
        Process handle (or a raw PID int) for the process whose process group should be terminated.
        Booleans are rejected even though ``bool`` subclasses ``int``.

    Raises
    ------
    AttributeError
        If ``process`` is neither a PID int nor an object with a usable ``pid``.

    Notes
    -----
    A non-positive PID is never signalled: ``os.getpgid(0)`` resolves to the
    *caller's* process group, so acting on it would terminate the calling
    process. Such a PID is logged at WARNING level and the call returns.
    Lookup failures while signalling (process or group already gone) are
    logged at DEBUG level and are not raised.
    """
    proc: Optional[object] = None
    pid: Optional[int] = None

    # bool is a subclass of int; True/False must not be read as PID 1/0.
    if isinstance(process, int) and not isinstance(process, bool):
        pid = process
    elif hasattr(process, "pid"):
        proc = process
        try:
            pid = int(getattr(proc, "pid"))
        except Exception as e:
            # e.g. an unstarted Process whose pid is still None
            raise AttributeError(f"Invalid process-like object without usable pid: {e}") from e
    else:
        raise AttributeError(
            "kill_pipeline_process_group expects a multiprocessing.Process or a PID int (process-like object with .pid)"
        )

    if proc is not None and hasattr(proc, "is_alive") and not proc.is_alive():
        _safe_log(logging.DEBUG, "Process already terminated")
        return

    if pid is None:
        raise AttributeError("Unable to determine PID for process group termination")

    if pid <= 0:
        # getpgid(0) means "the calling process"; refuse rather than signal ourselves.
        _safe_log(logging.WARNING, f"Refusing to signal process group for non-positive PID {pid}")
        return

    _safe_log(logging.INFO, f"Terminating pipeline process group (PID: {pid})")

    try:
        # Send graceful termination to the entire process group
        try:
            pgid = os.getpgid(pid)
        except Exception:
            # Process already gone
            _safe_log(logging.DEBUG, f"Process group for PID {pid} not found during SIGTERM phase")
            return
        try:
            os.killpg(pgid, signal.SIGTERM)
        except ProcessLookupError:
            _safe_log(logging.DEBUG, f"Process group for PID {pid} no longer exists (SIGTERM)")
            return

        # If we have a Process handle, give it a chance to exit cleanly
        if proc is not None and hasattr(proc, "join"):
            try:
                proc.join(timeout=5.0)
            except Exception:
                pass
            # A handle without is_alive() is treated as still running so we escalate.
            still_alive = getattr(proc, "is_alive", lambda: True)()
        else:
            # Without a handle, provide a small grace period
            time.sleep(2.0)
            try:
                # Only the success/failure of the lookup matters here.
                os.getpgid(pid)
                still_alive = True
            except Exception:
                still_alive = False

        if still_alive:
            _safe_log(logging.WARNING, "Process group did not terminate gracefully, using SIGKILL")
            try:
                # Re-resolve the group id: the process may have exited since SIGTERM.
                try:
                    pgid2 = os.getpgid(pid)
                except Exception:
                    _safe_log(logging.DEBUG, f"Process group for PID {pid} vanished before SIGKILL")
                    return
                os.killpg(pgid2, signal.SIGKILL)
            finally:
                # Runs on every exit from the SIGKILL attempt, including the early return above.
                if proc is not None and hasattr(proc, "join"):
                    try:
                        proc.join(timeout=3.0)
                    except Exception:
                        pass

    except OSError as e:
        # ProcessLookupError is a subclass of OSError, so this covers both.
        _safe_log(logging.DEBUG, f"Process group already terminated or not found: {e}")
|
|
@@ -152,11 +152,11 @@ if __name__ == "__main__":
|
|
|
152
152
|
os.environ["OCR_MODEL_NAME"] = "paddle"
|
|
153
153
|
os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
154
154
|
os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
155
|
-
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/
|
|
155
|
+
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
|
|
156
156
|
logger.info("Environment variables set.")
|
|
157
157
|
|
|
158
158
|
image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
159
|
-
model_name = "nvidia/
|
|
159
|
+
model_name = "nvidia/nemotron-nano-12b-v2-vl"
|
|
160
160
|
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
161
161
|
(
|
|
162
162
|
yolox_table_structure_grpc,
|
|
@@ -3,8 +3,6 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import multiprocessing
|
|
6
|
-
import os
|
|
7
|
-
import signal
|
|
8
6
|
import threading
|
|
9
7
|
from abc import ABC, abstractmethod
|
|
10
8
|
from dataclasses import dataclass
|
|
@@ -22,6 +20,7 @@ import logging
|
|
|
22
20
|
import time
|
|
23
21
|
|
|
24
22
|
from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
|
|
23
|
+
from nv_ingest.framework.orchestration.process.termination import kill_pipeline_process_group
|
|
25
24
|
from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
|
|
26
25
|
from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
|
|
27
26
|
from nv_ingest.framework.orchestration.ray.util.pipeline.tools import wrap_callable_as_stage
|
|
@@ -120,24 +119,19 @@ class RayPipelineSubprocessInterface(PipelineInterface):
|
|
|
120
119
|
|
|
121
120
|
def stop(self) -> None:
|
|
122
121
|
"""
|
|
123
|
-
Stops the subprocess pipeline
|
|
122
|
+
Stops the subprocess pipeline and its entire process group to ensure
|
|
123
|
+
any child processes (e.g., the simple message broker) are terminated.
|
|
124
124
|
"""
|
|
125
|
-
|
|
125
|
+
try:
|
|
126
|
+
pid = int(self._process.pid)
|
|
127
|
+
except Exception:
|
|
126
128
|
return
|
|
127
129
|
|
|
130
|
+
# Always attempt to terminate the entire process group
|
|
128
131
|
try:
|
|
129
|
-
|
|
130
|
-
self._process.join(timeout=5.0)
|
|
132
|
+
kill_pipeline_process_group(pid)
|
|
131
133
|
except Exception as e:
|
|
132
|
-
logger.warning(f"
|
|
133
|
-
|
|
134
|
-
if self._process.is_alive():
|
|
135
|
-
try:
|
|
136
|
-
pgid = os.getpgid(self._process.pid)
|
|
137
|
-
os.killpg(pgid, signal.SIGKILL)
|
|
138
|
-
except Exception as e:
|
|
139
|
-
logger.error(f"Failed to force-kill process group: {e}")
|
|
140
|
-
self._process.join(timeout=3.0)
|
|
134
|
+
logger.warning(f"kill_pipeline_process_group failed: {e}")
|
|
141
135
|
|
|
142
136
|
|
|
143
137
|
class RayPipelineInterface(PipelineInterface):
|
|
@@ -5,7 +5,6 @@
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
from typing import Optional
|
|
8
|
-
|
|
9
8
|
import ray
|
|
10
9
|
|
|
11
10
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
@@ -17,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExt
|
|
|
17
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
17
|
nv_ingest_node_failure_try_except,
|
|
19
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
20
|
|
|
21
21
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
22
22
|
|
|
@@ -66,10 +66,9 @@ class AudioExtractorStage(RayActorStage):
|
|
|
66
66
|
# Extract the DataFrame payload.
|
|
67
67
|
df_ledger = control_message.payload()
|
|
68
68
|
self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
|
|
69
|
-
|
|
70
69
|
# Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
|
|
71
70
|
task_config = remove_task_by_type(control_message, "extract")
|
|
72
|
-
self._logger.debug("Extracted task config: %s", task_config)
|
|
71
|
+
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
73
72
|
|
|
74
73
|
# Perform audio text extraction.
|
|
75
74
|
new_df, extraction_info = extract_text_from_audio_internal(
|
|
@@ -13,8 +13,11 @@ from nv_ingest.framework.util.flow_control import filter_by_task
|
|
|
13
13
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
14
14
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
15
|
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
16
|
-
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
17
16
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
17
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
|
+
nv_ingest_node_failure_try_except,
|
|
19
|
+
)
|
|
20
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
18
21
|
|
|
19
22
|
logger = logging.getLogger(__name__)
|
|
20
23
|
|
|
@@ -66,7 +69,7 @@ class ChartExtractorStage(RayActorStage):
|
|
|
66
69
|
|
|
67
70
|
# Remove the "chart_data_extract" task to obtain task-specific configuration.
|
|
68
71
|
task_config = remove_task_by_type(control_message, "chart_data_extract")
|
|
69
|
-
logger.debug("ChartExtractorStage: Task config extracted: %s", task_config)
|
|
72
|
+
logger.debug("ChartExtractorStage: Task config extracted: %s", sanitize_for_logging(task_config))
|
|
70
73
|
|
|
71
74
|
# Perform chart data extraction.
|
|
72
75
|
execution_trace_log = {}
|
|
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtra
|
|
|
16
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
17
|
nv_ingest_node_failure_try_except,
|
|
18
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
20
|
|
|
20
21
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
21
22
|
|
|
@@ -68,7 +69,7 @@ class DocxExtractorStage(RayActorStage):
|
|
|
68
69
|
|
|
69
70
|
# Remove the "docx-extract" task from the message to obtain task-specific configuration.
|
|
70
71
|
task_config = remove_task_by_type(control_message, "extract")
|
|
71
|
-
self._logger.debug("Extracted task config: %s", task_config)
|
|
72
|
+
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
72
73
|
|
|
73
74
|
# Perform DOCX content extraction.
|
|
74
75
|
new_df, extraction_info = extract_primitives_from_docx_internal(
|
|
@@ -17,6 +17,7 @@ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtra
|
|
|
17
17
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
18
|
nv_ingest_node_failure_try_except,
|
|
19
19
|
)
|
|
20
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
21
|
|
|
21
22
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
22
23
|
|
|
@@ -69,7 +70,7 @@ class HtmlExtractorStage(RayActorStage):
|
|
|
69
70
|
|
|
70
71
|
# Remove the "html_content_extract" task from the message to obtain task-specific configuration.
|
|
71
72
|
task_config = remove_task_by_type(control_message, "extract")
|
|
72
|
-
self._logger.debug("Extracted task config: %s", task_config)
|
|
73
|
+
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
73
74
|
|
|
74
75
|
# Perform html content extraction.
|
|
75
76
|
new_df, extraction_info = extract_markdown_from_html_internal(
|
|
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExt
|
|
|
16
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
17
|
nv_ingest_node_failure_try_except,
|
|
18
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
20
|
|
|
20
21
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
21
22
|
|
|
@@ -68,7 +69,7 @@ class ImageExtractorStage(RayActorStage):
|
|
|
68
69
|
|
|
69
70
|
# Remove the "extract" task from the message to obtain task-specific configuration.
|
|
70
71
|
task_config = remove_task_by_type(control_message, "extract")
|
|
71
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
72
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
72
73
|
|
|
73
74
|
# Perform image primitives extraction.
|
|
74
75
|
new_df, extraction_info = extract_primitives_from_image_internal(
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import ray
|
|
7
|
+
|
|
8
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
9
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
10
|
+
from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
|
|
11
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
12
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
13
|
+
from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
|
|
14
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@ray.remote
class OCRExtractorStage(RayActorStage):
    """
    Ray actor stage that runs OCR text extraction over a message payload.

    Given an IngestControlMessage whose payload is a DataFrame, the stage:
      1. Removes the "ocr_data_extract" task from the message.
      2. Passes the payload, the task config and the stage config to
         ``extract_text_data_from_image_internal``.
      3. Stores the returned DataFrame back on the message as its payload and
         records the returned extraction info under "ocr_extraction_info".
    """

    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
        """Initialise the base stage and keep ``config`` as ``validated_config``."""
        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
        try:
            self.validated_config = config
            self._logger.info("OCRExtractorStage configuration validated successfully.")
        except Exception as exc:
            self._logger.exception(f"Error validating Text extractor config: {exc}")
            raise

    @nv_ingest_node_failure_try_except()
    @traceable()
    @udf_intercept_hook()
    @filter_by_task(required_tasks=["ocr_data_extract"])
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Run OCR extraction on the message payload and return the same message.

        An empty payload is returned untouched, before the task is removed.
        """
        ledger = control_message.payload()
        if ledger.empty:
            return control_message

        # Pull the "ocr_data_extract" task off the message; its value is the task config.
        ocr_task = remove_task_by_type(control_message, "ocr_data_extract")

        trace_log = {}
        extracted_df, info = extract_text_data_from_image_internal(
            df_extraction_ledger=ledger,
            task_config=ocr_task,
            extraction_config=self.validated_config,
            execution_trace_log=trace_log,
        )

        control_message.payload(extracted_df)
        control_message.set_metadata("ocr_extraction_info", info)

        # Trace timestamps are attached only when tagging is explicitly True
        # and the extraction call actually recorded something.
        tagging_enabled = control_message.get_metadata("config::add_trace_tagging") is True
        if not (tagging_enabled and trace_log):
            return control_message

        set_trace_timestamps_with_parent_context(
            control_message,
            trace_log,
            self.stage_name or "ocr_extractor",
            logger,
        )
        return control_message
|
|
@@ -15,7 +15,10 @@ from nv_ingest_api.internal.primitives.tracing.tagging import set_trace_timestam
|
|
|
15
15
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
16
16
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
17
17
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
-
from nv_ingest_api.util.exception_handlers.decorators import
|
|
18
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
19
|
+
nv_ingest_node_failure_try_except,
|
|
20
|
+
)
|
|
21
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
22
|
|
|
20
23
|
logger = logging.getLogger(__name__)
|
|
21
24
|
|
|
@@ -87,7 +90,7 @@ class PDFExtractorStage(RayActorStage):
|
|
|
87
90
|
|
|
88
91
|
# Remove the "extract" task from the message to obtain task-specific configuration.
|
|
89
92
|
task_config = remove_task_by_type(control_message, "extract")
|
|
90
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
93
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
91
94
|
|
|
92
95
|
# Perform PDF extraction.
|
|
93
96
|
execution_trace_log = {}
|
|
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExt
|
|
|
16
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
17
|
nv_ingest_node_failure_try_except,
|
|
18
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
21
22
|
|
|
@@ -65,7 +66,7 @@ class TableExtractorStage(RayActorStage):
|
|
|
65
66
|
|
|
66
67
|
# Remove the "table_data_extract" task to obtain task-specific configuration.
|
|
67
68
|
task_config = remove_task_by_type(control_message, "table_data_extract")
|
|
68
|
-
logger.debug("Extracted task configuration: %s", task_config)
|
|
69
|
+
logger.debug("Extracted task configuration: %s", sanitize_for_logging(task_config))
|
|
69
70
|
|
|
70
71
|
# Perform table data extraction.
|
|
71
72
|
execution_trace_log = {}
|
|
@@ -25,6 +25,7 @@ from nv_ingest_api.util.exception_handlers.decorators import (
|
|
|
25
25
|
nv_ingest_node_failure_try_except,
|
|
26
26
|
)
|
|
27
27
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
28
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
28
29
|
|
|
29
30
|
logger = logging.getLogger(__name__)
|
|
30
31
|
|
|
@@ -42,7 +43,7 @@ class MetadataInjectionStage(RayActorStage):
|
|
|
42
43
|
# Call the base initializer to set attributes like self._running.
|
|
43
44
|
super().__init__(config, stage_name=stage_name)
|
|
44
45
|
# Additional initialization can be added here if necessary.
|
|
45
|
-
self._logger.debug("MetadataInjectionStage initialized with config: %s", config)
|
|
46
|
+
self._logger.debug("MetadataInjectionStage initialized with config: %s", sanitize_for_logging(config))
|
|
46
47
|
|
|
47
48
|
@nv_ingest_node_failure_try_except()
|
|
48
49
|
@traceable()
|
|
@@ -18,6 +18,7 @@ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import Imag
|
|
|
18
18
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
19
19
|
nv_ingest_node_failure_try_except,
|
|
20
20
|
)
|
|
21
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
21
22
|
|
|
22
23
|
logger = logging.getLogger(__name__)
|
|
23
24
|
|
|
@@ -68,7 +69,7 @@ class ImageDedupStage(RayActorStage):
|
|
|
68
69
|
|
|
69
70
|
# Remove the "dedup" task from the message to obtain task-specific configuration.
|
|
70
71
|
task_config = remove_task_by_type(control_message, "dedup")
|
|
71
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
72
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
72
73
|
|
|
73
74
|
# Perform image deduplication.
|
|
74
75
|
new_df = deduplicate_images_internal(
|
|
@@ -17,6 +17,7 @@ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema impo
|
|
|
17
17
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
18
|
nv_ingest_node_failure_try_except,
|
|
19
19
|
)
|
|
20
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
21
|
|
|
21
22
|
logger = logging.getLogger(__name__)
|
|
22
23
|
|
|
@@ -67,7 +68,7 @@ class ImageFilterStage(RayActorStage):
|
|
|
67
68
|
|
|
68
69
|
# Remove the "filter" task from the message to obtain task-specific configuration.
|
|
69
70
|
task_config = remove_task_by_type(control_message, "filter")
|
|
70
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
71
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
71
72
|
|
|
72
73
|
task_params: Dict[str, Any] = task_config.get("params", {})
|
|
73
74
|
|
|
@@ -29,6 +29,8 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
|
|
|
29
29
|
# Import clients
|
|
30
30
|
from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
|
|
31
31
|
from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
|
|
32
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
33
|
+
from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler
|
|
32
34
|
|
|
33
35
|
logger = logging.getLogger(__name__)
|
|
34
36
|
|
|
@@ -88,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):
|
|
|
88
90
|
|
|
89
91
|
# Use the discriminated union for broker_client
|
|
90
92
|
broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
|
|
91
|
-
task_queue: str = Field(
|
|
92
|
-
|
|
93
|
+
task_queue: str = Field(
|
|
94
|
+
..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
|
|
95
|
+
)
|
|
96
|
+
poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")
|
|
93
97
|
|
|
94
98
|
|
|
95
99
|
@ray.remote
|
|
@@ -104,8 +108,11 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
104
108
|
def __init__(self, config: MessageBrokerTaskSourceConfig, stage_name: Optional[str] = None) -> None:
|
|
105
109
|
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
106
110
|
self.config: MessageBrokerTaskSourceConfig # Add a type hint for self.config
|
|
111
|
+
|
|
112
|
+
# Sanitize config before logging to avoid leaking secrets
|
|
113
|
+
_sanitized = sanitize_for_logging(config)
|
|
107
114
|
self._logger.debug(
|
|
108
|
-
"Initializing MessageBrokerTaskSourceStage with config: %s",
|
|
115
|
+
"Initializing MessageBrokerTaskSourceStage with config: %s", _sanitized
|
|
109
116
|
) # Log validated config
|
|
110
117
|
|
|
111
118
|
# Access validated configuration directly via self.config
|
|
@@ -130,7 +137,29 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
130
137
|
self._current_backoff_sleep: float = 0.0
|
|
131
138
|
self._last_backoff_log_time: float = 0.0
|
|
132
139
|
|
|
133
|
-
|
|
140
|
+
# Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
|
|
141
|
+
strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
|
|
142
|
+
self.scheduler = QosScheduler(
|
|
143
|
+
self.task_queue,
|
|
144
|
+
num_prefetch_threads=6, # one per category (no-op for simple strategy)
|
|
145
|
+
total_buffer_capacity=96, # e.g., ~16 per thread
|
|
146
|
+
prefetch_poll_interval=0.002, # faster polling for responsiveness
|
|
147
|
+
prefetch_non_immediate=True, # enable prefetch for non-immediate categories
|
|
148
|
+
strategy=strategy,
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
self._logger.info(
|
|
152
|
+
"MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
|
|
153
|
+
self.task_queue,
|
|
154
|
+
{
|
|
155
|
+
"immediate": f"{self.task_queue}_immediate",
|
|
156
|
+
"micro": f"{self.task_queue}_micro",
|
|
157
|
+
"small": f"{self.task_queue}_small",
|
|
158
|
+
"medium": f"{self.task_queue}_medium",
|
|
159
|
+
"large": f"{self.task_queue}_large",
|
|
160
|
+
"default": f"{self.task_queue}",
|
|
161
|
+
},
|
|
162
|
+
)
|
|
134
163
|
|
|
135
164
|
# --- Private helper methods ---
|
|
136
165
|
def _create_client(self):
|
|
@@ -261,14 +290,21 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
261
290
|
|
|
262
291
|
return control_message
|
|
263
292
|
|
|
264
|
-
def _fetch_message(self, timeout=
|
|
293
|
+
def _fetch_message(self, timeout=0):
|
|
265
294
|
"""
|
|
266
|
-
Fetch a message from the message broker.
|
|
295
|
+
Fetch a message from the message broker using fair scheduling across derived queues.
|
|
296
|
+
This is a non-blocking sweep across all queues for the current scheduling cycle. If no
|
|
297
|
+
message is found across any queue, return None so the caller can sleep briefly.
|
|
267
298
|
"""
|
|
268
299
|
try:
|
|
269
|
-
|
|
300
|
+
# Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
|
|
301
|
+
job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
|
|
270
302
|
if job is None:
|
|
271
|
-
self._logger.debug(
|
|
303
|
+
self._logger.debug(
|
|
304
|
+
"No message received from derived queues for base "
|
|
305
|
+
"'%s' (immediate, micro, small, medium, large, default)",
|
|
306
|
+
self.task_queue,
|
|
307
|
+
)
|
|
272
308
|
# Do not treat normal empty polls as failures
|
|
273
309
|
self._fetch_failure_count = 0
|
|
274
310
|
self._current_backoff_sleep = 0.0
|
|
@@ -332,7 +368,8 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
332
368
|
Instead of reading from an input edge, fetch a message from the broker.
|
|
333
369
|
"""
|
|
334
370
|
self._logger.debug("read_input: calling _fetch_message()")
|
|
335
|
-
|
|
371
|
+
# Perform a non-blocking sweep across all queues for this cycle
|
|
372
|
+
job = self._fetch_message(timeout=0)
|
|
336
373
|
if job is None:
|
|
337
374
|
# Sleep for either the configured poll interval or the current backoff, whichever is larger
|
|
338
375
|
sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
|
|
@@ -16,6 +16,7 @@ from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings
|
|
|
16
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
17
17
|
nv_ingest_node_failure_try_except,
|
|
18
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
19
20
|
|
|
20
21
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
21
22
|
|
|
@@ -68,7 +69,7 @@ class EmbeddingStorageStage(RayActorStage):
|
|
|
68
69
|
|
|
69
70
|
# Remove the "store_embedding" task from the message to obtain task-specific configuration.
|
|
70
71
|
task_config = remove_task_by_type(control_message, "store_embedding")
|
|
71
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
72
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
72
73
|
|
|
73
74
|
# Perform embedding storage.
|
|
74
75
|
new_df = store_text_embeddings_internal(
|
|
@@ -18,6 +18,7 @@ from nv_ingest_api.internal.transform.caption_image import transform_image_creat
|
|
|
18
18
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
19
19
|
nv_ingest_node_failure_try_except,
|
|
20
20
|
)
|
|
21
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
21
22
|
|
|
22
23
|
logger = logging.getLogger(__name__)
|
|
23
24
|
|
|
@@ -67,7 +68,10 @@ class ImageCaptionTransformStage(RayActorStage):
|
|
|
67
68
|
|
|
68
69
|
# Remove the "caption" task to obtain task-specific configuration.
|
|
69
70
|
task_config = remove_task_by_type(control_message, "caption")
|
|
70
|
-
logger.debug(
|
|
71
|
+
logger.debug(
|
|
72
|
+
"ImageCaptionTransformStage: Task configuration extracted: %s",
|
|
73
|
+
pprint.pformat(sanitize_for_logging(task_config)),
|
|
74
|
+
)
|
|
71
75
|
|
|
72
76
|
# Call the caption extraction function.
|
|
73
77
|
new_df = transform_image_create_vlm_caption_internal(
|
|
@@ -15,6 +15,7 @@ from nv_ingest_api.internal.transform.embed_text import transform_create_text_em
|
|
|
15
15
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
16
16
|
nv_ingest_node_failure_try_except,
|
|
17
17
|
)
|
|
18
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
18
19
|
|
|
19
20
|
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
20
21
|
|
|
@@ -62,7 +63,10 @@ class TextEmbeddingTransformStage(RayActorStage):
|
|
|
62
63
|
|
|
63
64
|
# Remove the "embed" task to obtain task-specific configuration.
|
|
64
65
|
task_config = remove_task_by_type(control_message, "embed")
|
|
65
|
-
self._logger.debug(
|
|
66
|
+
self._logger.debug(
|
|
67
|
+
"TextEmbeddingTransformStage: Task configuration extracted: %s",
|
|
68
|
+
pprint.pformat(sanitize_for_logging(task_config)),
|
|
69
|
+
)
|
|
66
70
|
|
|
67
71
|
# Call the text embedding extraction function.
|
|
68
72
|
new_df, execution_trace_log = transform_create_text_embeddings_internal(
|