nv-ingest 2025.8.4.dev20250804__py3-none-any.whl → 2025.12.10.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/__init__.py +6 -0
- nv_ingest/api/main.py +2 -0
- nv_ingest/api/tracing.py +82 -0
- nv_ingest/api/v2/README.md +203 -0
- nv_ingest/api/v2/__init__.py +3 -0
- nv_ingest/api/v2/ingest.py +1300 -0
- nv_ingest/framework/orchestration/execution/__init__.py +3 -0
- nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest/framework/orchestration/process/__init__.py +3 -0
- nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest/framework/orchestration/process/execution.py +495 -0
- nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest/framework/orchestration/process/termination.py +147 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +3 -3
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +79 -11
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -18
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
- nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +528 -0
- nv_ingest/pipeline/default_pipeline_impl.py +557 -0
- nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/METADATA +6 -3
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/RECORD +64 -43
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.8.4.dev20250804.dist-info → nv_ingest-2025.12.10.dev20251210.dist-info}/top_level.txt +0 -0
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
import ray
|
|
9
9
|
|
|
@@ -14,6 +14,8 @@ from nv_ingest.framework.util.telemetry.global_stats import GlobalStats
|
|
|
14
14
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
15
15
|
nv_ingest_node_failure_try_except,
|
|
16
16
|
)
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
17
19
|
|
|
18
20
|
# Import the JobCounter schema and global stats singleton.
|
|
19
21
|
|
|
@@ -30,15 +32,17 @@ class JobCounterStage(RayActorStage):
|
|
|
30
32
|
statistic each time it processes a message.
|
|
31
33
|
"""
|
|
32
34
|
|
|
33
|
-
def __init__(self, config: BaseModel) -> None:
|
|
35
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
|
|
34
36
|
# Ensure base attributes (e.g. self._running) are initialized.
|
|
35
|
-
super().__init__(config)
|
|
37
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
38
|
# The validated config should be a JobCounterSchema instance.
|
|
37
39
|
self.validated_config: JobCounterSchema = config
|
|
38
40
|
# Obtain the global stats' singleton.
|
|
39
41
|
self.stats = GlobalStats.get_instance()
|
|
40
42
|
|
|
41
|
-
@nv_ingest_node_failure_try_except(
|
|
43
|
+
@nv_ingest_node_failure_try_except()
|
|
44
|
+
@traceable()
|
|
45
|
+
@udf_intercept_hook()
|
|
42
46
|
async def on_data(self, message: Any) -> Any:
|
|
43
47
|
"""
|
|
44
48
|
Process an incoming IngestControlMessage by counting jobs.
|
|
@@ -24,6 +24,7 @@ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_fail
|
|
|
24
24
|
|
|
25
25
|
from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus
|
|
26
26
|
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
|
|
27
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
@ray.remote
|
|
@@ -35,8 +36,8 @@ class OpenTelemetryTracerStage(RayActorStage):
|
|
|
35
36
|
It creates spans for tasks and exports them to a configured OpenTelemetry endpoint.
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
|
-
def __init__(self, config: OpenTelemetryTracerSchema) -> None:
|
|
39
|
-
super().__init__(config)
|
|
39
|
+
def __init__(self, config: OpenTelemetryTracerSchema, stage_name: Optional[str] = None) -> None:
|
|
40
|
+
super().__init__(config, stage_name=stage_name)
|
|
40
41
|
|
|
41
42
|
# self._logger.info(f"[Telemetry] Initializing OpenTelemetry tracer stage with config: {config}")
|
|
42
43
|
|
|
@@ -81,7 +82,7 @@ class OpenTelemetryTracerStage(RayActorStage):
|
|
|
81
82
|
parent_ctx = trace.set_span_in_context(NonRecordingSpan(span_context))
|
|
82
83
|
parent_span = self.tracer.start_span(str(job_id), context=parent_ctx, start_time=start_time)
|
|
83
84
|
|
|
84
|
-
event_count = create_span_with_timestamps(self.tracer, parent_span, message)
|
|
85
|
+
event_count = create_span_with_timestamps(self.tracer, parent_span, message, self._logger)
|
|
85
86
|
|
|
86
87
|
if message.has_metadata("cm_failed") and message.get_metadata("cm_failed"):
|
|
87
88
|
parent_span.set_status(Status(StatusCode.ERROR))
|
|
@@ -96,7 +97,8 @@ class OpenTelemetryTracerStage(RayActorStage):
|
|
|
96
97
|
|
|
97
98
|
self._logger.debug(f"[Telemetry] Exported spans for message {job_id} with {event_count} total events.")
|
|
98
99
|
|
|
99
|
-
@nv_ingest_node_failure_try_except(
|
|
100
|
+
@nv_ingest_node_failure_try_except()
|
|
101
|
+
@udf_intercept_hook()
|
|
100
102
|
def on_data(self, control_message: IngestControlMessage) -> Optional[Any]:
|
|
101
103
|
try:
|
|
102
104
|
do_trace_tagging = bool(control_message.get_metadata("config::add_trace_tagging"))
|
|
@@ -160,7 +162,7 @@ def extract_annotated_task_results(message):
|
|
|
160
162
|
return task_results
|
|
161
163
|
|
|
162
164
|
|
|
163
|
-
def create_span_with_timestamps(tracer, parent_span, message) -> int:
|
|
165
|
+
def create_span_with_timestamps(tracer, parent_span, message, logger) -> int:
|
|
164
166
|
timestamps = extract_timestamps_from_message(message)
|
|
165
167
|
task_results = extract_annotated_task_results(message)
|
|
166
168
|
|
|
@@ -175,8 +177,16 @@ def create_span_with_timestamps(tracer, parent_span, message) -> int:
|
|
|
175
177
|
if not subtask:
|
|
176
178
|
span = tracer.start_span(main_task, context=child_ctx, start_time=ts_entry)
|
|
177
179
|
else:
|
|
178
|
-
|
|
179
|
-
|
|
180
|
+
# Check if parent context exists, otherwise create standalone span with warning
|
|
181
|
+
if main_task in ctx_store:
|
|
182
|
+
subtask_ctx = trace.set_span_in_context(ctx_store[main_task][0])
|
|
183
|
+
span = tracer.start_span(subtask, context=subtask_ctx, start_time=ts_entry)
|
|
184
|
+
else:
|
|
185
|
+
logger.warning(
|
|
186
|
+
f"Missing parent context for subtask '{subtask}'"
|
|
187
|
+
f" (expected parent: '{main_task}'). Creating standalone span."
|
|
188
|
+
)
|
|
189
|
+
span = tracer.start_span(f"{main_task}::{subtask}", context=child_ctx, start_time=ts_entry)
|
|
180
190
|
|
|
181
191
|
span.add_event("entry", timestamp=ts_entry)
|
|
182
192
|
span.add_event("exit", timestamp=ts_exit)
|
|
@@ -4,12 +4,13 @@
|
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import pprint
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Optional
|
|
8
8
|
|
|
9
9
|
import ray
|
|
10
10
|
|
|
11
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
12
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
13
14
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
14
15
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
16
|
from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
|
|
@@ -17,6 +18,7 @@ from nv_ingest_api.internal.transform.caption_image import transform_image_creat
|
|
|
17
18
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
19
|
nv_ingest_node_failure_try_except,
|
|
19
20
|
)
|
|
21
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
22
|
|
|
21
23
|
logger = logging.getLogger(__name__)
|
|
22
24
|
|
|
@@ -31,8 +33,8 @@ class ImageCaptionTransformStage(RayActorStage):
|
|
|
31
33
|
are stored in the control message.
|
|
32
34
|
"""
|
|
33
35
|
|
|
34
|
-
def __init__(self, config: ImageCaptionExtractionSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
36
|
+
def __init__(self, config: ImageCaptionExtractionSchema, stage_name: Optional[str] = None) -> None:
|
|
37
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
38
|
try:
|
|
37
39
|
self.validated_config = config
|
|
38
40
|
logger.info("ImageCaptionTransformStage configuration validated.")
|
|
@@ -40,9 +42,10 @@ class ImageCaptionTransformStage(RayActorStage):
|
|
|
40
42
|
logger.exception("Error validating caption extraction config")
|
|
41
43
|
raise e
|
|
42
44
|
|
|
43
|
-
@
|
|
45
|
+
@nv_ingest_node_failure_try_except()
|
|
46
|
+
@traceable()
|
|
47
|
+
@udf_intercept_hook()
|
|
44
48
|
@filter_by_task(required_tasks=["caption"])
|
|
45
|
-
@nv_ingest_node_failure_try_except(annotation_id="image_captioning", raise_on_failure=False)
|
|
46
49
|
def on_data(self, control_message: Any) -> Any:
|
|
47
50
|
"""
|
|
48
51
|
Process the control message by extracting image captions.
|
|
@@ -65,7 +68,10 @@ class ImageCaptionTransformStage(RayActorStage):
|
|
|
65
68
|
|
|
66
69
|
# Remove the "caption" task to obtain task-specific configuration.
|
|
67
70
|
task_config = remove_task_by_type(control_message, "caption")
|
|
68
|
-
logger.debug(
|
|
71
|
+
logger.debug(
|
|
72
|
+
"ImageCaptionTransformStage: Task configuration extracted: %s",
|
|
73
|
+
pprint.pformat(sanitize_for_logging(task_config)),
|
|
74
|
+
)
|
|
69
75
|
|
|
70
76
|
# Call the caption extraction function.
|
|
71
77
|
new_df = transform_image_create_vlm_caption_internal(
|
|
@@ -2,12 +2,10 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
import logging
|
|
6
5
|
import pprint
|
|
7
|
-
from typing import
|
|
6
|
+
from typing import Optional
|
|
8
7
|
import ray
|
|
9
8
|
|
|
10
|
-
# Assume these imports come from your project:
|
|
11
9
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
10
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
13
11
|
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
|
|
@@ -17,8 +15,9 @@ from nv_ingest_api.internal.transform.embed_text import transform_create_text_em
|
|
|
17
15
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
18
16
|
nv_ingest_node_failure_try_except,
|
|
19
17
|
)
|
|
18
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
19
|
|
|
21
|
-
|
|
20
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
22
21
|
|
|
23
22
|
|
|
24
23
|
@ray.remote
|
|
@@ -31,19 +30,20 @@ class TextEmbeddingTransformStage(RayActorStage):
|
|
|
31
30
|
trace or extraction metadata is added.
|
|
32
31
|
"""
|
|
33
32
|
|
|
34
|
-
def __init__(self, config: TextEmbeddingSchema) -> None:
|
|
35
|
-
super().__init__(config,
|
|
33
|
+
def __init__(self, config: TextEmbeddingSchema, stage_name: Optional[str] = None) -> None:
|
|
34
|
+
super().__init__(config, stage_name=stage_name)
|
|
36
35
|
try:
|
|
37
36
|
self.validated_config = config
|
|
38
|
-
|
|
37
|
+
self._logger.info("TextEmbeddingTransformStage configuration validated successfully.")
|
|
39
38
|
except Exception as e:
|
|
40
|
-
|
|
41
|
-
raise
|
|
39
|
+
self._logger.exception(f"Error validating text embedding config: {e}")
|
|
40
|
+
raise
|
|
42
41
|
|
|
43
|
-
@
|
|
42
|
+
@nv_ingest_node_failure_try_except()
|
|
43
|
+
@traceable()
|
|
44
|
+
@udf_intercept_hook()
|
|
44
45
|
@filter_by_task(required_tasks=["embed"])
|
|
45
|
-
|
|
46
|
-
def on_data(self, control_message: IngestControlMessage) -> Any:
|
|
46
|
+
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
47
|
"""
|
|
48
48
|
Process the control message by generating text embeddings.
|
|
49
49
|
|
|
@@ -57,25 +57,24 @@ class TextEmbeddingTransformStage(RayActorStage):
|
|
|
57
57
|
IngestControlMessage
|
|
58
58
|
The updated message with text embeddings and trace info added.
|
|
59
59
|
"""
|
|
60
|
-
logger.info("TextEmbeddingTransformStage.on_data: Starting text embedding transformation.")
|
|
61
|
-
|
|
62
60
|
# Get the DataFrame payload.
|
|
63
61
|
df_payload = control_message.payload()
|
|
64
|
-
|
|
62
|
+
self._logger.debug("TextEmbeddingTransformStage: Extracted payload with %d rows.", len(df_payload))
|
|
65
63
|
|
|
66
64
|
# Remove the "embed" task to obtain task-specific configuration.
|
|
67
65
|
task_config = remove_task_by_type(control_message, "embed")
|
|
68
|
-
|
|
66
|
+
self._logger.debug(
|
|
67
|
+
"TextEmbeddingTransformStage: Task configuration extracted: %s",
|
|
68
|
+
pprint.pformat(sanitize_for_logging(task_config)),
|
|
69
|
+
)
|
|
69
70
|
|
|
70
71
|
# Call the text embedding extraction function.
|
|
71
72
|
new_df, execution_trace_log = transform_create_text_embeddings_internal(
|
|
72
73
|
df_payload, task_config=task_config, transform_config=self.validated_config
|
|
73
74
|
)
|
|
74
|
-
logger.info("Text embedding transformation completed. New payload has %d rows.", len(new_df))
|
|
75
75
|
|
|
76
76
|
# Update the control message payload.
|
|
77
77
|
control_message.payload(new_df)
|
|
78
78
|
# Annotate the message metadata with trace info.
|
|
79
79
|
control_message.set_metadata("text_embedding_trace", execution_trace_log)
|
|
80
|
-
logger.info("Text embedding trace metadata added.")
|
|
81
80
|
return control_message
|
|
@@ -3,18 +3,22 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
7
8
|
import ray
|
|
8
9
|
|
|
9
10
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
10
11
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
11
|
-
from nv_ingest_api.internal.primitives.ingest_control_message import
|
|
12
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
12
13
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
13
14
|
from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
|
|
14
15
|
from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal
|
|
15
16
|
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
16
17
|
nv_ingest_node_failure_try_except,
|
|
17
18
|
)
|
|
19
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
20
|
+
|
|
21
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
22
|
|
|
19
23
|
logger = logging.getLogger(__name__)
|
|
20
24
|
|
|
@@ -29,15 +33,16 @@ class TextSplitterStage(RayActorStage):
|
|
|
29
33
|
and tokenization logic. The updated DataFrame is then set back into the message.
|
|
30
34
|
"""
|
|
31
35
|
|
|
32
|
-
def __init__(self, config: TextSplitterSchema) -> None:
|
|
33
|
-
super().__init__(config)
|
|
36
|
+
def __init__(self, config: TextSplitterSchema, stage_name: Optional[str] = None) -> None:
|
|
37
|
+
super().__init__(config, stage_name=stage_name)
|
|
34
38
|
# Store the validated configuration (assumed to be an instance of TextSplitterSchema)
|
|
35
39
|
self.validated_config: TextSplitterSchema = config
|
|
36
|
-
logger.info("TextSplitterStage initialized with config: %s", config)
|
|
40
|
+
logger.info("TextSplitterStage initialized with config: %s", sanitize_for_logging(config))
|
|
37
41
|
|
|
38
|
-
@
|
|
39
|
-
@
|
|
40
|
-
@
|
|
42
|
+
@nv_ingest_node_failure_try_except()
|
|
43
|
+
@traceable()
|
|
44
|
+
@udf_intercept_hook()
|
|
45
|
+
@filter_by_task(required_tasks=["split"])
|
|
41
46
|
def on_data(self, message: Any) -> Any:
|
|
42
47
|
"""
|
|
43
48
|
Process an incoming IngestControlMessage by splitting and tokenizing its text.
|
|
@@ -59,7 +64,7 @@ class TextSplitterStage(RayActorStage):
|
|
|
59
64
|
|
|
60
65
|
# Remove the "split" task to obtain task-specific configuration.
|
|
61
66
|
task_config = remove_task_by_type(message, "split")
|
|
62
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
67
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
63
68
|
|
|
64
69
|
# Transform the DataFrame (split text and tokenize).
|
|
65
70
|
df_updated = transform_text_split_and_tokenize_internal(
|
|
@@ -68,11 +73,13 @@ class TextSplitterStage(RayActorStage):
|
|
|
68
73
|
transform_config=self.validated_config,
|
|
69
74
|
execution_trace_log=None,
|
|
70
75
|
)
|
|
71
|
-
logger.
|
|
76
|
+
logger.debug(
|
|
77
|
+
"TextSplitterStage.on_data: Transformation complete. Updated payload has %d rows.", len(df_updated)
|
|
78
|
+
)
|
|
72
79
|
|
|
73
80
|
# Update the message payload.
|
|
74
81
|
message.payload(df_updated)
|
|
75
|
-
logger.
|
|
82
|
+
logger.debug("TextSplitterStage.on_data: Finished processing, returning updated message.")
|
|
76
83
|
|
|
77
84
|
return message
|
|
78
85
|
|
|
@@ -101,7 +108,7 @@ def text_splitter_fn(control_message: IngestControlMessage, stage_config: TextSp
|
|
|
101
108
|
|
|
102
109
|
# Remove the "split" task to obtain task-specific configuration.
|
|
103
110
|
task_config = remove_task_by_type(control_message, "split")
|
|
104
|
-
logger.debug("Extracted task config: %s", task_config)
|
|
111
|
+
logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
105
112
|
|
|
106
113
|
# Transform the DataFrame (split text and tokenize).
|
|
107
114
|
df_updated = transform_text_split_and_tokenize_internal(
|
|
@@ -110,10 +117,10 @@ def text_splitter_fn(control_message: IngestControlMessage, stage_config: TextSp
|
|
|
110
117
|
transform_config=stage_config,
|
|
111
118
|
execution_trace_log=None,
|
|
112
119
|
)
|
|
113
|
-
logger.
|
|
120
|
+
logger.debug("TextSplitterStage.on_data: Transformation complete. Updated payload has %d rows.", len(df_updated))
|
|
114
121
|
|
|
115
122
|
# Update the message payload.
|
|
116
123
|
control_message.payload(df_updated)
|
|
117
|
-
logger.
|
|
124
|
+
logger.debug("TextSplitterStage.on_data: Finished processing, returning updated message.")
|
|
118
125
|
|
|
119
126
|
return control_message
|
|
@@ -4,11 +4,16 @@
|
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
6
|
import time
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Optional
|
|
8
8
|
from pydantic import BaseModel
|
|
9
9
|
import ray
|
|
10
10
|
|
|
11
11
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
12
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
13
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
14
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
15
|
+
nv_ingest_node_failure_try_except,
|
|
16
|
+
)
|
|
12
17
|
|
|
13
18
|
logger = logging.getLogger(__name__)
|
|
14
19
|
|
|
@@ -22,12 +27,15 @@ class ThroughputMonitorStage(RayActorStage):
|
|
|
22
27
|
It also adds the throughput as metadata on the control message before passing it on.
|
|
23
28
|
"""
|
|
24
29
|
|
|
25
|
-
def __init__(self, config: BaseModel) -> None:
|
|
30
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
|
|
26
31
|
# Initialize base attributes (e.g., self._running, self.start_time) via the base class.
|
|
27
|
-
super().__init__(config)
|
|
32
|
+
super().__init__(config, stage_name=stage_name)
|
|
28
33
|
self.count = 0
|
|
29
34
|
self.last_emit_time = None # Timestamp when the last throughput measure was emitted
|
|
30
35
|
|
|
36
|
+
@nv_ingest_node_failure_try_except()
|
|
37
|
+
@traceable()
|
|
38
|
+
@udf_intercept_hook()
|
|
31
39
|
async def on_data(self, message: Any) -> Any:
|
|
32
40
|
"""
|
|
33
41
|
Process an incoming control message. Increment the internal counter and, every 100 messages,
|
|
@@ -10,7 +10,6 @@ from typing import Dict, Any, List, Tuple, Optional
|
|
|
10
10
|
|
|
11
11
|
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
|
|
12
12
|
|
|
13
|
-
logging.basicConfig(level=logging.INFO)
|
|
14
13
|
logger = logging.getLogger(__name__)
|
|
15
14
|
|
|
16
15
|
# --- Constants ---
|
|
@@ -259,7 +258,7 @@ class ResourceConstraintManager:
|
|
|
259
258
|
else:
|
|
260
259
|
self.core_based_replica_limit = None # Treat as unlimited if detection failed
|
|
261
260
|
|
|
262
|
-
logger.
|
|
261
|
+
logger.debug(
|
|
263
262
|
f"[ConstraintMgr] Initialized. MaxReplicas={max_replicas}, "
|
|
264
263
|
f"EffectiveCoreLimit={self.available_cores:.2f} " # Log the potentially fractional value
|
|
265
264
|
f"(Method: {self.core_detection_details.get('detection_method')}), "
|