nv_ingest-2025.5.21.dev20250521-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest might be problematic.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,66 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ from typing import Any
+ from pydantic import BaseModel
+ import ray
+
+ # Import the base class for our stages.
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.schemas.framework_job_counter_schema import JobCounterSchema
+ from nv_ingest.framework.util.telemetry.global_stats import GlobalStats
+ from nv_ingest_api.util.exception_handlers.decorators import (
+     nv_ingest_node_failure_try_except,
+ )
+
+ # Import the JobCounter schema and global stats singleton.
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @ray.remote
+ class JobCounterStage(RayActorStage):
+     """
+     A Ray actor stage that counts jobs and updates global statistics.
+
+     Based on the configuration (a JobCounterSchema instance), it increments a specific
+     statistic each time it processes a message.
+     """
+
+     def __init__(self, config: BaseModel) -> None:
+         # Ensure base attributes (e.g. self._running) are initialized.
+         super().__init__(config)
+         # The validated config should be a JobCounterSchema instance.
+         self.validated_config: JobCounterSchema = config
+         # Obtain the global stats singleton.
+         self.stats = GlobalStats.get_instance()
+
+     @nv_ingest_node_failure_try_except(annotation_id="job_counter", raise_on_failure=False)
+     async def on_data(self, message: Any) -> Any:
+         """
+         Process an incoming IngestControlMessage by counting jobs.
+
+         If the configured name is "completed_jobs", increment "failed_jobs" when the message
+         metadata indicates a failure (cm_failed == True) and "completed_jobs" otherwise.
+         For any other configured name, increment that statistic directly.
+
+         Returns the original message.
+         """
+         logger.debug(f"Performing job counter: {self.validated_config.name}")
+         try:
+             if self.validated_config.name == "completed_jobs":
+                 if message.has_metadata("cm_failed") and message.get_metadata("cm_failed"):
+                     self.stats.increment_stat("failed_jobs")
+                 else:
+                     self.stats.increment_stat("completed_jobs")
+                 return message
+
+             self.stats.increment_stat(self.validated_config.name)
+             return message
+         except Exception as e:
+             new_message = f"on_data: Failed to run job counter. Original error: {str(e)}"
+             logger.exception(new_message)
+             raise type(e)(new_message) from e
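For orientation, here is a minimal, hypothetical smoke test of the counting logic above. It assumes JobCounterSchema accepts a name field and that the base RayActorStage can be constructed from the validated config alone; FakeMessage is a stand-in for IngestControlMessage that implements only the two metadata calls the stage uses.

    import ray

    from nv_ingest.framework.orchestration.ray.stages.telemetry.job_counter import JobCounterStage
    from nv_ingest.framework.schemas.framework_job_counter_schema import JobCounterSchema


    class FakeMessage:
        """Stand-in for IngestControlMessage; only has_metadata/get_metadata are exercised here."""

        def __init__(self, metadata=None):
            self._metadata = metadata or {}

        def has_metadata(self, key):
            return key in self._metadata

        def get_metadata(self, key):
            return self._metadata.get(key)


    if __name__ == "__main__":
        ray.init(ignore_reinit_error=True)
        counter = JobCounterStage.remote(JobCounterSchema(name="completed_jobs"))
        # A clean message increments "completed_jobs"; one flagged cm_failed increments "failed_jobs".
        ray.get(counter.on_data.remote(FakeMessage()))
        ray.get(counter.on_data.remote(FakeMessage({"cm_failed": True})))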
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,205 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from datetime import datetime
+ from typing import Any, Optional
+
+ import ray
+ from opentelemetry import trace
+ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace import TracerProvider
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
+ from opentelemetry.sdk.trace.id_generator import RandomIdGenerator
+ from opentelemetry.trace import NonRecordingSpan
+ from opentelemetry.trace import SpanContext
+ from opentelemetry.trace import Status
+ from opentelemetry.trace import StatusCode
+ from opentelemetry.trace import TraceFlags
+
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+
+ from nv_ingest_api.internal.primitives.tracing.logging import TaskResultStatus
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
+
+
+ @ray.remote
+ class OpenTelemetryTracerStage(RayActorStage):
+     """
+     A Ray actor stage that collects and exports traces to OpenTelemetry.
+
+     This stage uses OpenTelemetry to trace the execution of tasks within the system.
+     It creates spans for tasks and exports them to a configured OpenTelemetry endpoint.
+     """
+
+     def __init__(self, config: OpenTelemetryTracerSchema) -> None:
+         super().__init__(config)
+
+         # self._logger.info(f"[Telemetry] Initializing OpenTelemetry tracer stage with config: {config}")
+
+         self.validated_config: OpenTelemetryTracerSchema = config
+         self.resource = Resource(attributes={"service.name": "nv-ingest"})
+         self.otlp_exporter = OTLPSpanExporter(endpoint=self.validated_config.otel_endpoint, insecure=True)
+         self.span_processor = BatchSpanProcessor(self.otlp_exporter)
+
+         trace.set_tracer_provider(TracerProvider(resource=self.resource))
+         trace.get_tracer_provider().add_span_processor(self.span_processor)
+
+         self.tracer = trace.get_tracer(__name__)
+
+     def collect_timestamps(self, message):
+         job_id = message.get_metadata("job_id")
+         if isinstance(job_id, str) and len(job_id) == 36:
+             trace_id = uuid_to_trace_id(job_id)
+         elif isinstance(job_id, str):
+             trace_id = int(job_id, 16)
+         else:
+             trace_id = RandomIdGenerator().generate_trace_id()
+
+         span_id = RandomIdGenerator().generate_span_id()
+         timestamps = extract_timestamps_from_message(message)
+
+         flattened = [x for t in timestamps.values() for x in t]
+         if not flattened:
+             self._logger.debug("No timestamps found for message; skipping tracing.")
+             return
+
+         start_time = min(flattened)
+         end_time = max(flattened)
+
+         self._logger.debug(f"[Telemetry] trace_id: {trace_id}, span_id: {span_id}")
+
+         span_context = SpanContext(
+             trace_id=trace_id,
+             span_id=span_id,
+             is_remote=True,
+             trace_flags=TraceFlags(0x01),
+         )
+         parent_ctx = trace.set_span_in_context(NonRecordingSpan(span_context))
+         parent_span = self.tracer.start_span(str(job_id), context=parent_ctx, start_time=start_time)
+
+         event_count = create_span_with_timestamps(self.tracer, parent_span, message)
+
+         if message.has_metadata("cm_failed") and message.get_metadata("cm_failed"):
+             parent_span.set_status(Status(StatusCode.ERROR))
+         else:
+             parent_span.set_status(Status(StatusCode.OK))
+
+         try:
+             parent_span.add_event("start", timestamp=start_time)
+             parent_span.add_event("end", timestamp=end_time)
+         finally:
+             parent_span.end(end_time=end_time)
+
+         self._logger.debug(f"[Telemetry] Exported spans for message {job_id} with {event_count} total events.")
+
+     @nv_ingest_node_failure_try_except(annotation_id="otel_tracer", raise_on_failure=False)
+     def on_data(self, control_message: IngestControlMessage) -> Optional[Any]:
+         try:
+             do_trace_tagging = bool(control_message.get_metadata("config::add_trace_tagging"))
+
+             if not do_trace_tagging:
+                 self._logger.debug("Skipping OpenTelemetry tracing, do_trace_tagging is False.")
+                 return control_message
+
+             self._logger.debug("Sending telemetry data to OpenTelemetry")
+
+             self.collect_timestamps(control_message)
+
+             return control_message
+         except Exception as e:
+             self._logger.warning(f"Error in OpenTelemetry tracer stage: {e}")
+             raise e
+
+
+ def extract_timestamps_from_message(message):
+     timestamps = {}
+     dedup_counter = {}
+
+     for key, val in message.filter_timestamp("trace::exit::").items():
+         exit_key = key
+         entry_key = exit_key.replace("trace::exit::", "trace::entry::")
+
+         task_name = key.replace("trace::exit::", "")
+         if task_name in dedup_counter:
+             dedup_counter[task_name] += 1
+             task_name = task_name + "_" + str(dedup_counter[task_name])
+         else:
+             dedup_counter[task_name] = 0
+
+         ts_entry = message.get_timestamp(entry_key)
+         if ts_entry is None:
+             continue
+
+         ts_exit = (
+             message.get_timestamp(exit_key) or datetime.now()
+         )  # When a job fails, it may not have exit time. Default to current time.
+         ts_entry_ns = int(ts_entry.timestamp() * 1e9)
+         ts_exit_ns = int(ts_exit.timestamp() * 1e9)
+
+         timestamps[task_name] = (ts_entry_ns, ts_exit_ns)
+
+     return timestamps
+
+
+ def extract_annotated_task_results(message):
+     task_results = {}
+     for key in message.list_metadata():
+         if not key.startswith("annotation::"):
+             continue
+         task = message.get_metadata(key)
+         if not (("task_id" in task) and ("task_result" in task)):
+             continue
+         task_id = task["task_id"]
+         task_result = task["task_result"]
+         task_results[task_id] = task_result
+
+     return task_results
+
+
+ def create_span_with_timestamps(tracer, parent_span, message) -> int:
+     timestamps = extract_timestamps_from_message(message)
+     task_results = extract_annotated_task_results(message)
+
+     ctx_store = {}
+     event_counter = 0
+     child_ctx = trace.set_span_in_context(parent_span)
+
+     for task_name, (ts_entry, ts_exit) in sorted(timestamps.items(), key=lambda x: x[1]):
+         main_task, *subtask = task_name.split("::", 1)
+         subtask = "::".join(subtask)
+
+         if not subtask:
+             span = tracer.start_span(main_task, context=child_ctx, start_time=ts_entry)
+         else:
+             subtask_ctx = trace.set_span_in_context(ctx_store[main_task][0])
+             span = tracer.start_span(subtask, context=subtask_ctx, start_time=ts_entry)
+
+         span.add_event("entry", timestamp=ts_entry)
+         span.add_event("exit", timestamp=ts_exit)
+         event_counter += 2
+
+         if task_name in task_results:
+             task_result = task_results[task_name]
+             if task_result == TaskResultStatus.SUCCESS.value:
+                 span.set_status(Status(StatusCode.OK))
+             elif task_result == TaskResultStatus.FAILURE.value:
+                 span.set_status(Status(StatusCode.ERROR))
+
+         ctx_store[task_name] = (span, ts_exit)
+
+     for _, (span, ts_exit) in ctx_store.items():
+         span.end(end_time=ts_exit)
+
+     return event_counter
+
+
+ def uuid_to_trace_id(uuid_str: str) -> int:
+     """Convert a UUID-like string to an integer OpenTelemetry trace ID."""
+     if not isinstance(uuid_str, str) or len(uuid_str) != 36:
+         raise ValueError("UUID must be a 36-character string with hyphens")
+
+     return int(uuid_str.replace("-", ""), 16)
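As a standalone illustration of how collect_timestamps derives OpenTelemetry trace IDs from job IDs, the arithmetic below reproduces uuid_to_trace_id locally (copied only so the snippet runs on its own; the identifiers are made-up examples):

    from uuid import uuid4


    def uuid_to_trace_id(uuid_str: str) -> int:
        # Same conversion as above: strip the hyphens and read the 32 hex digits as a 128-bit int.
        if not isinstance(uuid_str, str) or len(uuid_str) != 36:
            raise ValueError("UUID must be a 36-character string with hyphens")
        return int(uuid_str.replace("-", ""), 16)


    job_id = str(uuid4())                                  # 36-character UUID -> first branch
    trace_id = uuid_to_trace_id(job_id)
    assert f"{trace_id:032x}" == job_id.replace("-", "")   # round-trips to the 32-hex-digit trace ID

    hex_job_id = "0af7651916cd43dd8448eb211c80319c"        # non-UUID string -> parsed as raw hex
    assert int(hex_job_id, 16) == uuid_to_trace_id("0af76519-16cd-43dd-8448-eb211c80319c")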
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,81 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import pprint
+ from typing import Any
+
+ import ray
+
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.util.flow_control import filter_by_task
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+ from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema
+ from nv_ingest_api.internal.transform.caption_image import transform_image_create_vlm_caption_internal
+ from nv_ingest_api.util.exception_handlers.decorators import (
+     nv_ingest_node_failure_try_except,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @ray.remote
+ class ImageCaptionTransformStage(RayActorStage):
+     """
+     A Ray actor stage that extracts image captions from a DataFrame payload.
+
+     This stage validates its configuration (using ImageCaptionExtractionSchema), then processes the DataFrame
+     via transform_image_create_vlm_caption_internal. The updated DataFrame and any extraction trace info
+     are stored in the control message.
+     """
+
+     def __init__(self, config: ImageCaptionExtractionSchema) -> None:
+         super().__init__(config)
+         try:
+             self.validated_config = config
+             logger.info("ImageCaptionTransformStage configuration validated.")
+         except Exception as e:
+             logger.exception("Error validating caption extraction config")
+             raise e
+
+     @traceable("image_captioning")
+     @filter_by_task(required_tasks=["caption"])
+     @nv_ingest_node_failure_try_except(annotation_id="image_captioning", raise_on_failure=False)
+     def on_data(self, control_message: Any) -> Any:
+         """
+         Process the control message by extracting image captions.
+
+         Parameters
+         ----------
+         control_message : IngestControlMessage
+             The incoming message containing the DataFrame payload.
+
+         Returns
+         -------
+         IngestControlMessage
+             The updated message with the extracted captions.
+         """
+         logger.info("ImageCaptionTransformStage.on_data: Starting image caption extraction.")
+
+         # Retrieve the DataFrame payload.
+         df_payload = control_message.payload()
+         logger.debug("ImageCaptionTransformStage: Payload extracted with %d rows.", len(df_payload))
+
+         # Remove the "caption" task to obtain task-specific configuration.
+         task_config = remove_task_by_type(control_message, "caption")
+         logger.debug("ImageCaptionTransformStage: Task configuration extracted: %s", pprint.pformat(task_config))
+
+         # Call the caption extraction function.
+         new_df = transform_image_create_vlm_caption_internal(
+             df_payload, task_config=task_config, transform_config=self.validated_config
+         )
+         logger.info("Image caption extraction completed. New payload has %d rows.", len(new_df))
+
+         # Update the control message with the new DataFrame.
+         control_message.payload(new_df)
+         # Optionally, annotate the control message with extraction trace info.
+         # control_message.set_metadata("caption_extraction_trace", execution_trace_log)
+         logger.info("ImageCaptionTransformStage.on_data: Updated control message and returning.")
+         return control_message
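The @filter_by_task(required_tasks=["caption"]) decorator is what keeps this stage from touching messages that carry no caption work. The package's own implementation lives in nv_ingest/framework/util/flow_control/filter_by_task.py and is not shown in this hunk; the sketch below is only a conceptual stand-in for the pass-through behavior such a decorator provides, with an assumed message.tasks list of {"type": ...} dicts.

    from functools import wraps


    def filter_by_task_sketch(required_tasks):
        """Conceptual stand-in, not the package implementation."""

        def decorator(fn):
            @wraps(fn)
            def wrapper(self, message):
                # Assumed shape: message.tasks is an iterable of {"type": ...} dicts.
                present = {task.get("type") for task in getattr(message, "tasks", [])}
                if not present.intersection(required_tasks):
                    return message  # no matching task: forward the message unchanged
                return fn(self, message)

            return wrapper

        return decorator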
@@ -0,0 +1,81 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import pprint
+ from typing import Any
+ import ray
+
+ # Assume these imports come from your project:
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.util.flow_control import filter_by_task
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+ from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
+ from nv_ingest_api.internal.transform.embed_text import transform_create_text_embeddings_internal
+ from nv_ingest_api.util.exception_handlers.decorators import (
+     nv_ingest_node_failure_try_except,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @ray.remote
+ class TextEmbeddingTransformStage(RayActorStage):
+     """
+     A Ray actor stage that extracts text embeddings from a DataFrame payload.
+
+     This stage uses the validated configuration (TextEmbeddingSchema) to process the DataFrame
+     and generate text embeddings. The resulting DataFrame is set back on the message, and any
+     trace or extraction metadata is added.
+     """
+
+     def __init__(self, config: TextEmbeddingSchema) -> None:
+         super().__init__(config)
+         try:
+             self.validated_config = config
+             logger.info("TextEmbeddingTransformStage configuration validated successfully.")
+         except Exception as e:
+             logger.exception("Error validating text embedding extractor config")
+             raise e
+
+     @traceable("text_embedding")
+     @filter_by_task(required_tasks=["embed"])
+     @nv_ingest_node_failure_try_except(annotation_id="text_embedding", raise_on_failure=False)
+     def on_data(self, control_message: IngestControlMessage) -> Any:
+         """
+         Process the control message by generating text embeddings.
+
+         Parameters
+         ----------
+         control_message : IngestControlMessage
+             The incoming message containing the DataFrame payload.
+
+         Returns
+         -------
+         IngestControlMessage
+             The updated message with text embeddings and trace info added.
+         """
+         logger.info("TextEmbeddingTransformStage.on_data: Starting text embedding transformation.")
+
+         # Get the DataFrame payload.
+         df_payload = control_message.payload()
+         logger.debug("TextEmbeddingTransformStage: Extracted payload with %d rows.", len(df_payload))
+
+         # Remove the "embed" task to obtain task-specific configuration.
+         task_config = remove_task_by_type(control_message, "embed")
+         logger.debug("TextEmbeddingTransformStage: Task configuration extracted: %s", pprint.pformat(task_config))
+
+         # Call the text embedding extraction function.
+         new_df, execution_trace_log = transform_create_text_embeddings_internal(
+             df_payload, task_config=task_config, transform_config=self.validated_config
+         )
+         logger.info("Text embedding transformation completed. New payload has %d rows.", len(new_df))
+
+         # Update the control message payload.
+         control_message.payload(new_df)
+         # Annotate the message metadata with trace info.
+         control_message.set_metadata("text_embedding_trace", execution_trace_log)
+         logger.info("Text embedding trace metadata added.")
+         return control_message
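Reduced to plain pandas, the bookkeeping in on_data looks like this: the transform returns a new DataFrame plus a trace log, and both are re-attached to the message. Everything below is an illustrative stand-in (the fake_embed function, the Message class, and the column names are not part of the package):

    import time

    import pandas as pd


    def fake_embed(df: pd.DataFrame):
        # Stand-in for transform_create_text_embeddings_internal: returns (new_df, trace_log).
        t0 = time.time()
        out = df.assign(embedding=[[0.0, 0.1, 0.2]] * len(df))  # placeholder vectors
        return out, {"embed::entry": t0, "embed::exit": time.time()}


    class Message:
        """Minimal payload()/set_metadata() holder mirroring how the stage uses the control message."""

        def __init__(self, df):
            self._df, self.metadata = df, {}

        def payload(self, df=None):
            if df is not None:
                self._df = df
            return self._df

        def set_metadata(self, key, value):
            self.metadata[key] = value


    msg = Message(pd.DataFrame({"content": ["first chunk", "second chunk"]}))
    new_df, trace = fake_embed(msg.payload())
    msg.payload(new_df)
    msg.set_metadata("text_embedding_trace", trace)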
@@ -0,0 +1,74 @@
+ import logging
+ from typing import Any
+ import ray
+
+ # Assume these imports come from your project:
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+ from nv_ingest.framework.util.flow_control import filter_by_task
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+ from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
+ from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal
+ from nv_ingest_api.util.exception_handlers.decorators import (
+     nv_ingest_node_failure_try_except,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @ray.remote
+ class TextSplitterStage(RayActorStage):
+     """
+     A Ray actor stage that splits documents into smaller parts based on specified criteria.
+
+     This stage extracts the DataFrame payload from an IngestControlMessage, removes the "split"
+     task (if present) to obtain the task configuration, and then calls the internal text splitting
+     and tokenization logic. The updated DataFrame is then set back into the message.
+     """
+
+     def __init__(self, config: TextSplitterSchema) -> None:
+         super().__init__(config)
+         # Store the validated configuration (assumed to be an instance of TextSplitterSchema)
+         self.validated_config: TextSplitterSchema = config
+         logger.info("TextSplitterStage initialized with config: %s", config)
+
+     @traceable("text_splitter")
+     @filter_by_task(["split"])
+     @nv_ingest_node_failure_try_except(annotation_id="text_splitter", raise_on_failure=False)
+     def on_data(self, message: Any) -> Any:
+         """
+         Process an incoming IngestControlMessage by splitting and tokenizing its text.
+
+         Parameters
+         ----------
+         message : IngestControlMessage
+             The incoming message containing the payload DataFrame.
+
+         Returns
+         -------
+         IngestControlMessage
+             The updated message with its payload transformed.
+         """
+
+         # Extract the DataFrame payload.
+         df_payload = message.payload()
+         logger.debug("Extracted payload with %d rows.", len(df_payload))
+
+         # Remove the "split" task to obtain task-specific configuration.
+         task_config = remove_task_by_type(message, "split")
+         logger.debug("Extracted task config: %s", task_config)
+
+         # Transform the DataFrame (split text and tokenize).
+         df_updated = transform_text_split_and_tokenize_internal(
+             df_transform_ledger=df_payload,
+             task_config=task_config,
+             transform_config=self.validated_config,
+             execution_trace_log=None,
+         )
+         logger.info("TextSplitterStage.on_data: Transformation complete. Updated payload has %d rows.", len(df_updated))
+
+         # Update the message payload.
+         message.payload(df_updated)
+         logger.info("TextSplitterStage.on_data: Finished processing, returning updated message.")
+
+         return message
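The heavy lifting happens in transform_text_split_and_tokenize_internal, which lives in nv_ingest_api and is not part of this diff. As a rough mental model only (the real splitter is tokenizer-aware and driven by TextSplitterSchema), character-based chunking with overlap looks like this:

    def split_with_overlap(text: str, chunk_size: int = 16, overlap: int = 4):
        # Illustrative character-level chunking; the package splits on tokens, not characters.
        step = chunk_size - overlap
        return [text[i : i + chunk_size] for i in range(0, max(len(text) - overlap, 1), step)]


    print(split_with_overlap("The quick brown fox jumps over the lazy dog."))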
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,65 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import time
+ from typing import Any
+ from pydantic import BaseModel
+ import ray
+
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+
+ logger = logging.getLogger(__name__)
+
+
+ @ray.remote
+ class ThroughputMonitorStage(RayActorStage):
+     """
+     A Ray actor stage that monitors throughput by counting messages.
+
+     Every 1,000 messages, it calculates the throughput (messages per second) and logs the value.
+     It also adds the throughput as metadata on the control message before passing it on.
+     """
+
+     def __init__(self, config: BaseModel) -> None:
+         # Initialize base attributes (e.g., self._running, self.start_time) via the base class.
+         super().__init__(config)
+         self.count = 0
+         self.last_emit_time = None  # Timestamp when the last throughput measure was emitted
+
+     async def on_data(self, message: Any) -> Any:
+         """
+         Process an incoming control message. Increment the internal counter and, every 1,000 messages,
+         calculate and log the throughput. The throughput value is also added to the message metadata.
+
+         Parameters
+         ----------
+         message : Any
+             The incoming control message.
+
+         Returns
+         -------
+         Any
+             The (possibly modified) control message.
+         """
+         self.count += 1
+         if self.last_emit_time is None:
+             self.last_emit_time = time.time()
+
+         if self.count % 1000 == 0:
+             now = time.time()
+             elapsed = now - self.last_emit_time
+             throughput = 1000 / elapsed if elapsed > 0 else 0
+             logger.warning(
+                 f"ThroughputMonitorStage: Processed {self.count} messages. Throughput: {throughput:.2f} messages/sec"
+             )
+             try:
+                 # Attempt to add throughput information to the message metadata.
+                 message.set_metadata("throughput", throughput)
+             except Exception:
+                 # If the message doesn't support metadata, skip.
+                 pass
+             self.last_emit_time = now
+
+         return message
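The emitted figure is simple arithmetic: the 1,000 messages accumulated since the last checkpoint divided by the wall-clock seconds that have elapsed. With illustrative numbers:

    elapsed = 12.5                             # seconds since the previous 1,000-message checkpoint
    throughput = 1000 / elapsed if elapsed > 0 else 0
    print(f"{throughput:.2f} messages/sec")    # -> 80.00 messages/sec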
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0