nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic; review the release details below before use.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,85 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ import ray
9
+
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+ from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
16
+ from nv_ingest_api.util.exception_handlers.decorators import (
17
+ nv_ingest_node_failure_try_except,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@ray.remote
class ImageDedupStage(RayActorStage):
    """
    Ray actor stage that removes duplicate image documents from a message's
    DataFrame payload.

    Processing steps:
      1. Pop the "dedup" task off the incoming IngestControlMessage.
      2. Run deduplicate_images_internal with the stage's validated config.
      3. Write the deduplicated DataFrame back into the message payload.
    """

    def __init__(self, config: ImageDedupSchema) -> None:
        super().__init__(config)
        try:
            # Configuration arrives pre-validated (ImageDedupSchema); store it
            # for use by on_data.
            self.validated_config = config
            logger.info("ImageDedupStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error validating Image Deduplication config: {e}")
            raise

    @traceable("image_deduplication")
    @filter_by_task(required_tasks=["dedup"])
    @nv_ingest_node_failure_try_except(annotation_id="image_dedup", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Deduplicate the image rows carried by *control_message*.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose DataFrame payload contains image documents.

        Returns
        -------
        IngestControlMessage
            The same message, its payload replaced by the deduplicated frame.
        """
        logger.info("ImageDedupStage.on_data: Starting image deduplication process.")
        try:
            payload_df = control_message.payload()
            logger.debug("Extracted payload with %d rows.", len(payload_df))

            # Popping the "dedup" task also yields its task-specific configuration.
            dedup_task_config = remove_task_by_type(control_message, "dedup")
            logger.debug("Extracted task config: %s", dedup_task_config)

            deduped_df = deduplicate_images_internal(
                df_ledger=payload_df,
                task_config=dedup_task_config,
                mutate_config=self.validated_config,
                execution_trace_log=None,
            )
            logger.info("Image deduplication completed. Resulting DataFrame has %d rows.", len(deduped_df))

            control_message.payload(deduped_df)
            return control_message
        except Exception as e:
            logger.exception(f"ImageDedupStage failed processing control message: {e}")
            raise
@@ -0,0 +1,84 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Dict, Any
7
+
8
+ import ray
9
+
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+ from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest_api.internal.mutate.filter import filter_images_internal
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
16
+ from nv_ingest_api.util.exception_handlers.decorators import (
17
+ nv_ingest_node_failure_try_except,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@ray.remote
class ImageFilterStage(RayActorStage):
    """
    Ray actor stage that filters image documents out of a message's DataFrame
    payload according to the "filter" task parameters.

    Processing steps:
      1. Pop the "filter" task off the incoming IngestControlMessage.
      2. Run filter_images_internal with the task params and validated config.
      3. Write the filtered DataFrame back into the message payload.
    """

    def __init__(self, config: ImageFilterSchema) -> None:
        super().__init__(config)
        try:
            # Configuration arrives pre-validated (ImageFilterSchema); store it
            # for use by on_data.
            self.validated_config = config
            logger.info("ImageFilterStage configuration validated successfully.")
        except Exception as e:
            logger.exception(f"Error validating Image Filter config: {e}")
            raise

    @traceable("image_filter")
    @filter_by_task(required_tasks=["filter"])
    @nv_ingest_node_failure_try_except(annotation_id="image_filter", raise_on_failure=False)
    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
        """
        Filter the image rows carried by *control_message*.

        Parameters
        ----------
        control_message : IngestControlMessage
            Message whose DataFrame payload contains image documents.

        Returns
        -------
        IngestControlMessage
            The same message, its payload replaced by the filtered frame.
        """
        logger.info("ImageFilterStage.on_data: Starting image filtering process.")

        payload_df = control_message.payload()
        logger.debug("Extracted payload with %d rows.", len(payload_df))

        # Popping the "filter" task also yields its task-specific configuration.
        task_config = remove_task_by_type(control_message, "filter")
        logger.debug("Extracted task config: %s", task_config)

        # Only the "params" sub-dict is forwarded to the filtering routine.
        filter_params: Dict[str, Any] = task_config.get("params", {})

        filtered_df = filter_images_internal(
            df_ledger=payload_df,
            task_config=filter_params,
            mutate_config=self.validated_config,
            execution_trace_log=None,
        )
        logger.info("Image filtering completed. Resulting DataFrame has %d rows.", len(filtered_df))

        control_message.payload(filtered_df)

        return control_message
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,41 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from typing import Any, Dict
6
+ import ray
7
+
8
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_sink_stage_base import RayActorSinkStage
9
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
10
+ from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
11
+
12
+
13
@ray.remote
class DefaultDrainSink(RayActorSinkStage):
    """
    Terminal sink that counts incoming control messages and passes them
    through unchanged, reporting simple throughput statistics via get_stats().
    """

    def __init__(self, config: Any) -> None:
        super().__init__(config, log_to_stdout=False)

        # Counters used by get_stats() to compute per-interval deltas.
        self._last_sunk_count = 0
        self._sunk_count = 0

    @nv_ingest_node_failure_try_except(annotation_id="drain_sink", raise_on_failure=False)
    def on_data(self, message: IngestControlMessage) -> IngestControlMessage:
        """Count the message and return it unchanged."""
        self._sunk_count += 1

        return message

    @ray.method(num_returns=1)
    def get_stats(self) -> Dict[str, Any]:
        """
        Return a stats snapshot for this sink.

        "delta_processed" is the number of messages sunk since the previous
        call; calling this method resets the delta baseline.
        """
        delta = self._sunk_count - self._last_sunk_count
        self._last_sunk_count = self._sunk_count

        return {
            "active_processing": False,
            "delta_processed": delta,
            "elapsed": 0.0,
            # Correctly spelled key added; the original misspelled
            # "prcessing_rate_cps" is retained for backward compatibility
            # with any consumer already reading it.
            "processing_rate_cps": 0.0,
            "prcessing_rate_cps": 0.0,
            "processed": self._sunk_count,
            "successful_queue_reads": self.stats.get("successful_queue_reads", 0),
            "successful_queue_writes": 0,
            "queue_full": 0,
        }
@@ -0,0 +1,268 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import sys
6
+ import json
7
+ import logging
8
+ from typing import Any, Dict, List, Tuple, Literal, Optional, Union
9
+ from pydantic import BaseModel, Field
10
+ import ray
11
+
12
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
13
+ from nv_ingest_api.internal.primitives.tracing.logging import annotate_cm
14
+ from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
15
+ from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class BrokerParamsRedis(BaseModel):
    """Specific parameters for Redis broker_params."""

    # Redis logical database index to select after connecting.
    db: int = 0
    # Whether to wrap the Redis connection in TLS/SSL.
    use_ssl: bool = False
25
+
26
+
27
class BaseBrokerClientConfig(BaseModel):
    """
    Base configuration common to all broker clients.

    Subclasses add a ``client_type`` Literal field that acts as the
    discriminator when this config is used in a discriminated union.
    """

    host: str = Field(..., description="Hostname or IP address of the message broker.")
    port: int = Field(..., description="Port number of the message broker.")
    max_retries: int = Field(default=5, ge=0, description="Maximum number of connection retries.")
    max_backoff: float = Field(default=5.0, gt=0, description="Maximum backoff delay in seconds between retries.")
    connection_timeout: float = Field(default=30.0, gt=0, description="Connection timeout in seconds.")
35
+
36
+
37
class RedisClientConfig(BaseBrokerClientConfig):
    """Configuration specific to the Redis client."""

    # Literal value serves as the discriminator tag for the broker_client union.
    client_type: Literal["redis"] = Field(..., description="Specifies the client type as Redis.")
    broker_params: BrokerParamsRedis = Field(
        default_factory=BrokerParamsRedis, description="Redis-specific parameters like db and ssl."
    )
44
+
45
+
46
class SimpleClientConfig(BaseBrokerClientConfig):
    """Configuration specific to the Simple client."""

    # Literal value serves as the discriminator tag for the broker_client union.
    client_type: Literal["simple"] = Field(..., description="Specifies the client type as Simple.")
    broker_params: Optional[Dict[str, Any]] = Field(
        default={}, description="Optional parameters for Simple client (currently unused)."
    )
53
+
54
+
55
+ # --- Main sink configuration ---
56
+
57
+
58
class MessageBrokerTaskSinkConfig(BaseModel):
    """
    Configuration for the MessageBrokerTaskSinkStage.

    Attributes
    ----------
    broker_client : Union[RedisClientConfig, SimpleClientConfig]
        Configuration parameters for connecting to the message broker.
        The specific schema is determined by the 'client_type' field.
    poll_interval : float, optional
        The polling interval (in seconds) for processing messages. Defaults to 0.1.
    """

    # Discriminated union: Pydantic selects RedisClientConfig or
    # SimpleClientConfig based on the value of 'client_type'.
    broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
    poll_interval: float = Field(default=0.1, gt=0)
74
+
75
+
76
@ray.remote
class MessageBrokerTaskSinkStage(RayActorStage):
    """
    Ray actor sink stage that publishes processed results to a message broker.

    For each incoming control message it serializes the DataFrame payload to
    JSON (splitting into fragments when oversized), attaches status/trace
    metadata, and pushes the payloads to the broker channel named in the
    message's "response_channel" metadata.
    """

    def __init__(self, config: MessageBrokerTaskSinkConfig) -> None:
        """
        Parameters
        ----------
        config : MessageBrokerTaskSinkConfig
            Validated sink configuration, including broker connection details.
        """
        super().__init__(config, log_to_stdout=False)

        # Annotation-only statement: narrows the static type of the config
        # attribute stored by the base class; assigns nothing at runtime.
        self.config: MessageBrokerTaskSinkConfig

        self.poll_interval = self.config.poll_interval

        # Create the appropriate broker client (e.g., Redis or Simple).
        self.client = self._create_client()
        self.start_time = None  # NOTE(review): never written in this snippet — confirm intended use
        self.message_count = 0  # running count of messages handled by on_data

    # --- Private Helper Methods ---
    def _create_client(self):
        """Instantiate the broker client selected by config.broker_client.client_type."""
        broker_config = self.config.broker_client

        if broker_config.client_type == "redis":
            return RedisClient(
                host=broker_config.host,
                port=broker_config.port,
                db=broker_config.broker_params.db,  # Access nested Pydantic model
                max_retries=broker_config.max_retries,
                max_backoff=broker_config.max_backoff,
                connection_timeout=broker_config.connection_timeout,
                use_ssl=broker_config.broker_params.use_ssl,  # Access nested Pydantic model
            )
        elif broker_config.client_type == "simple":
            server_host = broker_config.host
            # NOTE(review): the configured host is unconditionally overridden to
            # bind-all, making the assignment above dead — confirm intended.
            server_host = "0.0.0.0"
            return SimpleClient(
                host=server_host,  # Using the potentially overridden host
                port=broker_config.port,
                max_retries=broker_config.max_retries,
                max_backoff=broker_config.max_backoff,
                connection_timeout=broker_config.connection_timeout,
                # broker_params is available via broker_config.broker_params if needed
            )

    @staticmethod
    def _extract_data_frame(message: Any) -> Tuple[Any, Any]:
        """
        Extracts a DataFrame from a message payload and returns it along with selected columns.

        Returns (None, None) instead of raising when extraction fails.
        """
        try:
            df = message.payload()
            logger.debug(f"Sink received DataFrame with {len(df)} rows.")
            # Only these columns are forwarded to the client response.
            keep_cols = ["document_type", "metadata"]
            return df, df[keep_cols].to_dict(orient="records")
        except Exception as err:
            logger.warning(f"Failed to extract DataFrame: {err}")
            return None, None

    @staticmethod
    def _split_large_dict(json_data: List[Dict[str, Any]], size_limit: int) -> List[List[Dict[str, Any]]]:
        """
        Greedily pack *json_data* items into fragments whose accumulated
        serialized size stays under *size_limit* bytes. A single item larger
        than the limit still forms its own fragment.
        """
        fragments = []
        current_fragment = []
        current_size = sys.getsizeof(json.dumps(current_fragment))
        for item in json_data:
            item_size = sys.getsizeof(json.dumps(item))
            # Close the current fragment before it would exceed the limit.
            if current_size + item_size > size_limit:
                fragments.append(current_fragment)
                current_fragment = []
                current_size = sys.getsizeof(json.dumps(current_fragment))
            current_fragment.append(item)
            current_size += item_size
        if current_fragment:
            fragments.append(current_fragment)
        return fragments

    def _create_json_payload(self, message: Any, df_json: Any) -> List[Dict[str, Any]]:
        """
        Creates JSON payloads based on the message data. Splits the data if it exceeds a size limit.

        Each fragment carries status/description plus fragment bookkeeping;
        trace and annotation metadata are attached to the first fragment only.
        """
        df_json_str = json.dumps(df_json)
        df_json_size = sys.getsizeof(df_json_str)
        size_limit = 128 * 1024 * 1024  # 128 MB limit
        if df_json_size > size_limit:
            data_fragments = self._split_large_dict(df_json, size_limit)
            fragment_count = len(data_fragments)
        else:
            data_fragments = [df_json]
            fragment_count = 1

        ret_val_json_list = []
        for i, fragment_data in enumerate(data_fragments):
            ret_val_json = {
                "status": "success" if not message.get_metadata("cm_failed", False) else "failed",
                "description": (
                    "Successfully processed the message."
                    if not message.get_metadata("cm_failed", False)
                    else "Failed to process the message."
                ),
                "data": fragment_data,
                "fragment": i,
                "fragment_count": fragment_count,
            }
            if i == 0 and message.get_metadata("add_trace_tagging", True):
                trace_snapshot = message.filter_timestamp("trace::")
                # Timestamps are exported as nanoseconds since the epoch.
                ret_val_json["trace"] = {key: ts.timestamp() * 1e9 for key, ts in trace_snapshot.items()}
                ret_val_json["annotations"] = {
                    key: message.get_metadata(key) for key in message.list_metadata() if key.startswith("annotation::")
                }
            ret_val_json_list.append(ret_val_json)
        logger.debug(f"Sink created {len(ret_val_json_list)} JSON payloads.")
        return ret_val_json_list

    def _push_to_broker(self, json_payloads: List[str], response_channel: str, retry_count: int = 2) -> None:
        """
        Pushes JSON payloads to the broker channel, retrying on failure.

        Raises
        ------
        ValueError
            If any payload exceeds the hard size limit, or if every retry
            attempt fails.
        """
        # Validate all payload sizes up front, before attempting any send.
        for payload in json_payloads:
            payload_size = sys.getsizeof(payload)
            size_limit = 2**28  # 256 MB
            if payload_size > size_limit:
                raise ValueError(f"Payload size {payload_size} exceeds limit of {size_limit / 1e6} MB.")
        for attempt in range(retry_count):
            try:
                for payload in json_payloads:
                    self.client.submit_message(response_channel, payload)
                    logger.debug(f"Sink forwarded message to channel '{response_channel}'.")
                return
            except ValueError as e:
                logger.warning(f"Attempt {attempt + 1} failed: {e}")
                if attempt == retry_count - 1:
                    raise

    def _handle_failure(
        self, response_channel: str, json_result_fragments: List[Dict[str, Any]], e: Exception, mdf_size: int
    ) -> None:
        """
        Handles failure by logging and pushing a failure message to the broker.
        """
        error_description = (
            f"Failed to forward message: {e}. "
            f"Payload size: {sys.getsizeof(json.dumps(json_result_fragments)) / 1e6} MB, "
            f"Rows: {mdf_size}"
        )
        logger.error(error_description)
        fail_msg = {
            "data": None,
            "status": "failed",
            "description": error_description,
            # Preserve trace info from the first fragment when available.
            "trace": json_result_fragments[0].get("trace", {}) if json_result_fragments else {},
        }

        self.client.submit_message(response_channel, json.dumps(fail_msg))

    # --- Public API Methods for message broker sink ---

    def on_data(self, control_message: Any) -> Any:
        """
        Processes the control message and pushes the resulting JSON payloads to the broker.

        Failures are reported to the broker via _handle_failure rather than
        propagated; the original control message is always returned.
        """
        mdf, df_json = None, None
        json_result_fragments = []
        response_channel = control_message.get_metadata("response_channel")
        try:
            cm_failed = control_message.get_metadata("cm_failed", False)
            if not cm_failed:
                mdf, df_json = self._extract_data_frame(control_message)
                json_result_fragments = self._create_json_payload(control_message, df_json)
            else:
                # Upstream marked the message failed: emit a payload with no data.
                json_result_fragments = self._create_json_payload(control_message, None)

            total_payload_size = 0
            json_payloads = []
            for i, fragment in enumerate(json_result_fragments, start=1):
                payload = json.dumps(fragment)
                size_bytes = len(payload.encode("utf-8"))
                total_payload_size += size_bytes
                size_mb = size_bytes / (1024 * 1024)
                logger.debug(f"Sink Fragment {i} size: {size_mb:.2f} MB")
                json_payloads.append(payload)

            total_size_mb = total_payload_size / (1024 * 1024)
            logger.debug(f"Sink Total JSON payload size: {total_size_mb:.2f} MB")
            annotate_cm(control_message, message="Pushed")
            self._push_to_broker(json_payloads, response_channel)

        except ValueError as e:
            mdf_size = len(mdf) if mdf is not None and not mdf.empty else 0
            self._handle_failure(response_channel, json_result_fragments, e, mdf_size)
        except Exception as e:
            logger.exception(f"Critical error processing message: {e}")
            mdf_size = len(mdf) if mdf is not None and not mdf.empty else 0
            self._handle_failure(response_channel, json_result_fragments, e, mdf_size)

        self.message_count += 1
        # _logger is provided by the RayActorStage base class.
        self._logger.debug(f"[Message Broker Sink] Processed message count: {self.message_count}")

        return control_message
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0