nv-ingest-api 2025.3.27.dev20250327__py3-none-any.whl → 2025.3.28.dev20250328__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic; details are provided in the advisory for this release.

Files changed (153)
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.3.28.dev20250328.dist-info/RECORD +9 -0
  6. nv_ingest_api/interface/__init__.py +0 -215
  7. nv_ingest_api/interface/extract.py +0 -972
  8. nv_ingest_api/interface/mutate.py +0 -154
  9. nv_ingest_api/interface/store.py +0 -218
  10. nv_ingest_api/interface/transform.py +0 -382
  11. nv_ingest_api/interface/utility.py +0 -200
  12. nv_ingest_api/internal/enums/__init__.py +0 -3
  13. nv_ingest_api/internal/enums/common.py +0 -494
  14. nv_ingest_api/internal/extract/__init__.py +0 -3
  15. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  17. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  18. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  19. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  20. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  23. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  24. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  25. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  26. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  27. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  28. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  29. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  30. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  34. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  40. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  44. nv_ingest_api/internal/mutate/__init__.py +0 -3
  45. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  46. nv_ingest_api/internal/mutate/filter.py +0 -133
  47. nv_ingest_api/internal/primitives/__init__.py +0 -0
  48. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  49. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  50. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  51. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  52. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  53. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  54. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -272
  55. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  56. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -452
  57. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  58. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  59. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  60. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  61. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  62. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  63. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  64. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  65. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  66. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  67. nv_ingest_api/internal/schemas/__init__.py +0 -3
  68. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  70. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  71. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  72. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  74. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  75. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  76. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  77. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  78. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  79. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  80. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  81. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  82. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  83. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  84. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  85. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  86. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  87. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  88. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  89. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  90. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  91. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  92. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  93. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  94. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  95. nv_ingest_api/internal/store/__init__.py +0 -3
  96. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  97. nv_ingest_api/internal/store/image_upload.py +0 -232
  98. nv_ingest_api/internal/transform/__init__.py +0 -3
  99. nv_ingest_api/internal/transform/caption_image.py +0 -205
  100. nv_ingest_api/internal/transform/embed_text.py +0 -496
  101. nv_ingest_api/internal/transform/split_text.py +0 -157
  102. nv_ingest_api/util/__init__.py +0 -0
  103. nv_ingest_api/util/control_message/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/validators.py +0 -47
  105. nv_ingest_api/util/converters/__init__.py +0 -0
  106. nv_ingest_api/util/converters/bytetools.py +0 -78
  107. nv_ingest_api/util/converters/containers.py +0 -65
  108. nv_ingest_api/util/converters/datetools.py +0 -90
  109. nv_ingest_api/util/converters/dftools.py +0 -127
  110. nv_ingest_api/util/converters/formats.py +0 -64
  111. nv_ingest_api/util/converters/type_mappings.py +0 -27
  112. nv_ingest_api/util/detectors/__init__.py +0 -5
  113. nv_ingest_api/util/detectors/language.py +0 -38
  114. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  115. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  116. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  117. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  118. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  119. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  120. nv_ingest_api/util/image_processing/__init__.py +0 -5
  121. nv_ingest_api/util/image_processing/clustering.py +0 -260
  122. nv_ingest_api/util/image_processing/processing.py +0 -179
  123. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  124. nv_ingest_api/util/image_processing/transforms.py +0 -407
  125. nv_ingest_api/util/logging/__init__.py +0 -0
  126. nv_ingest_api/util/logging/configuration.py +0 -31
  127. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  128. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  129. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  130. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  131. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
  132. nv_ingest_api/util/metadata/__init__.py +0 -5
  133. nv_ingest_api/util/metadata/aggregators.py +0 -469
  134. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  135. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  136. nv_ingest_api/util/nim/__init__.py +0 -56
  137. nv_ingest_api/util/pdf/__init__.py +0 -3
  138. nv_ingest_api/util/pdf/pdfium.py +0 -427
  139. nv_ingest_api/util/schema/__init__.py +0 -0
  140. nv_ingest_api/util/schema/schema_validator.py +0 -10
  141. nv_ingest_api/util/service_clients/__init__.py +0 -3
  142. nv_ingest_api/util/service_clients/client_base.py +0 -72
  143. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  144. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
  146. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  147. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -368
  148. nv_ingest_api/util/string_processing/__init__.py +0 -51
  149. nv_ingest_api-2025.3.27.dev20250327.dist-info/RECORD +0 -152
  150. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  151. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.28.dev20250328.dist-info}/top_level.txt +0 -0
@@ -1,334 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import json
6
- import logging
7
- import time
8
- from typing import Any
9
- from typing import Dict
10
- from typing import List
11
- from typing import Optional
12
- from typing import Tuple
13
- from typing import Union
14
-
15
- import redis
16
- from redis.exceptions import RedisError
17
-
18
- from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase
19
-
20
# pylint: skip-file

# Module-level logger shared by all RedisClient methods below.
logger = logging.getLogger(__name__)
23
-
24
-
25
class RedisClient(MessageBrokerClientBase):
    """
    A client for interfacing with Redis, providing mechanisms for sending and receiving messages
    with retry logic and connection management.

    Parameters
    ----------
    host : str
        The hostname of the Redis server.
    port : int
        The port number of the Redis server.
    db : int, optional
        The database number to connect to. Default is 0.
    max_retries : int, optional
        The maximum number of retry attempts for operations. Default is 0 (no retries).
    max_backoff : int, optional
        The maximum backoff delay between retries in seconds. Default is 32 seconds.
    connection_timeout : int, optional
        The timeout in seconds for connecting to the Redis server. Default is 300 seconds.
    max_pool_size : int, optional
        The maximum number of connections in the Redis connection pool. Default is 128.
    use_ssl : bool, optional
        Specifies if SSL should be used for the connection. Default is False.
    redis_allocator : Any, optional
        The Redis client allocator, allowing for custom Redis client instances. Default is redis.Redis.

    Attributes
    ----------
    client : Any
        The Redis client instance used for operations.
    """

    def __init__(
        self,
        host: str,
        port: int,
        db: int = 0,
        max_retries: int = 0,
        max_backoff: int = 32,
        connection_timeout: int = 300,
        max_pool_size: int = 128,
        use_ssl: bool = False,
        redis_allocator: Any = redis.Redis,  # Type hint as 'Any' due to dynamic nature
    ):
        self._host = host
        self._port = port
        self._db = db
        self._max_retries = max_retries
        self._max_backoff = max_backoff
        self._connection_timeout = connection_timeout
        self._use_ssl = use_ssl

        # Build the connection pool once; it is shared across reconnects.
        pool_kwargs: Dict[str, Any] = {
            "host": self._host,
            "port": self._port,
            "db": self._db,
            "socket_connect_timeout": self._connection_timeout,
            "max_connections": max_pool_size,
        }
        # BUG FIX: `use_ssl` was previously stored but never applied, so
        # connections requested as SSL were silently created as plaintext.
        # Route the pool through redis-py's SSLConnection when requested.
        if use_ssl:
            pool_kwargs["connection_class"] = redis.SSLConnection
        self._pool = redis.ConnectionPool(**pool_kwargs)

        self._redis_allocator = redis_allocator
        self._client = self._redis_allocator(connection_pool=self._pool)
        self._retries = 0
86
-
87
def _connect(self) -> None:
    """Re-establish the Redis client when the current connection is unresponsive."""
    if self.ping():
        return
    logger.debug("Reconnecting to Redis")
    self._client = self._redis_allocator(connection_pool=self._pool)
94
-
95
@property
def max_retries(self) -> int:
    """int: Maximum number of retry attempts allowed for queue operations."""
    return self._max_retries

@max_retries.setter
def max_retries(self, value: int) -> None:
    self._max_retries = value
102
-
103
def get_client(self) -> Any:
    """
    Return a live Redis client instance, reconnecting first if needed.

    Returns
    -------
    Any
        The Redis client instance.
    """
    needs_reconnect = self._client is None or not self.ping()
    if needs_reconnect:
        self._connect()
    return self._client
115
-
116
def ping(self) -> bool:
    """
    Report whether the Redis server currently answers a PING.

    Returns
    -------
    bool
        True when the server responds; False on a Redis error or when no
        client object is available (AttributeError).
    """
    try:
        self._client.ping()
    except (RedisError, AttributeError):
        return False
    return True
130
-
131
def _check_response(
    self, channel_name: str, timeout: float
) -> Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]:
    """
    Pop one entry from a Redis list and decode it into (message, fragment, fragment_count).

    Parameters
    ----------
    channel_name : str
        The name of the Redis channel from which to receive the response.
    timeout : float
        Seconds to block in BLPOP before timing out.

    Returns
    -------
    Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]
        The decoded message dict plus its fragment index (default 0) and total
        fragment count (default 1), or (None, None, None) when the popped
        payload is empty.

    Raises
    ------
    TimeoutError
        If BLPOP returns nothing within the timeout period.
    ValueError
        If the payload retrieved from Redis cannot be decoded as JSON.
    """
    response = self.get_client().blpop([channel_name], timeout)
    if response is None:
        raise TimeoutError("No response was received in the specified timeout period")

    payload = response[1] if len(response) > 1 else None
    if not payload:
        return None, None, None

    try:
        message = json.loads(payload)
    except json.JSONDecodeError as e:
        logger.error(f"Failed to decode message: {e}")
        raise ValueError(f"Failed to decode message from Redis: {e}")

    return message, message.get("fragment", 0), message.get("fragment_count", 1)
177
-
178
def fetch_message(self, channel_name: str, timeout: float = 10) -> Optional[Union[str, Dict]]:
    """
    Fetches a message from the specified queue with retries on failure. If the message is fragmented, it will
    continue fetching fragments until all parts have been collected.

    Parameters
    ----------
    channel_name: str
        Channel to fetch the message from.
    timeout : float
        The timeout in seconds for blocking until a message is available. If we receive a multi-part message,
        this value will be temporarily extended in order to collect all fragments.

    Returns
    -------
    Optional[str or Dict]
        The full fetched message, or None if no message could be fetched after retries.

    Raises
    ------
    ValueError
        If fetching the message fails after the specified number of retries or due to other critical errors.
    TimeoutError
        Re-raised when a single-part fetch times out (expected control flow for pollers).
    """
    accumulated_time = 0
    collected_fragments = []
    fragment_count = None
    retries = 0

    logger.debug(f"Starting fetch_message on channel '{channel_name}' with timeout {timeout}s.")

    while True:
        try:
            # Attempt to fetch a message from the Redis queue
            message, fragment, fragment_count = self._check_response(channel_name, timeout)
            logger.debug(f"Fetched fragment: {fragment} (fragment_count: {fragment_count}).")

            if message is not None:
                if fragment_count == 1:
                    return message

                collected_fragments.append(message)
                logger.debug(f"Collected {len(collected_fragments)} of {fragment_count} fragments so far.")

                # If we have collected all fragments, combine and return
                if len(collected_fragments) == fragment_count:
                    logger.debug("All fragments received. Sorting and combining fragments.")
                    # Sort fragments by the 'fragment' field to ensure correct order
                    collected_fragments.sort(key=lambda x: x["fragment"])
                    reconstructed_message = self._combine_fragments(collected_fragments)
                    logger.debug("Message reconstructed successfully. Returning combined message.")
                    return reconstructed_message
            else:
                # _check_response returned (None, None, None): empty payload.
                logger.debug("Received empty response; returning None.")
                return message

        except TimeoutError:
            # When fragments are expected but not all received before timeout
            if fragment_count and fragment_count > 1:
                # Budget: allow up to timeout * fragment_count total seconds
                # before giving up on an in-flight multi-part message.
                accumulated_time += timeout
                logger.debug(
                    f"Timeout occurred waiting for fragments. "
                    f"Accumulated timeout: {accumulated_time}s (Threshold: {timeout * fragment_count}s)."
                )
                if accumulated_time >= (timeout * fragment_count):
                    err_msg = f"Failed to reconstruct message from {channel_name} after {accumulated_time} sec."
                    logger.error(err_msg)
                    raise ValueError(err_msg)
            else:
                raise  # This is expected in many cases, so re-raise it

        except RedisError as err:
            retries += 1
            logger.error(f"Redis error during fetch: {err}")
            backoff_delay = min(2**retries, self._max_backoff)

            # NOTE(review): here max_retries == 0 means "no retries", but
            # submit_message below treats max_retries == 0 as "retry forever"
            # — confirm which semantic is intended and unify.
            if self.max_retries > 0 and retries <= self.max_retries:
                logger.error(f"Fetch attempt failed, retrying in {backoff_delay}s...")
                time.sleep(backoff_delay)
            else:
                logger.error(f"Failed to fetch message from {channel_name} after {retries} attempts.")
                raise ValueError(f"Failed to fetch message from Redis queue after {retries} attempts: {err}")

            # Invalidate client to force reconnection on the next try
            self._client = None

        except Exception as e:
            # Handle non-Redis specific exceptions
            logger.error(f"Unexpected error during fetch from {channel_name}: {e}")
            raise ValueError(f"Unexpected error during fetch: {e}")
267
-
268
- @staticmethod
269
- def _combine_fragments(fragments: List[Dict[str, Any]]) -> Dict:
270
- """
271
- Combines multiple message fragments into a single message by extending the 'data' elements,
272
- retaining the 'status' and 'description' of the first fragment, and removing 'fragment' and 'fragment_counts'.
273
-
274
- Parameters
275
- ----------
276
- fragments : List[Dict[str, Any]]
277
- A list of fragments to be combined.
278
-
279
- Returns
280
- -------
281
- str
282
- The combined message as a JSON string, containing 'status', 'description', and combined 'data'.
283
- """
284
- if not fragments:
285
- raise ValueError("Fragments list is empty")
286
-
287
- # Use 'status' and 'description' from the first fragment
288
- combined_message = {
289
- "status": fragments[0]["status"],
290
- "description": fragments[0]["description"],
291
- "data": [],
292
- "trace": fragments[0].get("trace", {}),
293
- }
294
-
295
- # Combine the 'data' elements from all fragments
296
- for fragment in fragments:
297
- combined_message["data"].extend(fragment["data"])
298
-
299
- return combined_message
300
-
301
def submit_message(self, channel_name: str, message: str) -> None:
    """
    Submits a message to a specified Redis queue with retries on failure.

    Parameters
    ----------
    channel_name : str
        The name of the queue to submit the message to.
    message : str
        The message to submit.

    Raises
    ------
    RedisError
        If submitting the message fails after the specified number of retries.
    """
    retries = 0
    while True:
        try:
            self.get_client().rpush(channel_name, message)
            logger.debug(f"Message submitted to {channel_name}")
            break
        except RedisError as e:
            logger.error(f"Failed to submit message, retrying... Error: {e}")
            self._client = None  # Invalidate client to force reconnection
            retries += 1
            backoff_delay = min(2**retries, self._max_backoff)

            # NOTE(review): here max_retries == 0 means "retry forever", but
            # fetch_message above treats max_retries == 0 as "no retries" —
            # confirm which semantic is intended and unify.
            if self.max_retries == 0 or retries < self.max_retries:
                logger.error(f"Submit attempt failed, retrying in {backoff_delay}s...")
                time.sleep(backoff_delay)
            else:
                logger.error(f"Failed to submit message to {channel_name} after {retries} attempts.")
                raise
File without changes
@@ -1,368 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- # pylint: skip-file
6
-
7
- import logging
8
- import re
9
- import time
10
- from typing import Any
11
-
12
- import httpx
13
- import requests
14
-
15
- from nv_ingest_api.internal.schemas.message_brokers.response_schema import ResponseSchema
16
- from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
- # HTTP Response Statuses that result in marking submission as failed
21
- # 4XX - Any 4XX status is considered a client derived error and will result in failure
22
- # 5XX - Not all 500's are terminal but most are. Those which are listed below
23
- _TERMINAL_RESPONSE_STATUSES = [
24
- 400,
25
- 401,
26
- 402,
27
- 403,
28
- 404,
29
- 405,
30
- 406,
31
- 407,
32
- 408,
33
- 409,
34
- 410,
35
- 411,
36
- 412,
37
- 413,
38
- 414,
39
- 415,
40
- 416,
41
- 417,
42
- 418,
43
- 421,
44
- 422,
45
- 423,
46
- 424,
47
- 425,
48
- 426,
49
- 428,
50
- 429,
51
- 431,
52
- 451,
53
- 500,
54
- 501,
55
- 503,
56
- 505,
57
- 506,
58
- 507,
59
- 508,
60
- 510,
61
- 511,
62
- ]
63
-
64
-
65
class RestClient(MessageBrokerClientBase):
    """
    A client for interfacing with the nv-ingest HTTP endpoint, providing mechanisms for sending and receiving messages
    with retry logic and connection management.

    Parameters
    ----------
    host : str
        The hostname of the HTTP server.
    port : int
        The port number of the HTTP server.
    max_retries : int, optional
        The maximum number of retry attempts for operations. Default is 0 (no retries).
    max_backoff : int, optional
        The maximum backoff delay between retries in seconds. Default is 32 seconds.
    connection_timeout : int, optional
        The timeout in seconds for connecting to the HTTP server. Default is 300 seconds.
    http_allocator : Any, optional
        The HTTP client allocator. Default is httpx.AsyncClient.

    Attributes
    ----------
    client : Any
        The HTTP client instance used for operations.
        NOTE(review): fetch_message/submit_message below issue their requests
        through the `requests` library directly, so this allocated client is
        effectively unused for transport — confirm whether it is still needed.
    """

    def __init__(
        self,
        host: str,
        port: int,
        max_retries: int = 0,
        max_backoff: int = 32,
        connection_timeout: int = 300,
        http_allocator: Any = httpx.AsyncClient,
    ):
        self._host = host
        self._port = port
        self._max_retries = max_retries
        self._max_backoff = max_backoff
        self._connection_timeout = connection_timeout
        self._http_allocator = http_allocator
        self._client = self._http_allocator()
        self._retries = 0

        # REST endpoints exposed by the nv-ingest service.
        self._submit_endpoint = "/v1/submit_job"
        self._fetch_endpoint = "/v1/fetch_job"
111
-
112
def _connect(self) -> None:
    """Recreate the HTTP client when the server does not answer a ping."""
    if self.ping().response_code == 0:
        return
    logger.debug("Reconnecting to HTTP server")
    self._client = self._http_allocator()
121
-
122
@property
def max_retries(self) -> int:
    """int: Maximum number of retry attempts allowed for fetch/submit operations."""
    return self._max_retries

@max_retries.setter
def max_retries(self, value: int) -> None:
    self._max_retries = value
129
-
130
def get_client(self) -> Any:
    """
    Returns a HTTP client instance, reconnecting if necessary.

    Returns
    -------
    Any
        The HTTP client instance.
    """
    # Lazily reconnect only when the client has been invalidated (set to None
    # by the fetch/submit error paths).
    if self._client is None:
        self._connect()
    return self._client
142
-
143
def ping(self) -> ResponseSchema:
    """
    Checks if the HTTP server is responsive.

    Returns
    -------
    ResponseSchema
        response_code 0 when the ping call succeeds, 1 otherwise.
    """
    try:
        # Implement a simple GET request to a health endpoint or root
        # NOTE(review): httpx client objects do not appear to expose a
        # `.ping()` method, so this call likely raises AttributeError and
        # this method always reports failure — confirm and replace with a
        # real health-check request.
        self._client.ping()
        return ResponseSchema(response_code=0)
    except (httpx.HTTPError, AttributeError):
        return ResponseSchema(response_code=1, response_reason="Failed to ping HTTP server")
158
-
159
- @staticmethod
160
- def generate_url(user_provided_url, user_provided_port) -> str:
161
- """Examines the user defined URL for http*://. If that
162
- pattern is detected the URL is used as provided by the user.
163
- If that pattern does not exist then the assumption is made that
164
- the endpoint is simply `http://` and that is prepended
165
- to the user supplied endpoint.
166
-
167
- Args:
168
- user_provided_url str: Endpoint where the Rest service is running
169
-
170
- Returns:
171
- str: Fully validated URL
172
- """
173
- if not re.match(r"^https?://", user_provided_url):
174
- # Add the default `http://` if it's not already present in the URL
175
- user_provided_url = f"http://{user_provided_url}:{user_provided_port}"
176
- else:
177
- user_provided_url = f"{user_provided_url}:{user_provided_port}"
178
- return user_provided_url
179
-
180
def fetch_message(self, job_id: str, timeout: float = (10, 600)) -> ResponseSchema:
    """
    Fetches a message from the specified queue with retries on failure, handling streaming HTTP responses.

    Parameters
    ----------
    job_id : str
        The server-side job identifier.
    timeout : float
        The timeout in seconds for blocking until a message is available.
        NOTE(review): the default is actually a (connect, read) tuple despite
        the `float` annotation, and the parameter is never used — the request
        below hard-codes timeout=(30, 600). Confirm intent and wire it through.

    Returns
    -------
    ResponseSchema
        The fetched message wrapped in a ResponseSchema object.
    """
    retries = 0
    while True:
        try:
            url = f"{self.generate_url(self._host, self._port)}{self._fetch_endpoint}/{job_id}"
            logger.debug(f"Invoking fetch_message http endpoint @ '{url}'")

            # Fetch using streaming response
            with requests.get(url, timeout=(30, 600), stream=True) as result:
                response_code = result.status_code

                if response_code in _TERMINAL_RESPONSE_STATUSES:
                    # Terminal response code; return error ResponseSchema
                    return ResponseSchema(
                        response_code=1,
                        response_reason=(
                            f"Terminal response code {response_code} received when fetching JobSpec: {job_id}"
                        ),
                        response=result.text,
                    )

                if response_code == 200:
                    # Handle streaming response, reconstructing payload incrementally
                    response_chunks = []
                    for chunk in result.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                        if chunk:
                            response_chunks.append(chunk)
                    full_response = b"".join(response_chunks).decode("utf-8")

                    return ResponseSchema(
                        response_code=0,
                        response_reason="OK",
                        response=full_response,
                    )

                elif response_code == 202:
                    # Job is not ready yet
                    return ResponseSchema(
                        response_code=1,
                        response_reason="Job is not ready yet. Retry later.",
                    )

                else:
                    try:
                        # Retry the operation
                        retries = self.perform_retry_backoff(retries)
                    except RuntimeError as rte:
                        raise rte

        except (ConnectionError, requests.HTTPError, requests.exceptions.ConnectionError) as err:
            logger.error(f"Error during fetching, retrying... Error: {err}")
            self._client = None  # Invalidate client to force reconnection
            if "Connection refused" in str(err):
                logger.debug(
                    "Connection refused encountered during fetch; sleeping for 10 seconds before retrying."
                )
                time.sleep(10)
            try:
                retries = self.perform_retry_backoff(retries)
            except RuntimeError as rte:
                # Max retries reached
                return ResponseSchema(response_code=1, response_reason=str(rte), response=str(err))
        except TimeoutError:
            raise
        except Exception as e:
            # Handle non-http specific exceptions
            # NOTE(review): if generate_url raised above, `url` is unbound here
            # and this f-string would itself raise NameError — confirm.
            logger.error(f"Unexpected error during fetch from {url}: {e}")
            return ResponseSchema(
                response_code=1, response_reason=f"Unexpected error during fetch: {e}", response=None
            )
265
-
266
def submit_message(self, channel_name: str, message: str, for_nv_ingest: bool = False) -> ResponseSchema:
    """
    Submits a JobSpec to a specified HTTP endpoint with retries on failure.

    Parameters
    ----------
    channel_name : str
        Not used as part of RestClient but defined in MessageClientBase.
    message : str
        The message to submit.
    for_nv_ingest : bool
        Not used as part of RestClient but defined in MessageClientBase.

    Returns
    -------
    ResponseSchema
        The response from the server wrapped in a ResponseSchema object.
    """
    retries = 0
    while True:
        try:
            # Submit via HTTP
            url = f"{self.generate_url(self._host, self._port)}{self._submit_endpoint}"
            # NOTE(review): no timeout is set on this POST, so a stalled
            # server can block this call indefinitely — consider adding one.
            result = requests.post(url, json={"payload": message}, headers={"Content-Type": "application/json"})

            response_code = result.status_code
            if response_code in _TERMINAL_RESPONSE_STATUSES:
                # Terminal response code; return error ResponseSchema
                return ResponseSchema(
                    response_code=1,
                    response_reason=f"Terminal response code {response_code} received when submitting JobSpec",
                    trace_id=result.headers.get("x-trace-id"),
                )
            else:
                # If 200 we are good, otherwise let's try again
                if response_code == 200:
                    logger.debug(f"JobSpec successfully submitted to http endpoint {self._submit_endpoint}")
                    # The REST interface returns a JobId, so we capture that here
                    # NOTE(review): transaction_id is set to the raw response
                    # body (result.text), same as `response` — confirm that is
                    # the intended shape rather than a parsed job id.
                    x_trace_id = result.headers.get("x-trace-id")
                    return ResponseSchema(
                        response_code=0,
                        response_reason="OK",
                        response=result.text,
                        transaction_id=result.text,
                        trace_id=x_trace_id,
                    )
                else:
                    # Retry the operation
                    retries = self.perform_retry_backoff(retries)
        except requests.RequestException as e:
            logger.error(f"Failed to submit job, retrying... Error: {e}")
            self._client = None  # Invalidate client to force reconnection
            if "Connection refused" in str(e):
                logger.debug(
                    "Connection refused encountered during submission; sleeping for 10 seconds before retrying."
                )
                time.sleep(10)
            try:
                retries = self.perform_retry_backoff(retries)
            except RuntimeError as rte:
                # Max retries reached
                return ResponseSchema(response_code=1, response_reason=str(rte), response=str(e))
        except Exception as e:
            # Handle non-http specific exceptions
            logger.error(f"Unexpected error during submission of JobSpec to {url}: {e}")
            return ResponseSchema(
                response_code=1, response_reason=f"Unexpected error during JobSpec submission: {e}", response=None
            )
334
-
335
def perform_retry_backoff(self, existing_retries) -> int:
    """
    Sleep for an exponential-backoff delay and return the incremented retry count.

    The delay is min(2 ** existing_retries, max_backoff). When the retry
    budget is exhausted — or retries are disabled entirely — a RuntimeError
    is raised instead of sleeping.

    Parameters
    ----------
    existing_retries : int
        The number of retries that have been attempted for this operation thus far.

    Returns
    -------
    int
        The updated number of retry attempts that have been made for this operation.

    Raises
    ------
    RuntimeError
        Raised if the maximum number of retry attempts has been reached.
    """
    backoff_delay = min(2**existing_retries, self._max_backoff)
    logger.debug(
        f"Retry #: {existing_retries} of max_retries: {self.max_retries} | "
        f"current backoff_delay: {backoff_delay}s of max_backoff: {self._max_backoff}s"
    )

    can_retry = self.max_retries > 0 and existing_retries < self.max_retries
    if not can_retry:
        raise RuntimeError(f"Max retry attempts of {self.max_retries} reached")

    logger.error(f"Operation failed, retrying in {backoff_delay}s...")
    time.sleep(backoff_delay)
    return existing_retries + 1