nv-ingest-api 2025.7.16.dev20250716__py3-none-any.whl → 2025.7.18.dev20250718__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +10 -7
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +16 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +37 -224
- nv_ingest_api/internal/primitives/nim/nim_client.py +55 -14
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +16 -5
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/RECORD +24 -24
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.16.dev20250716.dist-info → nv_ingest_api-2025.7.18.dev20250718.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/transform/split_text.py

@@ -141,14 +141,19 @@ def transform_text_split_and_tokenize_internal(

     model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")

-    if …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    if model_predownload_path is not None:
+        if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+            tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+        ):
+            tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+        elif os.path.exists(
+            os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")
+        ) and (tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"):
+            tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+    # Defaulto to intfloat/e5-large-unsupervised if no tokenizer predownloaded or specified
+    if tokenizer_identifier is None:
+        tokenizer_identifier = "intfloat/e5-large-unsupervised"

     tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)

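The new block resolves a locally predownloaded tokenizer before falling back to a Hugging Face hub identifier. A minimal sketch of the same resolution order, assuming the `MODEL_PREDOWNLOAD_PATH` directory layout shown in the hunk (`<path>/<model>/tokenizer/tokenizer.json`); `resolve_tokenizer` is a hypothetical helper, not part of the package:

```python
import os

# Hypothetical helper mirroring the hunk's resolution order: a predownloaded
# tokenizer directory is used when it exists and matches the requested model,
# and "intfloat/e5-large-unsupervised" is the final fallback.
def resolve_tokenizer(identifier=None):
    predownload = os.environ.get("MODEL_PREDOWNLOAD_PATH")
    candidates = {
        "meta-llama/Llama-3.2-1B": "llama-3.2-1b/tokenizer/",
        "intfloat/e5-large-unsupervised": "e5-large-unsupervised/tokenizer/",
    }
    if predownload is not None:
        for hub_id, subdir in candidates.items():
            local = os.path.join(predownload, subdir)
            if os.path.exists(os.path.join(local, "tokenizer.json")) and identifier in (None, hub_id):
                return local  # AutoTokenizer.from_pretrained accepts a local directory
    return identifier or "intfloat/e5-large-unsupervised"
```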
nv_ingest_api/util/image_processing/table_and_chart.py

@@ -46,14 +46,14 @@ def process_yolox_graphic_elements(yolox_text_dict)
     return chart_content.strip()


-def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
+def match_bboxes(yolox_box, ocr_boxes, already_matched=None, delta=2.0):
     """
     Associates a yolox-graphic-elements box to PaddleOCR bboxes, by taking overlapping boxes.
     Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
     Boxes are expeceted in format (x0, y0, x1, y1)
     Args:
         yolox_box (np array [4]): Cached Bbox.
-        paddle_ocr_boxes (np array [n x 4]): PaddleOCR boxes
+        ocr_boxes (np array [n x 4]): PaddleOCR boxes
         already_matched (list or None, Optional): Already matched ids to ignore.
         delta (float, Optional): IoU delta for considering several boxes. Defaults to 2..
     Returns:
@@ -61,10 +61,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0)
     """
     x0_1, y0_1, x1_1, y1_1 = yolox_box
     x0_2, y0_2, x1_2, y1_2 = (
-        paddle_ocr_boxes[:, 0],
-        paddle_ocr_boxes[:, 1],
-        paddle_ocr_boxes[:, 2],
-        paddle_ocr_boxes[:, 3],
+        ocr_boxes[:, 0],
+        ocr_boxes[:, 1],
+        ocr_boxes[:, 2],
+        ocr_boxes[:, 3],
     )

     # Intersection
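The criterion from the docstring, `iou > max_iou / delta`, keeps every OCR box whose overlap is within a factor `delta` of the best match, rather than only the single best box. A self-contained sketch of that criterion (an illustration of the technique, not the package's exact code):

```python
import numpy as np

def relative_iou_matches(ref_box, boxes, delta=2.0):
    """Return indices of boxes whose IoU with ref_box is > max_iou / delta."""
    boxes = np.asarray(boxes, dtype=float)
    x0 = np.maximum(ref_box[0], boxes[:, 0])
    y0 = np.maximum(ref_box[1], boxes[:, 1])
    x1 = np.minimum(ref_box[2], boxes[:, 2])
    y1 = np.minimum(ref_box[3], boxes[:, 3])
    inter = np.maximum(0, x1 - x0) * np.maximum(0, y1 - y0)
    area_ref = (ref_box[2] - ref_box[0]) * (ref_box[3] - ref_box[1])
    area_boxes = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    iou = inter / (area_ref + area_boxes - inter + 1e-6)
    if iou.max() <= 0:
        return np.array([], dtype=int)
    return np.where(iou > iou.max() / delta)[0]

# The second and third boxes overlap the reference comparably, so both are kept.
print(relative_iou_matches([0, 0, 10, 10], [[20, 20, 30, 30], [0, 0, 9, 10], [1, 0, 10, 10]]))  # [1 2]
```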
@@ -92,10 +92,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0)
     return matches


-def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, paddle_txts):
+def join_yolox_graphic_elements_and_ocr_output(yolox_output, ocr_boxes, ocr_txts):
     """
     Matching boxes
-    We need to associate a text to the paddle detections.
+    We need to associate a text to the ocr detections.
     For each class and for each CACHED detections, we look for overlapping text bboxes
     with IoU > max_iou / delta where max_iou is the biggest found overlap.
     Found texts are added to the class representation, and removed from the texts to match
@@ -113,18 +113,18 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         "value_label",
     ]

-    paddle_txts = np.array(paddle_txts)
-    paddle_boxes = np.array(paddle_boxes)
+    ocr_txts = np.array(ocr_txts)
+    ocr_boxes = np.array(ocr_boxes)

-    if (paddle_txts.size == 0) or (paddle_boxes.size == 0):
+    if (ocr_txts.size == 0) or (ocr_boxes.size == 0):
         return {}

-    paddle_boxes = np.array(
+    ocr_boxes = np.array(
         [
-            paddle_boxes[:, :, 0].min(-1),
-            paddle_boxes[:, :, 1].min(-1),
-            paddle_boxes[:, :, 0].max(-1),
-            paddle_boxes[:, :, 1].max(-1),
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T

@@ -139,10 +139,10 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     for yolox_box in yolox_output[k]:
         # if there's a score at the end, drop the score.
         yolox_box = yolox_box[:4]
-        paddle_ids = match_bboxes(yolox_box, paddle_boxes, already_matched=already_matched, delta=4)
+        ocr_ids = match_bboxes(yolox_box, ocr_boxes, already_matched=already_matched, delta=4)

-        if len(paddle_ids) > 0:
-            text = " ".join(paddle_txts[paddle_ids].tolist())
+        if len(ocr_ids) > 0:
+            text = " ".join(ocr_txts[ocr_ids].tolist())
             texts.append(text)

     processed_texts = []
@@ -161,7 +161,7 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     return results


-def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
+def convert_ocr_response_to_psuedo_markdown(bboxes, texts):
     if (not bboxes) or (not texts):
         return ""

@@ -186,22 +186,22 @@ def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
     return results


-def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_boxes, paddle_ocr_txts):
-    if (not paddle_ocr_boxes) or (not paddle_ocr_txts):
+def join_yolox_table_structure_and_ocr_output(yolox_cell_preds, ocr_boxes, ocr_txts):
+    if (not ocr_boxes) or (not ocr_txts):
         return ""

-    paddle_ocr_boxes = np.array(paddle_ocr_boxes)
-    paddle_ocr_boxes_ = np.array(
+    ocr_boxes = np.array(ocr_boxes)
+    ocr_boxes_ = np.array(
         [
-            paddle_ocr_boxes[:, :, 0].min(-1),
-            paddle_ocr_boxes[:, :, 1].min(-1),
-            paddle_ocr_boxes[:, :, 0].max(-1),
-            paddle_ocr_boxes[:, :, 1].max(-1),
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T

     assignments = []
-    for i, (b, t) in enumerate(zip(paddle_ocr_boxes_, paddle_ocr_txts)):
+    for i, (b, t) in enumerate(zip(ocr_boxes_, ocr_txts)):
         # Find a cell
         matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
         cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b
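The `np.array([...]).T` construction above collapses OCR polygons of shape `(n, 4, 2)` — four corner points per detection — into axis-aligned `(n, 4)` boxes by taking per-polygon min/max coordinates. The same conversion in isolation:

```python
import numpy as np

def quads_to_xyxy(quads):
    """Collapse (n, 4, 2) corner-point polygons into (n, 4) [x0, y0, x1, y1] boxes."""
    quads = np.asarray(quads, dtype=float)
    return np.array(
        [
            quads[:, :, 0].min(-1),  # x0: leftmost corner
            quads[:, :, 1].min(-1),  # y0: topmost corner
            quads[:, :, 0].max(-1),  # x1: rightmost corner
            quads[:, :, 1].max(-1),  # y1: bottommost corner
        ]
    ).T

# One slightly rotated quadrilateral becomes its axis-aligned bounding box.
print(quads_to_xyxy([[[0, 1], [10, 0], [11, 5], [1, 6]]]))  # [[ 0.  0. 11.  6.]]
```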
@@ -221,7 +221,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         assignments.append(
             {
                 "index": i,
-                "paddle_box": b,
+                "ocr_box": b,
                 "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                 "cell_id": matches_cell[0] if len(matches_cell) else -1,
                 "cell": cell,
@@ -249,13 +249,13 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     mat = build_markdown(df_table)
     markdown_table = display_markdown(mat, use_header=False)

-    all_boxes = np.stack(df_table.paddle_box.values)
+    all_boxes = np.stack(df_table.ocr_box.values)
     table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])

     df_table_to_text = pd.DataFrame(
         [
             {
-                "paddle_box": table_box,
+                "ocr_box": table_box,
                 "text": markdown_table,
                 "is_table": True,
             }
@@ -264,7 +264,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     # Final text representations dataframe
     df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)

-    df_text = df_text.rename(columns={"paddle_box": "box"})
+    df_text = df_text.rename(columns={"ocr_box": "box"})

     # Sort by y and x
     df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
@@ -297,12 +297,12 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     return result


-def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
+def assign_boxes(ocr_box, boxes, delta=2.0, min_overlap=0.25):
     """
-    Assigns the closest bounding boxes to a reference `paddle_box` based on overlap.
+    Assigns the closest bounding boxes to a reference `ocr_box` based on overlap.

     Args:
-        paddle_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
+        ocr_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
         boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
         delta (float, optional): Factor for matches relative to the best overlap. Defaults to 2.0.
         min_overlap (float, optional): Minimum required overlap for a match. Defaults to 0.25.
@@ -316,7 +316,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25)

     boxes = np.array(boxes)

-    x0_1, y0_1, x1_1, y1_1 = paddle_box
+    x0_1, y0_1, x1_1, y1_1 = ocr_box
     x0_2, y0_2, x1_2, y1_2 = (
         boxes[:, 0],
         boxes[:, 1],
@@ -331,7 +331,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25)
     inter_x1 = np.minimum(x1_1, x1_2)
     inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)

-    # Normalize by paddle_box size
+    # Normalize by ocr_box size
     area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
     ious = inter_area / (area_1 + 1e-6)

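Note that `assign_boxes` normalizes the intersection by the reference box's own area rather than by the union, so the score is an overlap fraction, not a symmetric IoU; `min_overlap` then acts as an absolute floor. A sketch of that normalization (illustrative only):

```python
import numpy as np

def overlap_fraction(ref_box, boxes):
    """Intersection area divided by the reference box area (not a symmetric IoU)."""
    boxes = np.asarray(boxes, dtype=float)
    inter_x0 = np.maximum(ref_box[0], boxes[:, 0])
    inter_y0 = np.maximum(ref_box[1], boxes[:, 1])
    inter_x1 = np.minimum(ref_box[2], boxes[:, 2])
    inter_y1 = np.minimum(ref_box[3], boxes[:, 3])
    inter = np.maximum(0, inter_x1 - inter_x0) * np.maximum(0, inter_y1 - inter_y0)
    area_ref = (ref_box[2] - ref_box[0]) * (ref_box[3] - ref_box[1])
    return inter / (area_ref + 1e-6)

# A small text box fully inside a large table cell scores ~1.0 against it, which
# is why normalizing by the reference (text) area suits cell assignment.
print(overlap_fraction([2, 2, 4, 3], [[0, 0, 10, 10], [5, 5, 6, 6]]))  # ~[1. 0.]
```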
@@ -385,16 +385,16 @@ def merge_text_in_cell(df_cell):
     Returns:
         pandas.DataFrame: Updated DataFrame with merged text and a single bounding box.
     """
-    …
+    ocr_boxes = np.stack(df_cell["ocr_box"].values)

-    df_cell["x"] = (…
-    df_cell["y"] = (…
+    df_cell["x"] = (ocr_boxes[:, 0] - ocr_boxes[:, 0].min()) // 10
+    df_cell["y"] = (ocr_boxes[:, 1] - ocr_boxes[:, 1].min()) // 10
     df_cell = df_cell.sort_values(["y", "x"])

     text = " ".join(df_cell["text"].values.tolist())
     df_cell["text"] = text
     df_cell = df_cell.head(1)
-    df_cell["paddle_box"] = df_cell["cell"]
+    df_cell["ocr_box"] = df_cell["cell"]
     df_cell.drop(["x", "y"], axis=1, inplace=True)

     return df_cell
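`merge_text_in_cell` orders the text fragments inside a cell by integer-dividing coordinate offsets by 10, so fragments on roughly the same line share a `y` bucket and then sort left to right. The bucketing in isolation:

```python
import numpy as np

# Two fragments whose y coordinates differ by less than 10 px fall into the
# same bucket (y // 10) and are therefore ordered by x, i.e. reading order.
ys = np.array([103, 97, 130])
xs = np.array([250, 40, 10])
buckets = (ys - ys.min()) // 10    # -> [0, 0, 3]
order = np.lexsort((xs, buckets))  # sort by bucket first, then by x
print(order)                       # [1 0 2]
```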
@@ -447,3 +447,58 @@ def display_markdown(
     markdown_table = "\n".join("| " + " | ".join(row) + " |" for row in data)

     return markdown_table
+
+
+def reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10):
+    """
+    Reorders the boxes in reading order.
+    If mode is "center", the boxes are reordered using bbox center.
+    If mode is "top_left", the boxes are reordered using the top left corner.
+    If dbscan_eps is not 0, the boxes are reordered using DBSCAN clustering.
+
+    Args:
+        boxes (np array [n x 4 x 2]): The bounding boxes of the OCR results.
+        texts (np array [n]): The text of the OCR results.
+        confs (np array [n]): The confidence scores of the OCR results.
+        mode (str, optional): The mode to reorder the boxes. Defaults to "center".
+        dbscan_eps (float, optional): The epsilon parameter for DBSCAN. Defaults to 10.
+
+    Returns:
+        List[List[int, ...]]: The reordered bounding boxes.
+        List[str]: The reordered texts.
+        List[float]: The reordered confidence scores.
+    """
+    df = pd.DataFrame(
+        [[b, t, c] for b, t, c in zip(boxes, texts, confs)],
+        columns=["bbox", "text", "conf"],
+    )
+
+    if mode == "center":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0] + box[2][0]) / 2)
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1] + box[2][1]) / 2)
+    elif mode == "top_left":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0]))
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1]))
+
+    if dbscan_eps:
+        do_naive_sorting = False
+        try:
+            dbscan = DBSCAN(eps=dbscan_eps, min_samples=1)
+            dbscan.fit(df["y"].values[:, None])
+            df["cluster"] = dbscan.labels_
+            df["cluster_centers"] = df.groupby("cluster")["y"].transform("mean").astype(int)
+            df = df.sort_values(["cluster_centers", "x"], ascending=[True, True], ignore_index=True)
+        except ValueError:
+            do_naive_sorting = True
+    else:
+        do_naive_sorting = True
+
+    if do_naive_sorting:
+        df["y"] = np.round((df["y"] - df["y"].min()) // 5, 0)
+        df = df.sort_values(["y", "x"], ascending=[True, True], ignore_index=True)
+
+    bboxes = df["bbox"].values.tolist()
+    texts = df["text"].values.tolist()
+    confs = df["conf"].values.tolist()
+
+    return bboxes, texts, confs
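A usage sketch for the new `reorder_boxes` helper (assuming it is imported from the `table_and_chart` module this hunk modifies): DBSCAN with `min_samples=1` clusters the y coordinates into text lines, so boxes come back line by line, left to right.

```python
from nv_ingest_api.util.image_processing.table_and_chart import reorder_boxes

# Three detections: two on one visual line (y = 103 and y = 100), one below (y = 160).
boxes = [
    [[300, 103], [380, 103], [380, 118], [300, 118]],  # right word, first line
    [[10, 100], [90, 100], [90, 115], [10, 115]],      # left word, first line
    [[10, 160], [120, 160], [120, 175], [10, 175]],    # second line
]
texts = ["world", "hello", "again"]
confs = [0.98, 0.99, 0.97]

_, ordered_texts, _ = reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10)
print(ordered_texts)  # expected: ['hello', 'world', 'again']
```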
nv_ingest_api/util/image_processing/transforms.py

@@ -209,6 +209,7 @@ def pad_image(
     target_height: int = DEFAULT_MAX_HEIGHT,
     background_color: int = 255,
     dtype=np.uint8,
+    how: str = "center",
 ) -> Tuple[np.ndarray, Tuple[int, int]]:
     """
     Pads a NumPy array representing an image to the specified target dimensions.

@@ -217,6 +218,8 @@ def pad_image(
     in that dimension. If the target dimensions are larger, the image will be centered within the
     canvas of the specified target size, with the remaining space filled with white padding.

+    The padding can be done around the center (how="center"), or to the bottom right (how="bottom_right").
+
     Parameters
     ----------
     array : np.ndarray

@@ -225,6 +228,8 @@ def pad_image(
         The desired target width of the padded image. Defaults to DEFAULT_MAX_WIDTH.
     target_height : int, optional
         The desired target height of the padded image. Defaults to DEFAULT_MAX_HEIGHT.
+    how : str, optional
+        The method to pad the image. Defaults to "center".

     Returns
     -------

@@ -249,17 +254,23 @@ def pad_image(
     """
     height, width = array.shape[:2]

-    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
-    pad_height = max((target_height - height) // 2, 0)
-    pad_width = max((target_width - width) // 2, 0)
-
     # Determine final canvas size (may be equal to original if target is smaller)
     final_height = max(height, target_height)
     final_width = max(width, target_width)

     # Create the canvas and place the original image on it
     canvas = background_color * np.ones((final_height, final_width, array.shape[2]), dtype=dtype)
-    canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203
+
+    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
+    if how == "center":
+        pad_height = max((target_height - height) // 2, 0)
+        pad_width = max((target_width - width) // 2, 0)
+
+        canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203
+    elif how == "bottom_right":
+        pad_height, pad_width = 0, 0
+
+        canvas[:height, :width] = array  # noqa: E203

     return canvas, (pad_width, pad_height)

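A usage sketch for the new `how` parameter (assuming `pad_image` is imported from the `transforms` module this hunk modifies): with `how="bottom_right"` the image stays anchored at the top-left corner and the returned offsets are `(0, 0)`, which simplifies mapping coordinates back to the original image.

```python
import numpy as np
from nv_ingest_api.util.image_processing.transforms import pad_image

img = np.zeros((100, 200, 3), dtype=np.uint8)  # h=100, w=200

centered, (pad_w, pad_h) = pad_image(img, target_width=400, target_height=300, how="center")
print(centered.shape, (pad_w, pad_h))  # (300, 400, 3) (100, 100)

anchored, offsets = pad_image(img, target_width=400, target_height=300, how="bottom_right")
print(anchored.shape, offsets)         # (300, 400, 3) (0, 0)
```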
nv_ingest_api/util/message_brokers/simple_message_broker/broker.py

@@ -250,7 +250,7 @@ class SimpleMessageBrokerHandler(socketserver.BaseRequestHandler):
         with queue_lock:
             if queue.empty():
                 # Return failure response immediately
-                response = ResponseSchema(response_code=…
+                response = ResponseSchema(response_code=2, response_reason="Job not ready")
                 self._send_response(response)
                 return
         # Pop the message from the queue
nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py

@@ -14,7 +14,7 @@ import logging
 from typing import Optional, Tuple, Union

 from nv_ingest_api.internal.schemas.message_brokers.response_schema import ResponseSchema
-from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase
+from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase

 logger = logging.getLogger(__name__)

@@ -108,29 +108,23 @@ class SimpleClient(MessageBrokerClientBase):
         return self._handle_push(queue_name, message, timeout, for_nv_ingest)

     def fetch_message(
-        self,
-        queue_name: str,
-        timeout: Optional[Tuple[int, Union[float]]] = (100, None),
-        override_fetch_mode: FetchMode = None,
+        self, queue_name: str, timeout: Optional[Tuple[int, Union[float, None]]] = (1200, None)
     ) -> ResponseSchema:
         """
-        Fetch a message from …
+        Fetch a message from a specified queue.

         Parameters
         ----------
         queue_name : str
             The name of the queue.
-        timeout : …
-            …
+        timeout : tuple, optional
+            A tuple containing the timeout value and an unused second element.

         Returns
         -------
         ResponseSchema
-            The response …
+            The response from the broker.
         """
-        if isinstance(timeout, int):
-            timeout = (timeout, None)
-
         return self._handle_pop(queue_name, timeout)

     def ping(self) -> ResponseSchema:
@@ -208,6 +202,7 @@ class SimpleClient(MessageBrokerClientBase):

         try:
             with socket.create_connection((self._host, self._port), timeout=self._connection_timeout) as sock:
+                sock.settimeout(self._connection_timeout)
                 self._send(sock, json.dumps(command).encode("utf-8"))
                 # Receive initial response with transaction ID
                 response_data = self._recv(sock)
@@ -241,8 +236,9 @@ class SimpleClient(MessageBrokerClientBase):

                 return ResponseSchema(**final_response)

-        except (ConnectionError, socket.error, BrokenPipeError):
-            …
+        except (ConnectionError, socket.error, BrokenPipeError, socket.timeout) as e:
+            logger.debug(f"Connection error during PUSH: {e}")
+            pass  # Will be retried
         except json.JSONDecodeError:
             return ResponseSchema(response_code=1, response_reason="Invalid JSON response from server.")
         except Exception as e:
@@ -272,61 +268,67 @@ class SimpleClient(MessageBrokerClientBase):

         command = {"command": "POP", "queue_name": queue_name}

-        …
+        timeout_val = timeout[0] if isinstance(timeout, tuple) else timeout

-        if …
-            command["timeout"] = …
+        if timeout_val is not None:
+            command["timeout"] = timeout_val

         start_time = time.time()
+        backoff_delay = 1  # Start with a 1-second backoff
+
         while True:
             elapsed = time.time() - start_time
-            …
-            …
-                return ResponseSchema(response_code=1, response_reason="POP operation timed out.")
+            if timeout_val is not None and elapsed >= timeout_val:
+                return ResponseSchema(response_code=2, response_reason="Job not ready.")

             try:
                 with socket.create_connection((self._host, self._port), timeout=self._connection_timeout) as sock:
+                    sock.settimeout(self._connection_timeout)
                     self._send(sock, json.dumps(command).encode("utf-8"))
                     # Receive initial response with transaction ID and message
                     response_data = self._recv(sock)
                     response = json.loads(response_data)

-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
+                    # The broker now returns a response_code of 2 for a timeout, which the high-level
+                    # client should handle as a retryable event.
+                    if response.get("response_code") == 2:
+                        # Queue is empty or job not ready, continue to backoff and retry
+                        pass
+                    elif response.get("response_code") != 0:
+                        return ResponseSchema(**response)
+                    else:
+                        # Success case: we received a message.
+                        if "transaction_id" not in response:
+                            return ResponseSchema(response_code=1, response_reason="No transaction_id in response.")

-                    …
+                        transaction_id = response["transaction_id"]
+                        message = response.get("response")

-                    …
-                    …
+                        # Send ACK
+                        ack_data = json.dumps({"transaction_id": transaction_id, "ack": True}).encode("utf-8")
+                        self._send(sock, ack_data)

-                    …
-                    …
-                    …
+                        # Receive final response
+                        final_response_data = self._recv(sock)
+                        final_response = json.loads(final_response_data)

-                    …
-                    …
-                    …
-                    …
-                    if final_response.get("response_code") == 0:
-                        return ResponseSchema(response_code=0, response=message, transaction_id=transaction_id)
-                    else:
-                        return ResponseSchema(**final_response)
+                        if final_response.get("response_code") == 0:
+                            return ResponseSchema(response_code=0, response=message, transaction_id=transaction_id)
+                        else:
+                            return ResponseSchema(**final_response)

-            except (ConnectionError, socket.error, BrokenPipeError):
-                …
+            except (ConnectionError, socket.error, BrokenPipeError, socket.timeout) as e:
+                # Let the high-level client handle connection errors as retryable.
+                logger.debug(f"Connection error during POP: {e}, will retry after backoff.")
+                pass  # Fall through to backoff and retry
             except json.JSONDecodeError:
                 return ResponseSchema(response_code=1, response_reason="Invalid JSON response from server.")
             except Exception as e:
                 return ResponseSchema(response_code=1, response_reason=str(e))

-            …
+            # Exponential backoff
+            time.sleep(backoff_delay)
+            backoff_delay = min(backoff_delay * 2, self._max_backoff)

     def _execute_simple_command(self, command: dict) -> ResponseSchema:
         """
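The reworked POP loop replaces the old hard timeout failure with capped exponential backoff: a retryable outcome (response code 2 or a connection error) falls through to a sleep that doubles up to `self._max_backoff`. The generic pattern as a minimal sketch (`fetch_once` and `max_backoff` are placeholder names, not package API):

```python
import time

def poll_with_backoff(fetch_once, timeout=1200, max_backoff=32):
    """Retry a fetch until success or timeout, doubling the sleep each round."""
    start = time.time()
    delay = 1  # start with a 1-second backoff, as in the diff
    while True:
        if time.time() - start >= timeout:
            return None  # caller treats this as "job not ready"
        result = fetch_once()
        if result is not None:
            return result
        time.sleep(delay)
        delay = min(delay * 2, max_backoff)  # cap the backoff
```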
@@ -350,12 +352,13 @@ class SimpleClient(MessageBrokerClientBase):

         try:
             with socket.create_connection((self._host, self._port), timeout=self._connection_timeout) as sock:
+                sock.settimeout(self._connection_timeout)
                 self._send(sock, data)
                 response_data = self._recv(sock)
                 response = json.loads(response_data)
                 return ResponseSchema(**response)
-        except (ConnectionError, socket.error, BrokenPipeError) as e:
-            return ResponseSchema(response_code=…
+        except (ConnectionError, socket.error, BrokenPipeError, socket.timeout) as e:
+            return ResponseSchema(response_code=2, response_reason=f"Connection error: {e}")
         except json.JSONDecodeError:
             return ResponseSchema(response_code=1, response_reason="Invalid JSON response from server.")
         except Exception as e: