nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/primitives/nim/model_interface/paddle.py
@@ -0,0 +1,462 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import json
+ import logging
+ from typing import Any, List, Tuple
+ from typing import Dict
+ from typing import Optional
+
+ import numpy as np
+
+ from nv_ingest_api.internal.primitives.nim import ModelInterface
+ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import preprocess_image_for_paddle
+ from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
+
+ logger = logging.getLogger(__name__)
+
+
+ class PaddleOCRModelInterface(ModelInterface):
+     """
+     An interface for handling inference with a PaddleOCR model, supporting both gRPC and HTTP protocols.
+     """
+
+     def name(self) -> str:
+         """
+         Get the name of the model interface.
+
+         Returns
+         -------
+         str
+             The name of the model interface.
+         """
+         return "PaddleOCR"
+
+     def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Decode one or more base64-encoded images into NumPy arrays and store them in `data`.
+
+         Parameters
+         ----------
+         data : dict of str -> Any
+             The input data containing either:
+             - 'base64_image': a single base64-encoded image, or
+             - 'base64_images': a list of base64-encoded images.
+
+         Returns
+         -------
+         dict of str -> Any
+             The updated data dictionary with the following key added:
+             - "image_arrays": list of decoded NumPy arrays of shape (H, W, C).
+
+         Raises
+         ------
+         KeyError
+             If neither 'base64_image' nor 'base64_images' is found in `data`.
+         ValueError
+             If 'base64_images' is present but is not a list.
+         """
+         if "base64_images" in data:
+             base64_list = data["base64_images"]
+             if not isinstance(base64_list, list):
+                 raise ValueError("The 'base64_images' key must contain a list of base64-encoded strings.")
+
+             image_arrays: List[np.ndarray] = []
+             for b64 in base64_list:
+                 img = base64_to_numpy(b64)
+                 image_arrays.append(img)
+
+             data["image_arrays"] = image_arrays
+
+         elif "base64_image" in data:
+             # Single-image fallback
+             img = base64_to_numpy(data["base64_image"])
+             data["image_arrays"] = [img]
+
+         else:
+             raise KeyError("Input data must include 'base64_image' or 'base64_images'.")
+
+         return data
+
+     def format_input(self, data: Dict[str, Any], protocol: str, max_batch_size: int, **kwargs) -> Any:
+         """
+         Format input data for the specified protocol ("grpc" or "http"), supporting batched data.
+
+         Parameters
+         ----------
+         data : dict of str -> Any
+             The input data dictionary, expected to contain "image_arrays" (list of np.ndarray)
+             as produced by prepare_data_for_inference. An "image_dims" list of per-image
+             dimension dictionaries is populated here for later post-processing.
+         protocol : str
+             The inference protocol, either "grpc" or "http".
+         max_batch_size : int
+             The maximum batch size for batching.
+
+         Returns
+         -------
+         tuple
+             A tuple (formatted_batches, formatted_batch_data) where:
+             - formatted_batches is a list of batches ready for inference.
+             - formatted_batch_data is a list of scratch-pad dictionaries corresponding to each batch,
+               containing the keys "image_arrays" and "image_dims" for later post-processing.
+
+         Raises
+         ------
+         KeyError
+             If "image_arrays" is not found in `data`.
+         ValueError
+             If an invalid protocol is specified.
+         """
+         if "image_arrays" not in data:
+             raise KeyError("Expected 'image_arrays' in data. Call prepare_data_for_inference first.")
+
+         images = data["image_arrays"]
+
+         dims: List[Dict[str, Any]] = []
+         data["image_dims"] = dims
+
+         # Helper function to split a list into chunks of size up to chunk_size.
+         def chunk_list(lst, chunk_size):
+             return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+         if protocol == "grpc":
+             logger.debug("Formatting input for gRPC PaddleOCR model (batched).")
+             processed: List[np.ndarray] = []
+             for img in images:
+                 arr, _dims = preprocess_image_for_paddle(img)
+                 dims.append(_dims)
+                 arr = arr.astype(np.float32)
+                 arr = np.expand_dims(arr, axis=0)  # => shape (1, H, W, C)
+                 processed.append(arr)
+
+             batches = []
+             batch_data_list = []
+             for proc_chunk, orig_chunk, dims_chunk in zip(
+                 chunk_list(processed, max_batch_size),
+                 chunk_list(images, max_batch_size),
+                 chunk_list(dims, max_batch_size),
+             ):
+                 batched_input = np.concatenate(proc_chunk, axis=0)
+                 batches.append(batched_input)
+                 batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
+             return batches, batch_data_list
+
+         elif protocol == "http":
+             logger.debug("Formatting input for HTTP PaddleOCR model (batched).")
+             if "base64_images" in data:
+                 base64_list = data["base64_images"]
+             else:
+                 base64_list = [data["base64_image"]]
+
+             input_list: List[Dict[str, Any]] = []
+             for b64, img in zip(base64_list, images):
+                 image_url = f"data:image/png;base64,{b64}"
+                 image_obj = {"type": "image_url", "url": image_url}
+                 input_list.append(image_obj)
+                 _dims = {"new_width": img.shape[1], "new_height": img.shape[0]}
+                 dims.append(_dims)
+
+             batches = []
+             batch_data_list = []
+             for input_chunk, orig_chunk, dims_chunk in zip(
+                 chunk_list(input_list, max_batch_size),
+                 chunk_list(images, max_batch_size),
+                 chunk_list(dims, max_batch_size),
+             ):
+                 payload = {"input": input_chunk}
+                 batches.append(payload)
+                 batch_data_list.append({"image_arrays": orig_chunk, "image_dims": dims_chunk})
+
+             return batches, batch_data_list
+
+         else:
+             raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
+
+     def parse_output(self, response: Any, protocol: str, data: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Any:
+         """
+         Parse the model's inference response for the given protocol. The parsing
+         may handle batched outputs for multiple images.
+
+         Parameters
+         ----------
+         response : Any
+             The raw response from the PaddleOCR model.
+         protocol : str
+             The protocol used for inference, "grpc" or "http".
+         data : dict of str -> Any, optional
+             Additional data dictionary that may include "image_dims" for bounding box scaling.
+         **kwargs : Any
+             Additional keyword arguments for customization.
+
+         Returns
+         -------
+         Any
+             The parsed output, a list of [bounding_boxes, text_predictions] pairs, one per image.
+
+         Raises
+         ------
+         ValueError
+             If an invalid protocol is specified.
+         """
+         # Retrieve image dimensions if available
+         dims: Optional[List[Tuple[int, int]]] = data.get("image_dims") if data else None
+
+         if protocol == "grpc":
+             logger.debug("Parsing output from gRPC PaddleOCR model (batched).")
+             return self._extract_content_from_paddle_grpc_response(response, dims)
+
+         elif protocol == "http":
+             logger.debug("Parsing output from HTTP PaddleOCR model (batched).")
+             return self._extract_content_from_paddle_http_response(response, dims)
+
+         else:
+             raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.")
+
+     def process_inference_results(self, output: Any, **kwargs: Any) -> Any:
+         """
+         Process inference results for the PaddleOCR model.
+
+         Parameters
+         ----------
+         output : Any
+             The raw output parsed from the PaddleOCR model.
+         **kwargs : Any
+             Additional keyword arguments for customization.
+
+         Returns
+         -------
+         Any
+             The post-processed inference results. By default, this simply returns the parsed
+             output unchanged.
+         """
+         return output
+
+     def _prepare_paddle_payload(self, base64_img: str) -> Dict[str, Any]:
+         """
+         DEPRECATED by the batch logic in format_input. Kept here for single-image direct calls.
+
+         Parameters
+         ----------
+         base64_img : str
+             A single base64-encoded image string.
+
+         Returns
+         -------
+         dict of str -> Any
+             The payload for PaddleOCR's HTTP endpoint.
+         """
+         image_url = f"data:image/png;base64,{base64_img}"
+
+         image = {"type": "image_url", "url": image_url}
+         payload = {"input": [image]}
+
+         return payload
+
+     def _extract_content_from_paddle_http_response(
+         self,
+         json_response: Dict[str, Any],
+         dimensions: List[Dict[str, Any]],
+     ) -> List[Tuple[str, str]]:
+         """
+         Extract content from the JSON response of a PaddleOCR HTTP API request.
+
+         Parameters
+         ----------
+         json_response : dict of str -> Any
+             The JSON response returned by the PaddleOCR endpoint.
+         dimensions : list of dict
+             A list of dicts, one per image, used for bounding box scaling.
+
+         Returns
+         -------
+         list
+             A list of [bounding_boxes, text_predictions] pairs, one for each image result.
+
+         Raises
+         ------
+         RuntimeError
+             If the response format is missing or invalid.
+         """
+         if "data" not in json_response or not json_response["data"]:
+             raise RuntimeError("Unexpected response format: 'data' key is missing or empty.")
+
+         results: List[str] = []
+         for item_idx, item in enumerate(json_response["data"]):
+             text_detections = item.get("text_detections", [])
+             text_predictions = []
+             bounding_boxes = []
+             for td in text_detections:
+                 text_predictions.append(td["text_prediction"]["text"])
+                 bounding_boxes.append([[pt["x"], pt["y"]] for pt in td["bounding_box"]["points"]])
+
+             bounding_boxes, text_predictions = self._postprocess_paddle_response(
+                 bounding_boxes,
+                 text_predictions,
+                 dimensions,
+                 img_index=item_idx,
+             )
+
+             results.append([bounding_boxes, text_predictions])
+
+         return results
+
+     def _extract_content_from_paddle_grpc_response(
+         self,
+         response: np.ndarray,
+         dimensions: List[Dict[str, Any]],
+     ) -> List[Tuple[str, str]]:
+         """
+         Parse a gRPC response for one or more images. The response can have two possible shapes:
+         - (3,) for batch_size=1
+         - (3, n) for batch_size=n
+
+         In either case:
+           response[0, i]: byte string containing bounding box data
+           response[1, i]: byte string containing text prediction data
+           response[2, i]: (optional) additional data/metadata (ignored here)
+
+         Parameters
+         ----------
+         response : np.ndarray
+             The raw NumPy array from gRPC. Expected shape: (3,) or (3, n).
+         dimensions : list of dict
+             A list of dicts, one per image, used for bounding box scaling.
+
+         Returns
+         -------
+         list
+             A list of [bounding_boxes, text_predictions] pairs, one for each image.
+
+         Raises
+         ------
+         ValueError
+             If the response is not a NumPy array or has an unexpected shape.
+         """
+         if not isinstance(response, np.ndarray):
+             raise ValueError("Unexpected response format: response is not a NumPy array.")
+
+         # If we have shape (3,), convert to (3, 1)
+         if response.ndim == 1 and response.shape == (3,):
+             response = response.reshape(3, 1)
+         elif response.ndim != 2 or response.shape[0] != 3:
+             raise ValueError(f"Unexpected response shape: {response.shape}. Expecting (3,) or (3, n).")
+
+         batch_size = response.shape[1]
+         results: List[Tuple[str, str]] = []
+
+         for i in range(batch_size):
+             # 1) Parse bounding boxes
+             bboxes_bytestr: bytes = response[0, i]
+             bounding_boxes = json.loads(bboxes_bytestr.decode("utf8"))
+
+             # 2) Parse text predictions
+             texts_bytestr: bytes = response[1, i]
+             text_predictions = json.loads(texts_bytestr.decode("utf8"))
+
+             # 3) Log the third element (extra data/metadata) if needed
+             extra_data_bytestr: bytes = response[2, i]
+             logger.debug(f"Ignoring extra_data for image {i}: {extra_data_bytestr}")
+
+             # Some gRPC responses nest single-item lists; flatten them if needed
+             if isinstance(bounding_boxes, list) and len(bounding_boxes) == 1:
+                 bounding_boxes = bounding_boxes[0]
+             if isinstance(text_predictions, list) and len(text_predictions) == 1:
+                 text_predictions = text_predictions[0]
+
+             bounding_boxes, text_predictions = self._postprocess_paddle_response(
+                 bounding_boxes,
+                 text_predictions,
+                 dimensions,
+                 img_index=i,
+             )
+
+             results.append([bounding_boxes, text_predictions])
+
+         return results
+
+     @staticmethod
+     def _postprocess_paddle_response(
+         bounding_boxes: List[Any],
+         text_predictions: List[str],
+         dims: Optional[List[Dict[str, Any]]] = None,
+         img_index: int = 0,
+     ) -> Tuple[List[Any], List[str]]:
+         """
+         Convert bounding boxes with normalized coordinates to pixel coordinates by using
+         the image dimensions. Also shift the coordinates back if the inputs were padded.
+         For multiple images, the correct image dimensions (height, width) are retrieved
+         from `dims[img_index]`.
+
+         Parameters
+         ----------
+         bounding_boxes : list of Any
+             A list (per line of text) of bounding boxes, each a list of (x, y) points.
+         text_predictions : list of str
+             A list of text predictions, one for each bounding box.
+         dims : list of dict, optional
+             A list of dictionaries, where each dictionary contains image-specific dimensions
+             and scaling information:
+             - "new_width" (int): The width of the image after processing.
+             - "new_height" (int): The height of the image after processing.
+             - "pad_width" (int, optional): The width of padding added to the image.
+             - "pad_height" (int, optional): The height of padding added to the image.
+             - "scale_factor" (float, optional): The scaling factor applied to the image.
+         img_index : int, optional
+             The index of the image for which bounding boxes are being converted. Default is 0.
+
+         Returns
+         -------
+         Tuple[List[Any], List[str]]
+             Bounding boxes scaled back to the original dimensions and detected text lines.
+
+         Raises
+         ------
+         ValueError
+             If `dims` is None or empty.
+
+         Notes
+         -----
+         - If `img_index` is out of range, the first image's dimensions are used as a fallback.
+         """
+         # Dimensions are required in order to rescale the bounding boxes.
+         if not dims:
+             raise ValueError("No image_dims provided.")
+         else:
+             if img_index >= len(dims):
+                 logger.warning("Image index out of range for stored dimensions. Using first image dims by default.")
+                 img_index = 0
+
+             max_width = dims[img_index]["new_width"]
+             max_height = dims[img_index]["new_height"]
+             pad_width = dims[img_index].get("pad_width", 0)
+             pad_height = dims[img_index].get("pad_height", 0)
+             scale_factor = dims[img_index].get("scale_factor", 1.0)
+
+         bboxes: List[List[float]] = []
+         texts: List[str] = []
+
+         # Convert normalized coords back to actual pixel coords
+         for box, txt in zip(bounding_boxes, text_predictions):
+             if box == "nan":
+                 continue
+             points: List[List[float]] = []
+             for point in box:
+                 # Convert normalized coords back to actual pixel coords,
+                 # and shift them back to their original positions if padded.
+                 x_pixels = float(point[0]) * max_width - pad_width
+                 y_pixels = float(point[1]) * max_height - pad_height
+                 x_original = x_pixels / scale_factor
+                 y_original = y_pixels / scale_factor
+                 points.append([x_original, y_original])
+             bboxes.append(points)
+             texts.append(txt)
+
+         return bboxes, texts
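
The back-scaling performed by _postprocess_paddle_response reduces to a few arithmetic steps: undo the normalization, subtract any padding, then divide out the preprocessing scale factor. The short sketch below walks through them; the 960x1280 size, 16 px padding, and 2.0 scale factor are illustrative assumptions, not values taken from the package.

new_width, new_height = 960, 1280      # size of the image after preprocessing (assumed)
pad_width, pad_height = 16, 16         # padding added on each axis (assumed)
scale_factor = 2.0                     # resize factor applied during preprocessing (assumed)

normalized_point = (0.25, 0.5)         # one (x, y) corner as returned by the OCR model

# Undo normalization, then padding, then scaling -- mirroring the inner loop above.
x_pixels = normalized_point[0] * new_width - pad_width    # 0.25 * 960 - 16 = 224.0
y_pixels = normalized_point[1] * new_height - pad_height  # 0.5 * 1280 - 16 = 624.0
x_original = x_pixels / scale_factor                      # 112.0 in original-image coordinates
y_original = y_pixels / scale_factor                      # 312.0 in original-image coordinates

print([x_original, y_original])  # [112.0, 312.0]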
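
The gRPC parsing path normalizes a single-image response of shape (3,) to (3, 1) before decoding the JSON byte strings. Below is a minimal, self-contained sketch of that handling; the response contents are fabricated for illustration rather than taken from a real Triton/NIM reply.

import json
import numpy as np

# Fabricated response: bounding boxes, text predictions, and metadata as byte strings.
boxes = json.dumps([[[0.1, 0.2], [0.3, 0.2], [0.3, 0.25], [0.1, 0.25]]]).encode("utf8")
texts = json.dumps(["hello"]).encode("utf8")
meta = b"{}"

response = np.array([boxes, texts, meta], dtype=object)  # shape (3,) => batch_size = 1
if response.ndim == 1 and response.shape == (3,):
    response = response.reshape(3, 1)                    # normalize to (3, n)

for i in range(response.shape[1]):
    bounding_boxes = json.loads(response[0, i].decode("utf8"))
    text_predictions = json.loads(response[1, i].decode("utf8"))
    print(bounding_boxes, text_predictions)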
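
Finally, a hedged end-to-end sketch of how the new interface is driven (prepare, format, parse), using the module path shown in the file list above. It assumes the 2025.4.22 wheel is installed and that a local page.png exists; the actual POST to a PaddleOCR service is out of scope and only indicated in comments.

import base64

from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface

with open("page.png", "rb") as f:                      # assumed local test image
    b64 = base64.b64encode(f.read()).decode("utf-8")

model = PaddleOCRModelInterface()

# 1) Decode the base64 payloads into NumPy arrays under "image_arrays".
data = model.prepare_data_for_inference({"base64_images": [b64]})

# 2) Build HTTP payloads; per-image dimensions are recorded under "image_dims"
#    so bounding boxes can be scaled back after inference.
batches, batch_data = model.format_input(data, protocol="http", max_batch_size=8)

# 3) After POSTing batches[0] to a PaddleOCR endpoint (not shown), the JSON body
#    would be parsed per image into [bounding_boxes, text_predictions] pairs:
# results = model.parse_output(json_response, protocol="http", data=batch_data[0])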