nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.23.dev20250423.dist-info/RECORD +152 -0
  149. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/WHEEL +1 -1
  150. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  151. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,260 @@
1
+ import logging
2
+ from typing import List
3
+ from typing import Optional
4
+
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
def boxes_are_close_or_overlap(b1: List[int], b2: List[int], threshold: float = 10.0) -> bool:
    """
    Report whether two axis-aligned boxes touch, overlap, or sit near each other.

    Both boxes are grown outward by `threshold` on every side; the boxes are
    considered "close" when the grown rectangles intersect on the x-axis and
    on the y-axis. Because each box is grown independently, two boxes whose
    edges are up to ``2 * threshold`` apart still count as close.

    Parameters
    ----------
    b1 : sequence of 4 numbers
        First bounding box as (xmin, ymin, xmax, ymax).
    b2 : sequence of 4 numbers
        Second bounding box, same layout as `b1`.
    threshold : float, optional
        Margin added to every side of both boxes before the intersection
        test. Defaults to 10.0.

    Returns
    -------
    bool
        True when the grown boxes intersect (edges touching counts as
        intersecting), False otherwise.

    Example
    -------
    >>> boxes_are_close_or_overlap((100, 100, 150, 150), (160, 110, 200, 140), threshold=10)
    True
    """
    ax0, ay0, ax1, ay1 = b1
    bx0, by0, bx1, by1 = b2

    # Grow both rectangles outward by the margin on every side.
    ax0, ay0, ax1, ay1 = ax0 - threshold, ay0 - threshold, ax1 + threshold, ay1 + threshold
    bx0, by0, bx1, by1 = bx0 - threshold, by0 - threshold, bx1 + threshold, by1 + threshold

    # Two intervals are disjoint only when one ends strictly before the other begins.
    if ax1 < bx0 or bx1 < ax0:
        return False

    return not (ay1 < by0 or by1 < ay0)
51
+
52
+
53
def group_bounding_boxes(
    boxes: List[List[int]], threshold: float = 10.0, max_num_boxes: int = 1_000, max_depth: Optional[int] = None
) -> List[List[int]]:
    """
    Cluster bounding boxes into groups of mutually reachable neighbours.

    Every pair of boxes is tested with `boxes_are_close_or_overlap`; pairs
    that pass are linked in an adjacency list. Groups are then collected with
    an iterative, stack-based depth-first traversal, so each returned group
    is a list of indices into `boxes`.

    Parameters
    ----------
    boxes : list of sequence
        Bounding boxes in (xmin, ymin, xmax, ymax) format.
    threshold : float, optional
        Proximity margin forwarded to `boxes_are_close_or_overlap`.
        Defaults to 10.0.
    max_num_boxes : int, optional
        Upper bound on ``len(boxes)``. When exceeded, a warning is logged and
        an empty list is returned, since the pairwise comparison is quadratic.
        Defaults to 1,000.
    max_depth : int, optional
        Maximum traversal depth from the box that seeds a group. Boxes at
        that depth are included but not expanded further; anything left
        unvisited seeds a later group. ``None`` (default) means unlimited.

    Returns
    -------
    list of list of int
        One list of box indices per group, in discovery order.
    """
    total = len(boxes)
    if total > max_num_boxes:
        logger.warning(
            "Number of bounding boxes (%d) exceeds the maximum allowed (%d). "
            "Skipping image grouping to avoid high computational overhead.",
            total,
            max_num_boxes,
        )
        return []

    # Undirected neighbour graph: each close/overlapping pair is linked both ways.
    neighbors = [[] for _ in range(total)]
    for first in range(total):
        for second in range(first + 1, total):
            if boxes_are_close_or_overlap(boxes[first], boxes[second], threshold):
                neighbors[first].append(second)
                neighbors[second].append(first)

    seen = [False] * total
    components = []
    for root in range(total):
        if seen[root]:
            continue

        # Iterative depth-first walk starting at `root`; entries are (index, depth).
        members = []
        pending = [(root, 0)]
        while pending:
            current, level = pending.pop()
            if seen[current]:
                continue
            seen[current] = True
            members.append(current)

            # Stop expanding once the optional depth budget is used up.
            if max_depth is not None and level >= max_depth:
                continue
            pending.extend((nxt, level + 1) for nxt in neighbors[current] if not seen[nxt])

        components.append(members)

    return components
139
+
140
+
141
def combine_groups_into_bboxes(
    boxes: List[List[int]], groups: List[List[int]], min_num_components: int = 1
) -> List[List[int]]:
    """
    Collapse each group of box indices into one enclosing bounding box.

    For every group that has at least `min_num_components` members, the
    member boxes are looked up in `boxes` and replaced by the smallest
    rectangle covering all of them (minimum of the xmins/ymins, maximum of
    the xmaxs/ymaxs). Smaller groups are dropped.

    Parameters
    ----------
    boxes : list of sequence
        Source bounding boxes, each (xmin, ymin, xmax, ymax).
    groups : list of list of int
        Groups of indices into `boxes`.
    min_num_components : int, optional
        Minimum group size required to emit a merged box. Defaults to 1.

    Returns
    -------
    list of list of int
        One ``[xmin, ymin, xmax, ymax]`` list per retained group, in the
        order the groups were given.
    """
    merged = []
    for member_ids in groups:
        if len(member_ids) >= min_num_components:
            # Unpacking enforces that every referenced box has exactly four coordinates.
            rects = [(x0, y0, x1, y1) for (x0, y0, x1, y1) in map(boxes.__getitem__, member_ids)]
            merged.append(
                [
                    min(rect[0] for rect in rects),
                    min(rect[1] for rect in rects),
                    max(rect[2] for rect in rects),
                    max(rect[3] for rect in rects),
                ]
            )
    return merged
200
+
201
+
202
def remove_superset_bboxes(bboxes: List[List[int]]) -> List[List[int]]:
    """
    Remove any bounding box that strictly contains another bounding box.

    A box `box_a` is dropped when some other box `box_b` in the list lies
    entirely inside it AND the two boxes are not coordinate-for-coordinate
    identical (i.e. at least one edge of `box_a` is strictly outside the
    matching edge of `box_b`). Exact duplicates therefore do not eliminate
    each other; they are all kept.

    Parameters
    ----------
    bboxes : List[List[int]]
        Bounding boxes, each a list or tuple of four numbers in the format
        [x_min, y_min, x_max, y_max].

    Returns
    -------
    List[List[int]]
        A new list with the original box objects, in their original order,
        excluding those that are strict supersets of any other box in
        `bboxes`.

    Example
    -------
    >>> bboxes = [
    ...     [0, 0, 5, 5],  # box A
    ...     [1, 1, 2, 2],  # box B
    ...     [3, 3, 4, 4],  # box C
    ... ]
    >>> # Box A strictly encloses B and C, so it is removed
    >>> remove_superset_bboxes(bboxes)
    [[1, 1, 2, 2], [3, 3, 4, 4]]
    >>> # Identical boxes are not strict supersets of one another
    >>> remove_superset_bboxes([[0, 0, 5, 5], [0, 0, 5, 5]])
    [[0, 0, 5, 5], [0, 0, 5, 5]]
    """

    def _strictly_encloses(outer, inner) -> bool:
        """Return True when `outer` contains `inner` and differs on at least one edge."""
        ox_min, oy_min, ox_max, oy_max = outer
        ix_min, iy_min, ix_max, iy_max = inner

        contains = ox_min <= ix_min and oy_min <= iy_min and ox_max >= ix_max and oy_max >= iy_max
        # Compare coordinates rather than the containers so a list and a tuple
        # holding the same values are still treated as the same box.
        differs = ox_min < ix_min or oy_min < iy_min or ox_max > ix_max or oy_max > iy_max
        return contains and differs

    return [
        box_a
        for i, box_a in enumerate(bboxes)
        if not any(_strictly_encloses(box_a, box_b) for j, box_b in enumerate(bboxes) if i != j)
    ]
@@ -0,0 +1,179 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Copyright (c) 2024, NVIDIA CORPORATION.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import logging
19
+ import numpy as np
20
+ from typing import List, Tuple, Optional
21
+
22
+ from nv_ingest_api.internal.primitives.nim.default_values import (
23
+ YOLOX_MAX_BATCH_SIZE,
24
+ YOLOX_NUM_CLASSES,
25
+ YOLOX_CONF_THRESHOLD,
26
+ YOLOX_IOU_THRESHOLD,
27
+ YOLOX_MIN_SCORE,
28
+ YOLOX_FINAL_SCORE,
29
+ )
30
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxPageElementsModelInterface
31
+ from nv_ingest_api.util.image_processing.transforms import crop_image, numpy_to_base64
32
+ from nv_ingest_api.util.metadata.aggregators import CroppedImageWithContent
33
+ from nv_ingest_api.util.nim import create_inference_client
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
def extract_tables_and_charts_from_image(annotation_dict, original_image, page_idx, tables_and_charts):
    """
    Crop every detected table and chart out of a page image and record it.

    Parameters
    ----------
    annotation_dict : dict
        Detection results for one image. The "table" and "chart" keys, when
        present, each map to an iterable of detections. Every detection is a
        sequence of four box coordinates followed by one trailing value that
        is discarded (presumably a confidence score — confirm against the
        model interface). A missing key is treated as "no detections".
    original_image : np.ndarray
        The image the detections refer to.
    page_idx : int
        Index of the page being processed; stored alongside each result.
    tables_and_charts : list of tuple
        Output list, mutated in place. One ``(page_idx, CroppedImageWithContent)``
        tuple is appended per detection.

    Returns
    -------
    None

    Notes
    -----
    For each detection this function:
      - crops `original_image` to the detection's box via `crop_image`,
      - base64-encodes the crop via `numpy_to_base64`,
      - wraps the encoded crop, the box, and the image dimensions in a
        `CroppedImageWithContent` with empty `content`.

    Examples
    --------
    >>> annotation_dict = {"table": [ [...], [...] ], "chart": [ [...], [...] ]}
    >>> original_image = np.random.rand(1536, 1536, 3)
    >>> tables_and_charts = []
    >>> extract_tables_and_charts_from_image(annotation_dict, original_image, 0, tables_and_charts)
    """
    # NOTE(review): NumPy image arrays are usually laid out (rows, cols, ...), which
    # would make shape[0] the height. The width/height naming here (and the h/w
    # naming of the box coordinates below) is kept exactly as in the original —
    # confirm against crop_image / CroppedImageWithContent before changing it.
    width, height, *_ = original_image.shape

    # Nothing was detected for this image.
    if not annotation_dict:
        return

    for label in ("table", "chart"):
        # A label with no detections may be absent altogether; treat it as empty.
        for *bbox, _ in annotation_dict.get(label, ()):
            h1, w1, h2, w2 = bbox

            cropped = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))
            base64_img = numpy_to_base64(cropped)

            element_data = CroppedImageWithContent(
                content="",
                image=base64_img,
                # Stored with each coordinate pair swapped relative to the crop call above.
                bbox=(int(w1), int(h1), int(w2), int(h2)),
                max_width=width,
                max_height=height,
                type_string=label,
            )
            tables_and_charts.append((page_idx, element_data))
93
+
94
+
95
def extract_tables_and_charts_yolox(
    pages: List[Tuple[int, np.ndarray]],
    config: dict,
    trace_info: Optional[List] = None,
) -> List[Tuple[int, object]]:
    """
    Run YOLOX page-element detection over a batch of page images and crop
    out every detected table and chart.

    Parameters
    ----------
    pages : List[Tuple[int, np.ndarray]]
        ``(page_index, image)`` pairs; all images are sent to the inference
        client in a single `infer` call.
    config : dict
        Must provide the keys:
            - 'yolox_endpoints'
            - 'auth_token'
            - 'yolox_infer_protocol'
        which are forwarded to `create_inference_client`.
    trace_info : Optional[List], optional
        Tracing information passed through to the inference call.

    Returns
    -------
    List[Tuple[int, object]]
        One ``(page_index, element)`` tuple per detected table or chart, as
        appended by `extract_tables_and_charts_from_image`.

    Raises
    ------
    TimeoutError
        Re-raised after logging when the inference call times out.
    Exception
        Any other failure is logged with its traceback and re-raised.
    """
    tables_and_charts = []
    client = None

    try:
        interface = YoloxPageElementsModelInterface()
        client = create_inference_client(
            config["yolox_endpoints"],
            interface,
            config["auth_token"],
            config["yolox_infer_protocol"],
        )

        # Keep indices and images as parallel lists so results can be zipped back.
        page_numbers = [entry[0] for entry in pages]
        page_images = [entry[1] for entry in pages]

        infer_options = dict(
            model_name="yolox",
            max_batch_size=YOLOX_MAX_BATCH_SIZE,
            num_classes=YOLOX_NUM_CLASSES,
            conf_thresh=YOLOX_CONF_THRESHOLD,
            iou_thresh=YOLOX_IOU_THRESHOLD,
            min_score=YOLOX_MIN_SCORE,
            final_thresh=YOLOX_FINAL_SCORE,
            trace_info=trace_info,
            stage_name="pdf_content_extractor",
        )
        detections = client.infer({"images": page_images}, **infer_options)

        # One annotation dict per submitted image, in submission order.
        for annotations, page_number, page_image in zip(detections, page_numbers, page_images):
            extract_tables_and_charts_from_image(
                annotations,
                page_image,
                page_number,
                tables_and_charts,
            )

    except TimeoutError:
        logger.error("Timeout error during table/chart extraction.")
        raise

    except Exception as e:
        logger.exception(f"Error during table/chart extraction: {str(e)}")
        raise

    finally:
        # Release the client on success and on failure alike.
        if client:
            client.close()

    logger.debug(f"Extracted {len(tables_and_charts)} tables and charts.")
    return tables_and_charts
+ return tables_and_charts