nv-ingest-api 2025.4.15.dev20250415__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.15.dev20250415.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.15.dev20250415.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def boxes_are_close_or_overlap(b1: List[int], b2: List[int], threshold: float = 10.0) -> bool:
    """
    Report whether two bounding boxes overlap or lie close to each other.

    Each box is grown by `threshold` on every side, and the two grown boxes are
    then tested for intersection on the x-axis and on the y-axis. Because both
    boxes are grown, boxes whose gap is at most ``2 * threshold`` on each axis
    are reported as close. Edges that merely touch count as overlapping.

    Parameters
    ----------
    b1 : sequence of four numbers
        The first bounding box, in the format (xmin, ymin, xmax, ymax).
    b2 : sequence of four numbers
        The second bounding box, in the same format.
    threshold : float, optional
        The margin added to every side of both boxes before the intersection
        test. Defaults to 10.0.

    Returns
    -------
    bool
        True if the grown boxes intersect on both axes, False otherwise.

    Example
    -------
    >>> box1 = (100, 100, 150, 150)
    >>> box2 = (160, 110, 200, 140)
    >>> boxes_are_close_or_overlap(box1, box2, threshold=10)
    True
    """
    lo_x1, lo_y1, hi_x1, hi_y1 = b1
    lo_x2, lo_y2, hi_x2, hi_y2 = b2

    # On a given axis the grown intervals are disjoint exactly when one of them
    # ends before the other begins.
    if hi_x1 + threshold < lo_x2 - threshold or hi_x2 + threshold < lo_x1 - threshold:
        return False
    if hi_y1 + threshold < lo_y2 - threshold or hi_y2 + threshold < lo_y1 - threshold:
        return False

    return True
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def group_bounding_boxes(
    boxes: List[List[int]], threshold: float = 10.0, max_num_boxes: int = 1_000, max_depth: Optional[int] = None
) -> List[List[int]]:
    """
    Group bounding boxes that either overlap or lie within a given proximity threshold.

    The function first checks whether the number of bounding boxes exceeds
    `max_num_boxes`; if so it logs a warning and returns an empty list, because
    the pairwise comparison below is quadratic in the number of boxes.
    Otherwise it builds an adjacency list by comparing every pair of boxes with
    `boxes_are_close_or_overlap` and then walks that graph to form groups.

    Traversal order
    ---------------
    * With ``max_depth=None`` the graph is walked depth-first (LIFO), and each
      group is a full connected component.
    * With ``max_depth`` set the graph is walked breadth-first (FIFO). This
      guarantees every box is first reached along a shortest path from the
      group's starting box, so a box that is within `max_depth` hops is never
      dropped merely because a longer path reached it first.

    Parameters
    ----------
    boxes : list of sequences
        Bounding boxes in the format (xmin, ymin, xmax, ymax).
    threshold : float, optional
        Passed through to `boxes_are_close_or_overlap` to decide whether two
        boxes are connected. Defaults to 10.0.
    max_num_boxes : int, optional
        The maximum number of bounding boxes to process. If ``len(boxes)``
        exceeds this, a warning is logged and an empty list is returned.
        Defaults to 1,000.
    max_depth : int, optional
        The maximum number of adjacency hops from a group's starting box.
        Boxes further away are not added to that group; they remain unvisited
        and start (or join) a later group. Defaults to None (no limit).

    Returns
    -------
    list of list of int
        Each element is a group: a list of indices into `boxes`. Groups are
        emitted in order of their lowest-index starting box.
    """
    # Function-scope import so this block does not depend on a file-level import.
    from collections import deque

    n = len(boxes)
    if n > max_num_boxes:
        logger.warning(
            "Number of bounding boxes (%d) exceeds the maximum allowed (%d). "
            "Skipping image grouping to avoid high computational overhead.",
            n,
            max_num_boxes,
        )
        return []

    visited = [False] * n
    adjacency_list = [[] for _ in range(n)]

    # Build a symmetric adjacency list by checking closeness/overlap of each pair once.
    for i in range(n):
        for j in range(i + 1, n):
            if boxes_are_close_or_overlap(boxes[i], boxes[j], threshold):
                adjacency_list[i].append(j)
                adjacency_list[j].append(i)

    depth_limited = max_depth is not None

    def collect_group(start):
        """Return the indices reachable from `start` (within `max_depth` hops if set)."""
        frontier = deque([(start, 0)])  # entries are (node, depth)
        # FIFO yields nodes in non-decreasing depth, so the first time a node is
        # taken it is at its minimum depth. LIFO is kept for the unlimited case
        # so the order of indices within a group is unchanged there.
        take_next = frontier.popleft if depth_limited else frontier.pop
        component = []

        while frontier:
            node, depth = take_next()
            if visited[node]:
                continue

            visited[node] = True
            component.append(node)

            # Only expand neighbours while still inside the depth budget.
            if not depth_limited or depth < max_depth:
                for neighbor in adjacency_list[node]:
                    if not visited[neighbor]:
                        frontier.append((neighbor, depth + 1))

        return component

    groups = []
    for i in range(n):
        if not visited[i]:
            groups.append(collect_group(i))

    return groups
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def combine_groups_into_bboxes(
    boxes: List[List[int]], groups: List[List[int]], min_num_components: int = 1
) -> List[List[int]]:
    """
    Merge bounding boxes based on grouped indices.

    For every group of indices, the referenced boxes are collapsed into the
    single tightest box that encloses them all: the minimum of the xmins and
    ymins, and the maximum of the xmaxs and ymaxs.

    Groups with fewer than `min_num_components` members are skipped. Empty
    groups are always skipped, even when `min_num_components` is zero or
    negative, because an empty group has no extent to report.

    Parameters
    ----------
    boxes : list of sequences
        The original bounding boxes, each in (xmin, ymin, xmax, ymax) format.
    groups : list of list of int
        A list of groups, where each group is a list of indices into `boxes`.
    min_num_components : int, optional
        The minimum number of bounding boxes a group must have to produce
        a merged bounding box. Defaults to 1.

    Returns
    -------
    list of list of int
        One ``[xmin, ymin, xmax, ymax]`` list per retained group, in the same
        order as `groups`.

    Raises
    ------
    ValueError
        If a referenced box does not have exactly four elements.
    IndexError
        If a group contains an index outside `boxes`.
    """
    # At least one member is always required; min()/max() on an empty group would raise.
    required = max(min_num_components, 1)

    combined = []
    for group in groups:
        if len(group) < required:
            continue

        # Unpack explicitly so a malformed box (not exactly 4 values) still fails loudly.
        corners = []
        for idx in group:
            xmin, ymin, xmax, ymax = boxes[idx]
            corners.append((xmin, ymin, xmax, ymax))

        # Transpose rows of corners into one column per coordinate.
        xmins, ymins, xmaxs, ymaxs = zip(*corners)
        combined.append([min(xmins), min(ymins), max(xmaxs), max(ymaxs)])

    return combined
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def remove_superset_bboxes(bboxes: List[List[int]]) -> List[List[int]]:
    """
    Remove any bounding box that strictly contains another bounding box.

    A box `box_a` is dropped when some other box `box_b` in the list lies
    entirely inside it (every edge of `box_a` is at or beyond the matching edge
    of `box_b`) AND the two boxes are not identical, i.e. at least one edge of
    `box_a` is strictly further out. Identical duplicates therefore do not
    eliminate each other; both are kept.

    Parameters
    ----------
    bboxes : List[List[int]]
        A list of bounding boxes, where each bounding box is a list
        or tuple of four numbers in the format:
        [x_min, y_min, x_max, y_max].

    Returns
    -------
    List[List[int]]
        A new list holding the original box objects, in their original order,
        excluding those that are strict supersets of any other box in `bboxes`.

    Example
    -------
    >>> bboxes = [
    ...     [0, 0, 5, 5],  # box A
    ...     [1, 1, 2, 2],  # box B
    ...     [3, 3, 4, 4],  # box C
    ... ]
    >>> # Box A strictly encloses B and C, so it is removed
    >>> remove_superset_bboxes(bboxes)
    [[1, 1, 2, 2], [3, 3, 4, 4]]
    """

    def strictly_encloses(outer, inner):
        """Return True if `outer` contains `inner` and differs on at least one edge."""
        xo_min, yo_min, xo_max, yo_max = outer
        xi_min, yi_min, xi_max, yi_max = inner

        contains = xo_min <= xi_min and yo_min <= yi_min and xo_max >= xi_max and yo_max >= yi_max
        # Given containment, "at least one strict inequality" is the same as "not all edges equal".
        identical = (xo_min, yo_min, xo_max, yo_max) == (xi_min, yi_min, xi_max, yi_max)
        return contains and not identical

    results = []
    for i, box_a in enumerate(bboxes):
        is_superset = any(strictly_encloses(box_a, box_b) for j, box_b in enumerate(bboxes) if i != j)
        if not is_superset:
            results.append(box_a)

    return results
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import numpy as np
|
|
20
|
+
from typing import List, Tuple, Optional
|
|
21
|
+
|
|
22
|
+
from nv_ingest_api.internal.primitives.nim.default_values import (
|
|
23
|
+
YOLOX_MAX_BATCH_SIZE,
|
|
24
|
+
YOLOX_NUM_CLASSES,
|
|
25
|
+
YOLOX_CONF_THRESHOLD,
|
|
26
|
+
YOLOX_IOU_THRESHOLD,
|
|
27
|
+
YOLOX_MIN_SCORE,
|
|
28
|
+
YOLOX_FINAL_SCORE,
|
|
29
|
+
)
|
|
30
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxPageElementsModelInterface
|
|
31
|
+
from nv_ingest_api.util.image_processing.transforms import crop_image, numpy_to_base64
|
|
32
|
+
from nv_ingest_api.util.metadata.aggregators import CroppedImageWithContent
|
|
33
|
+
from nv_ingest_api.util.nim import create_inference_client
|
|
34
|
+
|
|
35
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def extract_tables_and_charts_from_image(annotation_dict, original_image, page_idx, tables_and_charts):
    """
    Extract and process table and chart regions from the provided image based on detection annotations.

    Parameters
    ----------
    annotation_dict : dict
        A dictionary mapping a label ("table", "chart") to a list of detections. Each detection is a
        sequence of four box coordinates followed by one trailing value that is discarded here
        (presumably a confidence score; verify against the detector output). A missing label, or a
        falsy `annotation_dict`, simply yields no elements.
    original_image : np.ndarray
        The original image from which objects were detected.
    page_idx : int
        The index of the current page being processed.
    tables_and_charts : list of tuple
        A list to which extracted table/chart data will be appended in place. Each item is a tuple
        (page_idx, CroppedImageWithContent).

    Returns
    -------
    None
        Results are delivered by appending to `tables_and_charts`.

    Notes
    -----
    For each detected table and chart object, this function:
      - Crops the original image based on the bounding box.
      - Converts the cropped image to a base64 encoded string.
      - Wraps the encoded image along with its bounding box and the image dimensions in a standardized data structure.

    Examples
    --------
    >>> annotation_dict = {"table": [ [...], [...] ], "chart": [ [...], [...] ]}
    >>> original_image = np.random.rand(1536, 1536, 3)
    >>> tables_and_charts = []
    >>> extract_tables_and_charts_from_image(annotation_dict, original_image, 0, tables_and_charts)
    """

    # NOTE(review): for a NumPy array, shape[0] is the number of rows and shape[1] the number of
    # columns, so `width` receives the row count here. Kept as-is because downstream consumers of
    # max_width/max_height are not visible from this function; confirm the intended order.
    width, height, *_ = original_image.shape

    # No detections at all for this image: nothing to append.
    if not annotation_dict:
        return

    for label in ("table", "chart"):
        # A label absent from the detections means "none found", not an error.
        objects = annotation_dict.get(label) or []

        for detection in objects:
            # Drop the trailing value; the remaining four entries are the box coordinates.
            *bbox, _ = detection
            h1, w1, h2, w2 = bbox

            cropped = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))
            base64_img = numpy_to_base64(cropped)

            element_data = CroppedImageWithContent(
                content="",
                image=base64_img,
                # NOTE(review): the coordinate order here differs from the one passed to crop_image
                # above; preserved from the original. Confirm which convention each consumer expects.
                bbox=(int(w1), int(h1), int(w2), int(h2)),
                max_width=width,
                max_height=height,
                type_string=label,
            )
            tables_and_charts.append((page_idx, element_data))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def extract_tables_and_charts_yolox(
    pages: List[Tuple[int, np.ndarray]],
    config: dict,
    trace_info: Optional[List] = None,
) -> List[Tuple[int, object]]:
    """
    Run YOLOX-based inference over a batch of page images and collect table/chart elements.

    An inference client is created from `config`, all page images are sent in a single
    `infer` call, and each per-image result is handed to
    `extract_tables_and_charts_from_image`, which appends to the returned list.
    The client is closed whether or not inference succeeds.

    Parameters
    ----------
    pages : List[Tuple[int, np.ndarray]]
        A list of tuples containing the page index and the corresponding image.
    config : dict
        Configuration read by key; the following keys are required:
            - 'yolox_endpoints'
            - 'auth_token'
            - 'yolox_infer_protocol'
    trace_info : Optional[List], optional
        Forwarded unchanged to the client's `infer` call.

    Returns
    -------
    List[Tuple[int, object]]
        The list populated by `extract_tables_and_charts_from_image` for every page.

    Raises
    ------
    TimeoutError
        Logged and re-raised when raised during client creation, inference, or post-processing.
    Exception
        Any other error is logged with its traceback and re-raised unchanged.
    """
    tables_and_charts = []
    yolox_client = None

    try:
        model_interface = YoloxPageElementsModelInterface()
        yolox_client = create_inference_client(
            config["yolox_endpoints"],
            model_interface,
            config["auth_token"],
            config["yolox_infer_protocol"],
        )

        # Split the (index, image) pairs into two parallel lists, preserving order.
        page_numbers = [entry[0] for entry in pages]
        page_images = [entry[1] for entry in pages]

        # Keyword options for the inference call, gathered in one place.
        infer_options = {
            "model_name": "yolox",
            "max_batch_size": YOLOX_MAX_BATCH_SIZE,
            "num_classes": YOLOX_NUM_CLASSES,
            "conf_thresh": YOLOX_CONF_THRESHOLD,
            "iou_thresh": YOLOX_IOU_THRESHOLD,
            "min_score": YOLOX_MIN_SCORE,
            "final_thresh": YOLOX_FINAL_SCORE,
            "trace_info": trace_info,
            "stage_name": "pdf_content_extractor",
        }

        # One request carrying every page image.
        inference_results = yolox_client.infer({"images": page_images}, **infer_options)

        # Pair each result with the page it came from and accumulate the extracted elements.
        for detections, page_number, page_image in zip(inference_results, page_numbers, page_images):
            extract_tables_and_charts_from_image(detections, page_image, page_number, tables_and_charts)

    except TimeoutError:
        logger.error("Timeout error during table/chart extraction.")
        raise

    except Exception as e:
        logger.exception(f"Error during table/chart extraction: {str(e)}")
        raise

    finally:
        # Release the client even when inference failed part-way through.
        if yolox_client:
            yolox_client.close()

    logger.debug(f"Extracted {len(tables_and_charts)} tables and charts.")
    return tables_and_charts
|