nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,449 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ import re
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from sklearn.cluster import DBSCAN
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def process_yolox_graphic_elements(yolox_text_dict):
    """
    Flatten the per-class texts of a yolox-graphic-elements result into one string.

    Parameters
    ----------
    yolox_text_dict : dict
        Mapping from graphic-element class name (for example ``"chart_title"``)
        to the text collected for that class. Classes that are absent are
        treated as empty text.

    Returns
    -------
    str
        The class texts joined with single spaces in a fixed class order, with
        leading and trailing whitespace stripped. An absent class still
        contributes its separator, so runs of several spaces can appear
        between two classes that are present.
    """
    # Fixed output order of the classes; the title comes first.
    ordered_classes = (
        "chart_title",
        "caption",
        "x_title",
        "xlabel",
        "y_title",
        "ylabel",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
        "other",
    )

    pieces = [yolox_text_dict.get(class_name, "") for class_name in ordered_classes]
    return " ".join(pieces).strip()
47
+
48
+
49
def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
    """
    Associate a yolox-graphic-elements box with the PaddleOCR boxes that overlap it.

    A PaddleOCR box is kept when its IoU with ``yolox_box`` is strictly greater
    than ``max_iou / delta``, where ``max_iou`` is the largest IoU found.
    Boxes are expected in the format (x0, y0, x1, y1).

    Args:
        yolox_box (np array [4]): Reference box.
        paddle_ocr_boxes (np array [n x 4]): PaddleOCR boxes.
        already_matched (list or None, Optional): Indices to exclude from the result.
        delta (float, Optional): IoU divisor for keeping several boxes. Defaults to 2.

    Returns:
        np array or list: Indices of the matched boxes. An empty list is returned
        when there are no PaddleOCR boxes or when the best IoU is <= 0.01;
        otherwise an integer array (possibly empty after exclusion).
    """
    paddle_ocr_boxes = np.asarray(paddle_ocr_boxes)
    if paddle_ocr_boxes.size == 0:
        # np.max on an empty array raises; there is nothing to match anyway.
        return []

    x0_1, y0_1, x1_1, y1_1 = yolox_box
    x0_2, y0_2, x1_2, y1_2 = (
        paddle_ocr_boxes[:, 0],
        paddle_ocr_boxes[:, 1],
        paddle_ocr_boxes[:, 2],
        paddle_ocr_boxes[:, 3],
    )

    # Intersection
    inter_y0 = np.maximum(y0_1, y0_2)
    inter_y1 = np.minimum(y1_1, y1_2)
    inter_x0 = np.maximum(x0_1, x0_2)
    inter_x1 = np.minimum(x1_1, x1_2)
    inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)

    # Union
    area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
    area_2 = (y1_2 - y0_2) * (x1_2 - x0_2)
    union_area = np.asarray(area_1 + area_2 - inter_area)

    # IoU. Degenerate pairs (non-positive union) get an IoU of 0 instead of
    # producing NaN/inf through a division by zero.
    ious = np.divide(
        inter_area,
        union_area,
        out=np.zeros(union_area.shape, dtype=float),
        where=union_area > 0,
    )

    max_iou = np.max(ious)
    if max_iou <= 0.01:
        return []

    matches = np.where(ious > (max_iou / delta))[0]
    if already_matched is not None and len(already_matched):
        # Boolean masking keeps the integer dtype even when nothing is left,
        # so the result can always be used for indexing.
        matches = matches[~np.isin(matches, list(already_matched))]
    return matches
93
+
94
+
95
def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, paddle_txts):
    """
    Attach PaddleOCR texts to the yolox-graphic-elements detections.

    For each kept class and each detection of that class, the overlapping OCR
    boxes are looked up with ``match_bboxes`` (IoU > max_iou / delta, delta=4).
    The texts found are added to the class representation and their indices are
    recorded so that the same OCR text is not matched again by a later box.

    Parameters
    ----------
    yolox_output : dict
        Mapping from class name to a sequence of boxes. Only the first four
        values of each box are used (a trailing score, if any, is dropped).
    paddle_boxes : array-like
        OCR boxes indexed as ``[n, point, (x, y)]``; each one is reduced to its
        axis-aligned (x0, y0, x1, y1) bounds.
    paddle_txts : array-like of str
        OCR texts, aligned with ``paddle_boxes``.

    Returns
    -------
    dict
        Mapping from class name to the joined text of that class. Only classes
        with at least one detection appear. Empty when there is no OCR output.
    """
    KEPT_CLASSES = [  # Classes that are used, in matching priority order
        "chart_title",
        "x_title",
        "y_title",
        "xlabel",
        "ylabel",
        "other",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
    ]

    paddle_txts = np.array(paddle_txts)
    paddle_boxes = np.array(paddle_boxes)

    if (paddle_txts.size == 0) or (paddle_boxes.size == 0):
        return {}

    # Reduce every OCR polygon to its axis-aligned bounding box.
    paddle_boxes = np.array(
        [
            paddle_boxes[:, :, 0].min(-1),
            paddle_boxes[:, :, 1].min(-1),
            paddle_boxes[:, :, 0].max(-1),
            paddle_boxes[:, :, 1].max(-1),
        ]
    ).T

    already_matched = []
    results = {}

    for k in KEPT_CLASSES:
        class_boxes = yolox_output.get(k, [])
        if not len(class_boxes):  # No bounding boxes
            continue

        texts = []
        for yolox_box in class_boxes:
            # if there's a score at the end, drop the score.
            yolox_box = yolox_box[:4]
            paddle_ids = match_bboxes(yolox_box, paddle_boxes, already_matched=already_matched, delta=4)

            if len(paddle_ids) > 0:
                texts.append(" ".join(paddle_txts[paddle_ids].tolist()))
                # Remove the matched texts from the pool, as documented above;
                # without this the exclusion list stays empty forever.
                already_matched.extend(int(idx) for idx in paddle_ids)

        # Collapse whitespace runs and repeated dots.
        processed_texts = [re.sub(r"\.+", ".", re.sub(r"\s+", " ", t)) for t in texts]

        separator = " " if "title" in k else " - "
        results[k] = separator.join(processed_texts)

    return results
162
+
163
+
164
def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
    """
    Lay out PaddleOCR texts as pipe-delimited lines grouped by vertical position.

    Parameters
    ----------
    bboxes : array-like
        OCR boxes, each holding 8 coordinates once flattened. Coordinates 0, 1,
        2 and the last one are used as x0, y0, x1 and y1 respectively.
    texts : array-like of str
        OCR texts, aligned with ``bboxes``.

    Returns
    -------
    str
        One ``"| a | b |"`` line (newline-terminated) per cluster of boxes
        whose ``y0`` values are chained within 10 units of each other, with the
        texts ordered by ``x0`` inside a line. Empty string when either input
        is missing or empty.
    """
    # Explicit length checks: ``not array`` is ambiguous for NumPy inputs.
    if bboxes is None or texts is None:
        return ""
    if len(bboxes) == 0 or len(texts) == 0:
        return ""

    bboxes = np.array(bboxes).astype(int)
    bboxes = bboxes.reshape(-1, 8)[:, [0, 1, 2, -1]]

    preds_df = pd.DataFrame(
        {"x0": bboxes[:, 0], "y0": bboxes[:, 1], "x1": bboxes[:, 2], "y1": bboxes[:, 3], "text": texts}
    )
    # Sorting by y0 first makes the cluster labels follow top-to-bottom order.
    preds_df = preds_df.sort_values("y0")

    dbscan = DBSCAN(eps=10, min_samples=1)
    dbscan.fit(preds_df["y0"].values[:, None])

    preds_df["cluster"] = dbscan.labels_
    preds_df = preds_df.sort_values(["cluster", "x0"])

    return "".join(
        "| " + " | ".join(dfg["text"].values.tolist()) + " |\n" for _, dfg in preds_df.groupby("cluster")
    )
187
+
188
+
189
def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_boxes, paddle_ocr_txts):
    """
    Combine YOLOX table-structure predictions with PaddleOCR output into text.

    Every OCR box is assigned to a cell, then to row(s) and column(s) of the
    predicted structure. Texts that get both a row and a column are rendered as
    one markdown table; the remaining texts are emitted as plain lines. The
    pieces are finally ordered by their binned (y, x) centre position.

    Parameters
    ----------
    yolox_cell_preds : dict
        Must provide the keys ``"cell"``, ``"row"`` and ``"column"``, each a
        sequence of (x0, y0, x1, y1) boxes.
    paddle_ocr_boxes : array-like
        OCR boxes indexed as ``[n, point, (x, y)]``.
    paddle_ocr_txts : array-like of str
        OCR texts, aligned with ``paddle_ocr_boxes``.

    Returns
    -------
    str
        The newline-joined text representation, or an empty string when either
        OCR input is missing or empty.
    """
    # Explicit length checks: ``not array`` is ambiguous for NumPy inputs.
    if paddle_ocr_boxes is None or paddle_ocr_txts is None:
        return ""
    if len(paddle_ocr_boxes) == 0 or len(paddle_ocr_txts) == 0:
        return ""

    # Reduce every OCR polygon to its axis-aligned bounding box.
    paddle_ocr_boxes = np.array(paddle_ocr_boxes)
    paddle_ocr_boxes_ = np.array(
        [
            paddle_ocr_boxes[:, :, 0].min(-1),
            paddle_ocr_boxes[:, :, 1].min(-1),
            paddle_ocr_boxes[:, :, 0].max(-1),
            paddle_ocr_boxes[:, :, 1].max(-1),
        ]
    ).T

    assignments = []
    for i, (b, t) in enumerate(zip(paddle_ocr_boxes_, paddle_ocr_txts)):
        # Find a cell; fall back to the OCR box itself when none overlaps.
        matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
        cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b

        # Find a row (-1 marks "no row").
        matches_row = assign_boxes(cell, yolox_cell_preds["row"], delta=1)
        row_ids = matches_row if len(matches_row) else -1

        # Find a column - or more if it is the first row
        if isinstance(row_ids, np.ndarray):
            delta = 2 if row_ids.min() == 0 else 1  # delta=2 if header column
        else:
            delta = 1
        matches_col = assign_boxes(cell, yolox_cell_preds["column"], delta=delta)
        col_ids = matches_col if len(matches_col) else -1

        assignments.append(
            {
                "index": i,
                "paddle_box": b,
                # A text belongs to the table only when it has a row AND a column.
                "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                "cell_id": matches_cell[0] if len(matches_cell) else -1,
                "cell": cell,
                "col_ids": col_ids,
                "row_ids": row_ids,
                "text": t,
            }
        )
    df_assign = pd.DataFrame(assignments)

    # Merge cells with several assigned texts
    dfs = []
    for cell_id, df_cell in df_assign.groupby("cell_id"):
        if len(df_cell) > 1 and cell_id > -1:
            df_cell = merge_text_in_cell(df_cell)
        dfs.append(df_cell)
    df_assign = pd.concat(dfs)

    df_text = df_assign[~df_assign["is_table"]].reset_index(drop=True)

    # Table to text
    df_table = df_assign[df_assign["is_table"]].reset_index(drop=True)
    if len(df_table):
        mat = build_markdown(df_table)
        markdown_table = display_markdown(mat, use_header=False)

        # The table as a whole is positioned by the bounds of all its boxes.
        all_boxes = np.stack(df_table.paddle_box.values)
        table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])

        df_table_to_text = pd.DataFrame(
            [
                {
                    "paddle_box": table_box,
                    "text": markdown_table,
                    "is_table": True,
                }
            ]
        )
        # Final text representations dataframe
        df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)

    df_text = df_text.rename(columns={"paddle_box": "box"})

    # Sort by y and x, using box centres binned to 20 (y) and 10 (x) units
    df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
    df_text["y"] = df_text["box"].apply(lambda x: (x[1] + x[3]) / 2)
    df_text["x"] = (df_text["x"] - df_text["x"].min()) // 10
    df_text["y"] = (df_text["y"] - df_text["y"].min()) // 20
    df_text = df_text.sort_values(["y", "x"], ignore_index=True)

    # Loop over lines
    rows_list = []
    for r, df_row in df_text.groupby("y"):
        if df_row["is_table"].values.any():  # Add table
            table = df_row[df_row["is_table"]]
            df_row = df_row[~df_row["is_table"]]
        else:
            table = None

        if len(df_row) > 1:  # Add text
            df_row = df_row.reset_index(drop=True)
            df_row["text"] = "\n".join(df_row["text"].values.tolist())

        # Every row now carries the merged text, so the first one is enough.
        rows_list.append(df_row.head(1))

        if table is not None:
            rows_list.append(table)

    df_display = pd.concat(rows_list, ignore_index=True)
    result = "\n".join(df_display.text.values.tolist())

    return result
298
+
299
+
300
def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
    """
    Pick the candidate boxes that best cover a reference `paddle_box`.

    The score of a candidate is its intersection area with `paddle_box` divided
    by the area of `paddle_box` (not a symmetric IoU).

    Args:
        paddle_box (list or numpy.ndarray): Reference box [x_min, y_min, x_max, y_max].
        boxes (numpy.ndarray): Candidate boxes with shape (N, 4).
        delta (float, optional): Candidates scoring at least `best / delta` are kept.
            Defaults to 2.0.
        min_overlap (float, optional): The best score must exceed this value for
            anything to be returned. Defaults to 0.25.

    Returns:
        numpy.ndarray or list: Indices of the kept candidates, ordered by
        decreasing score. An empty list when there are no candidates or the
        best score does not exceed `min_overlap`.
    """
    if len(boxes) == 0:
        return []

    candidates = np.array(boxes)
    ref_x0, ref_y0, ref_x1, ref_y1 = paddle_box

    # Extent of the intersection along each axis (negative when disjoint).
    overlap_h = np.minimum(ref_y1, candidates[:, 3]) - np.maximum(ref_y0, candidates[:, 1])
    overlap_w = np.minimum(ref_x1, candidates[:, 2]) - np.maximum(ref_x0, candidates[:, 0])
    overlap_area = np.maximum(0, overlap_h) * np.maximum(0, overlap_w)

    # Normalise by the reference area; the epsilon guards zero-area boxes.
    ref_area = (ref_y1 - ref_y0) * (ref_x1 - ref_x0)
    coverage = overlap_area / (ref_area + 1e-6)

    best = np.max(coverage)
    if best <= min_overlap:  # No match
        return []

    n_keep = np.count_nonzero(coverage >= (best / delta))
    return np.argsort(-coverage)[:n_keep]
345
+
346
+
347
def build_markdown(df):
    """
    Arrange the texts of a dataframe into a row/column grid of strings.

    Args:
        df (pandas DataFrame): Needs the columns ``row_ids``, ``col_ids`` and
            ``text``. ``row_ids`` / ``col_ids`` hold, per entry, the indices of
            the grid rows / columns the text belongs to; a scalar (such as -1)
            marks an entry without assignment, which is skipped.

    Returns:
        list[list]: The grid as a list of rows of strings, with rows and columns
        that contain no text removed. Texts landing in the same position are
        joined with a space. Empty list for an empty dataframe.
    """
    df = df.reset_index(drop=True)
    if not len(df):
        return []

    n_cols = max(int(np.max(c)) for c in df["col_ids"].values)
    n_rows = max(int(np.max(c)) for c in df["row_ids"].values)

    # Explicit empty strings: np.empty(..., dtype=str) is uninitialised memory
    # and is not guaranteed to hold blank cells.
    mat = [[""] * (n_cols + 1) for _ in range(n_rows + 1)]

    for row_ids, col_ids, text in zip(df["row_ids"], df["col_ids"], df["text"]):
        # Scalars (Python or NumPy) mean "no row/column assigned".
        if np.ndim(row_ids) == 0 or np.ndim(col_ids) == 0:
            continue
        for r in row_ids:
            for c in col_ids:
                mat[r][c] = (mat[r][c] + " " + text).strip()

    # Remove empty rows & columns
    mat = [row for row in mat if any(row)]
    kept_cols = [j for j in range(n_cols + 1) if any(row[j] for row in mat)]
    mat = [[row[j] for j in kept_cols] for row in mat]

    return mat
375
+
376
+
377
def merge_text_in_cell(df_cell):
    """
    Merge several text entries assigned to the same cell into a single entry.

    The entries are ordered by the top-left corner of their ``paddle_box``,
    binned to 10 units in y then x, and their texts are joined with spaces.

    Args:
        df_cell (pandas.DataFrame): Entries to merge. Needs the columns
            ``paddle_box`` (x0, y0, x1, y1), ``cell`` and ``text``. It is not
            modified.

    Returns:
        pandas.DataFrame: A one-row DataFrame (the first entry in sorted order)
        whose ``text`` is the merged text and whose ``paddle_box`` is replaced
        by the value of ``cell``.
    """
    # Work on a copy so the caller's frame (typically a groupby slice) is left
    # untouched and no chained-assignment warnings are triggered.
    merged = df_cell.copy()
    paddle_boxes = np.stack(merged["paddle_box"].values)

    merged["x"] = (paddle_boxes[:, 0] - paddle_boxes[:, 0].min()) // 10
    merged["y"] = (paddle_boxes[:, 1] - paddle_boxes[:, 1].min()) // 10
    merged = merged.sort_values(["y", "x"])

    text = " ".join(merged["text"].values.tolist())

    merged = merged.head(1).copy()
    merged["text"] = text
    # The merged entry spans the whole cell rather than one OCR box.
    merged["paddle_box"] = merged["cell"]

    return merged.drop(columns=["x", "y"])
401
+
402
+
403
def remove_empty_row(mat):
    """
    Remove empty rows from a matrix.

    A row is empty when every cell in it has length zero; a row without any
    cell is also considered empty.

    Args:
        mat (list[list]): The matrix to remove empty rows from.

    Returns:
        list[list]: A new list holding the rows that contain at least one
        non-empty cell, in their original order.
    """
    # any() also copes with zero-length rows, where max() would raise.
    return [row for row in mat if any(len(cell) for cell in row)]
418
+
419
+
420
def display_markdown(
    data: list[list[str]],
    use_header: bool = False,
) -> str:
    """
    Render a list of rows of strings as pipe-delimited markdown lines.

    Parameters:
        data (list[list[str]]): The table rows. Shorter rows are padded with
            empty cells up to the length of the longest row.
        use_header (bool, optional): When True, the first row is rendered as a
            header followed by a ``---`` separator line. Defaults to False.

    Returns:
        str: The rendered table, or the literal ``"EMPTY TABLE"`` when `data`
        has no rows.
    """
    if len(data) == 0:
        return "EMPTY TABLE"

    width = max(len(row) for row in data)

    # Pad each row to the common width, then render it as "| a | b |".
    rendered = []
    for row in data:
        padded = row + [""] * (width - len(row))
        rendered.append("| " + " | ".join(padded) + " |")

    if not use_header:
        return "\n".join(rendered)

    separator = "| " + " | ".join(["---"] * width) + " |"
    return "\n".join([rendered[0], separator] + rendered[1:])