nv-ingest-api 2025.3.27.dev20250327__py3-none-any.whl → 2025.3.29.dev20250329__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.29.dev20250329.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.3.29.dev20250329.dist-info/RECORD +9 -0
  6. nv_ingest_api/interface/__init__.py +0 -215
  7. nv_ingest_api/interface/extract.py +0 -972
  8. nv_ingest_api/interface/mutate.py +0 -154
  9. nv_ingest_api/interface/store.py +0 -218
  10. nv_ingest_api/interface/transform.py +0 -382
  11. nv_ingest_api/interface/utility.py +0 -200
  12. nv_ingest_api/internal/enums/__init__.py +0 -3
  13. nv_ingest_api/internal/enums/common.py +0 -494
  14. nv_ingest_api/internal/extract/__init__.py +0 -3
  15. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  17. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  18. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  19. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  20. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  23. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  24. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  25. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  26. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  27. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  28. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  29. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  30. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  34. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  40. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  44. nv_ingest_api/internal/mutate/__init__.py +0 -3
  45. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  46. nv_ingest_api/internal/mutate/filter.py +0 -133
  47. nv_ingest_api/internal/primitives/__init__.py +0 -0
  48. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  49. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  50. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  51. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  52. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  53. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  54. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -272
  55. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  56. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -452
  57. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  58. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  59. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  60. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  61. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  62. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  63. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  64. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  65. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  66. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  67. nv_ingest_api/internal/schemas/__init__.py +0 -3
  68. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  70. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  71. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  72. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  74. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  75. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  76. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  77. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  78. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  79. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  80. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  81. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  82. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  83. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  84. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  85. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  86. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  87. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  88. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  89. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  90. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  91. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  92. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  93. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  94. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  95. nv_ingest_api/internal/store/__init__.py +0 -3
  96. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  97. nv_ingest_api/internal/store/image_upload.py +0 -232
  98. nv_ingest_api/internal/transform/__init__.py +0 -3
  99. nv_ingest_api/internal/transform/caption_image.py +0 -205
  100. nv_ingest_api/internal/transform/embed_text.py +0 -496
  101. nv_ingest_api/internal/transform/split_text.py +0 -157
  102. nv_ingest_api/util/__init__.py +0 -0
  103. nv_ingest_api/util/control_message/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/validators.py +0 -47
  105. nv_ingest_api/util/converters/__init__.py +0 -0
  106. nv_ingest_api/util/converters/bytetools.py +0 -78
  107. nv_ingest_api/util/converters/containers.py +0 -65
  108. nv_ingest_api/util/converters/datetools.py +0 -90
  109. nv_ingest_api/util/converters/dftools.py +0 -127
  110. nv_ingest_api/util/converters/formats.py +0 -64
  111. nv_ingest_api/util/converters/type_mappings.py +0 -27
  112. nv_ingest_api/util/detectors/__init__.py +0 -5
  113. nv_ingest_api/util/detectors/language.py +0 -38
  114. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  115. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  116. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  117. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  118. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  119. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  120. nv_ingest_api/util/image_processing/__init__.py +0 -5
  121. nv_ingest_api/util/image_processing/clustering.py +0 -260
  122. nv_ingest_api/util/image_processing/processing.py +0 -179
  123. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  124. nv_ingest_api/util/image_processing/transforms.py +0 -407
  125. nv_ingest_api/util/logging/__init__.py +0 -0
  126. nv_ingest_api/util/logging/configuration.py +0 -31
  127. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  128. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  129. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  130. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  131. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
  132. nv_ingest_api/util/metadata/__init__.py +0 -5
  133. nv_ingest_api/util/metadata/aggregators.py +0 -469
  134. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  135. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  136. nv_ingest_api/util/nim/__init__.py +0 -56
  137. nv_ingest_api/util/pdf/__init__.py +0 -3
  138. nv_ingest_api/util/pdf/pdfium.py +0 -427
  139. nv_ingest_api/util/schema/__init__.py +0 -0
  140. nv_ingest_api/util/schema/schema_validator.py +0 -10
  141. nv_ingest_api/util/service_clients/__init__.py +0 -3
  142. nv_ingest_api/util/service_clients/client_base.py +0 -72
  143. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  144. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
  146. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  147. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -368
  148. nv_ingest_api/util/string_processing/__init__.py +0 -51
  149. nv_ingest_api-2025.3.27.dev20250327.dist-info/RECORD +0 -152
  150. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  151. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.29.dev20250329.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.29.dev20250329.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.3.27.dev20250327.dist-info → nv_ingest_api-2025.3.29.dev20250329.dist-info}/top_level.txt +0 -0
@@ -1,449 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- import logging
7
- import re
8
-
9
- import numpy as np
10
- import pandas as pd
11
- from sklearn.cluster import DBSCAN
12
-
13
-
14
- logger = logging.getLogger(__name__)
15
-
16
-
17
def process_yolox_graphic_elements(yolox_text_dict):
    """
    Flatten the per-class texts of a yolox-graphic-elements result into one string.

    Parameters
    ----------
    yolox_text_dict : dict
        Mapping from graphic-element class name (e.g. ``"chart_title"``,
        ``"xlabel"``) to the text associated with that class. Missing classes
        are treated as empty strings.

    Returns
    -------
    str
        The class texts joined with single spaces in a fixed class order, with
        leading and trailing whitespace removed. Classes that are absent still
        contribute their separator, so interior runs of spaces are possible.
    """
    # The order below defines the order of the pieces in the output string.
    ordered_classes = (
        "chart_title",
        "caption",
        "x_title",
        "xlabel",
        "y_title",
        "ylabel",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
        "other",
    )

    pieces = [yolox_text_dict.get(class_name, "") for class_name in ordered_classes]
    return " ".join(pieces).strip()
47
-
48
-
49
def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
    """
    Associates a yolox-graphic-elements box to PaddleOCR bboxes, by taking overlapping boxes.
    Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
    Boxes are expected in format (x0, y0, x1, y1)
    Args:
        yolox_box (np array [4]): Reference box.
        paddle_ocr_boxes (np array [n x 4]): PaddleOCR boxes
        already_matched (list or None, Optional): Already matched ids to ignore.
        delta (float, Optional): IoU delta for considering several boxes. Defaults to 2.0.
    Returns:
        np array or list: Indices of the matched bboxes. An empty list is returned when
        there are no candidate boxes or when the best IoU does not exceed 0.01; otherwise
        an integer index array (possibly empty after removing ``already_matched``).
    """
    paddle_ocr_boxes = np.asarray(paddle_ocr_boxes)
    if paddle_ocr_boxes.size == 0:
        # np.max() below would raise on an empty array; nothing can match anyway.
        return []

    x0_1, y0_1, x1_1, y1_1 = yolox_box
    x0_2, y0_2, x1_2, y1_2 = (
        paddle_ocr_boxes[:, 0],
        paddle_ocr_boxes[:, 1],
        paddle_ocr_boxes[:, 2],
        paddle_ocr_boxes[:, 3],
    )

    # Intersection
    inter_y0 = np.maximum(y0_1, y0_2)
    inter_y1 = np.minimum(y1_1, y1_2)
    inter_x0 = np.maximum(x0_1, x0_2)
    inter_x1 = np.minimum(x1_1, x1_2)
    inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)

    # Union
    area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
    area_2 = (y1_2 - y0_2) * (x1_2 - x0_2)
    union_area = area_1 + area_2 - inter_area

    # IoU - degenerate (zero-area) pairs get an IoU of 0 instead of a 0/0 NaN.
    ious = np.divide(
        inter_area,
        union_area,
        out=np.zeros(np.shape(inter_area), dtype=float),
        where=union_area > 0,
    )

    max_iou = np.max(ious)
    if max_iou <= 0.01:
        return []

    matches = np.where(ious > (max_iou / delta))[0]
    if already_matched is not None:
        # Boolean masking keeps the integer dtype even when nothing survives, so the
        # result is always usable as an index array.
        matches = matches[~np.isin(matches, list(already_matched))]
    return matches
93
-
94
-
95
def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, paddle_txts):
    """
    Matching boxes
    We need to associate a text to the paddle detections.
    For each class and for each yolox detection, we look for overlapping text bboxes
    with IoU > max_iou / delta where max_iou is the biggest found overlap.
    Found texts are added to the class representation, and removed from the texts to match

    Args:
        yolox_output (dict): Maps a class name to its detected boxes. Only the first
            four values of each box are used (a trailing score is dropped).
        paddle_boxes: OCR boxes, indexed as ``[:, :, 0]`` (x) and ``[:, :, 1]`` (y),
            i.e. one set of corner points per text.
        paddle_txts: OCR texts, one per entry of ``paddle_boxes``.

    Returns:
        dict: Class name -> joined text for every kept class that has at least one
        detection. Empty dict when there are no OCR texts or boxes.
    """
    KEPT_CLASSES = [  # Classes that are read from yolox_output
        "chart_title",
        "x_title",
        "y_title",
        "xlabel",
        "ylabel",
        "other",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
    ]

    paddle_txts = np.array(paddle_txts)
    paddle_boxes = np.array(paddle_boxes)

    if (paddle_txts.size == 0) or (paddle_boxes.size == 0):
        return {}

    # Reduce each set of corner points to an axis-aligned (x0, y0, x1, y1) box.
    paddle_boxes = np.array(
        [
            paddle_boxes[:, :, 0].min(-1),
            paddle_boxes[:, :, 1].min(-1),
            paddle_boxes[:, :, 0].max(-1),
            paddle_boxes[:, :, 1].max(-1),
        ]
    ).T

    already_matched = []
    results = {}

    for k in KEPT_CLASSES:
        if not len(yolox_output.get(k, [])):  # No bounding boxes
            continue

        texts = []
        for yolox_box in yolox_output[k]:
            # if there's a score at the end, drop the score.
            yolox_box = yolox_box[:4]
            paddle_ids = match_bboxes(yolox_box, paddle_boxes, already_matched=already_matched, delta=4)

            if len(paddle_ids) > 0:
                texts.append(" ".join(paddle_txts[paddle_ids].tolist()))
                # Record the consumed texts so that a later detection cannot claim
                # them again; without this the `already_matched` filter is a no-op.
                already_matched.extend(int(i) for i in paddle_ids)

        # Collapse whitespace runs and repeated dots.
        processed_texts = []
        for t in texts:
            t = re.sub(r"\s+", " ", t)
            t = re.sub(r"\.+", ".", t)
            processed_texts.append(t)

        if "title" in k:
            processed_texts = " ".join(processed_texts)
        else:
            processed_texts = " - ".join(processed_texts)  # Space ?

        results[k] = processed_texts

    return results
162
-
163
-
164
def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
    """
    Lay OCR texts out as pipe-delimited lines, one line per cluster of similar y.

    Args:
        bboxes: OCR boxes; each one is flattened to 8 values, of which values
            0, 1, 2 and the last are used as x0, y0, x1 and y1.
        texts: OCR texts, one per box.

    Returns:
        str: One ``"| a | b |\\n"`` line per cluster, texts ordered by x0 inside a
        line. Empty string when either input is empty.
    """
    if not (bboxes and texts):
        return ""

    flat = np.array(bboxes).astype(int).reshape(-1, 8)
    frame = pd.DataFrame(
        {
            "x0": flat[:, 0],
            "y0": flat[:, 1],
            "x1": flat[:, 2],
            "y1": flat[:, -1],
            "text": texts,
        }
    ).sort_values("y0")

    # Texts whose y0 values chain together within eps=10 end up on the same line.
    clustering = DBSCAN(eps=10, min_samples=1)
    clustering.fit(frame["y0"].values[:, None])
    frame["cluster"] = clustering.labels_

    frame = frame.sort_values(["cluster", "x0"])

    lines = [
        "| " + " | ".join(line["text"].values.tolist()) + " |\n"
        for _, line in frame.groupby("cluster")
    ]
    return "".join(lines)
187
-
188
-
189
def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_boxes, paddle_ocr_txts):
    """
    Combine table-structure detections with OCR output into a text rendering.

    Each OCR text is assigned to a cell, then to row(s) and column(s) of
    ``yolox_cell_preds``. Texts that get both a row and a column are rendered
    together as one markdown table; the remaining texts are kept as plain
    lines. Everything is finally ordered top-to-bottom, left-to-right.

    Args:
        yolox_cell_preds (dict): Detections under the keys ``"cell"``, ``"row"``
            and ``"column"``.
        paddle_ocr_boxes: OCR boxes, indexed as ``[:, :, 0]`` (x) and
            ``[:, :, 1]`` (y), i.e. one set of corner points per text.
        paddle_ocr_txts: OCR texts, one per box.

    Returns:
        str: The newline-joined rendering, or an empty string when either OCR
        input is empty.
    """
    if not (paddle_ocr_boxes and paddle_ocr_txts):
        return ""

    # Reduce each set of corner points to an axis-aligned (x0, y0, x1, y1) box.
    corner_points = np.array(paddle_ocr_boxes)
    text_boxes = np.array(
        [
            corner_points[:, :, 0].min(-1),
            corner_points[:, :, 1].min(-1),
            corner_points[:, :, 0].max(-1),
            corner_points[:, :, 1].max(-1),
        ]
    ).T

    records = []
    for position, (text_box, text) in enumerate(zip(text_boxes, paddle_ocr_txts)):
        # Cell lookup: fall back to the OCR box itself when no cell matches.
        cell_hits = assign_boxes(text_box, yolox_cell_preds["cell"], delta=1)
        cell = yolox_cell_preds["cell"][cell_hits[0]] if len(cell_hits) else text_box

        # Row lookup: -1 marks "no row".
        row_hits = assign_boxes(cell, yolox_cell_preds["row"], delta=1)
        row_ids = row_hits if len(row_hits) else -1

        # Column lookup: a looser delta is used when the first row is involved.
        if isinstance(row_ids, np.ndarray) and row_ids.min() == 0:
            column_delta = 2
        else:
            column_delta = 1
        col_hits = assign_boxes(cell, yolox_cell_preds["column"], delta=column_delta)
        col_ids = col_hits if len(col_hits) else -1

        records.append(
            {
                "index": position,
                "paddle_box": text_box,
                "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                "cell_id": cell_hits[0] if len(cell_hits) else -1,
                "cell": cell,
                "col_ids": col_ids,
                "row_ids": row_ids,
                "text": text,
            }
        )
    df_assign = pd.DataFrame(records)

    # Texts sharing a real cell (cell_id > -1) are merged into a single entry.
    per_cell = [
        merge_text_in_cell(group) if len(group) > 1 and cell_id > -1 else group
        for cell_id, group in df_assign.groupby("cell_id")
    ]
    df_assign = pd.concat(per_cell)

    df_text = df_assign[~df_assign["is_table"]].reset_index(drop=True)
    df_table = df_assign[df_assign["is_table"]].reset_index(drop=True)

    if len(df_table):
        # Render all table texts as one markdown block located at their joint box.
        markdown_table = display_markdown(build_markdown(df_table), use_header=False)

        table_text_boxes = np.stack(df_table.paddle_box.values)
        table_box = np.concatenate(
            [table_text_boxes[:, [0, 1]].min(0), table_text_boxes[:, [2, 3]].max(0)]
        )

        table_entry = pd.DataFrame(
            [
                {
                    "paddle_box": table_box,
                    "text": markdown_table,
                    "is_table": True,
                }
            ]
        )
        df_text = pd.concat([df_text, table_entry], ignore_index=True)

    df_text = df_text.rename(columns={"paddle_box": "box"})

    # Order by box centre, quantised into 10-wide columns and 20-high lines.
    df_text["x"] = df_text["box"].apply(lambda box: (box[0] + box[2]) / 2)
    df_text["y"] = df_text["box"].apply(lambda box: (box[1] + box[3]) / 2)
    df_text["x"] = (df_text["x"] - df_text["x"].min()) // 10
    df_text["y"] = (df_text["y"] - df_text["y"].min()) // 20
    df_text = df_text.sort_values(["y", "x"], ignore_index=True)

    # Per line: the plain texts are collapsed into one entry, followed by the table if present.
    pieces = []
    for _, line in df_text.groupby("y"):
        table_part = None
        if line["is_table"].values.any():
            table_part = line[line["is_table"]]
            line = line[~line["is_table"]]

        if len(line) > 1:
            line = line.reset_index(drop=True)
            line["text"] = "\n".join(line["text"].values.tolist())

        pieces.append(line.head(1))

        if table_part is not None:
            pieces.append(table_part)

    df_display = pd.concat(pieces, ignore_index=True)
    return "\n".join(df_display.text.values.tolist())
298
-
299
-
300
def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
    """
    Pick the candidate boxes that best cover a reference `paddle_box`.

    The score of a candidate is its intersection area with `paddle_box` divided
    by the area of `paddle_box` (not a symmetric IoU).

    Args:
        paddle_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
        boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
        delta (float, optional): Candidates scoring at least ``best / delta`` are kept. Defaults to 2.0.
        min_overlap (float, optional): The best score must exceed this value. Defaults to 0.25.

    Returns:
        numpy.ndarray or list: Indices of the kept candidates ordered by decreasing
        score, or an empty list if there are no candidates or no sufficient overlap.
    """
    if len(boxes) == 0:
        return []

    candidates = np.array(boxes)
    ref_x0, ref_y0, ref_x1, ref_y1 = paddle_box

    # Height and width of the intersection, clipped at 0 for disjoint boxes.
    overlap_h = np.maximum(0, np.minimum(ref_y1, candidates[:, 3]) - np.maximum(ref_y0, candidates[:, 1]))
    overlap_w = np.maximum(0, np.minimum(ref_x1, candidates[:, 2]) - np.maximum(ref_x0, candidates[:, 0]))

    # Normalise by the reference area; the epsilon avoids dividing by zero.
    ref_area = (ref_y1 - ref_y0) * (ref_x1 - ref_x0)
    coverage = overlap_h * overlap_w / (ref_area + 1e-6)

    best = np.max(coverage)
    if best <= min_overlap:  # No match
        return []

    n_kept = np.count_nonzero(coverage >= (best / delta))
    return np.argsort(-coverage)[:n_kept]
345
-
346
-
347
def build_markdown(df):
    """
    Convert a dataframe of cell assignments into a matrix of cell texts.

    Args:
        df (pandas DataFrame): One entry per text with the columns ``row_ids`` and
            ``col_ids`` (iterables of indices, or an ``int`` marking "unassigned")
            and ``text``.

    Returns:
        list[list]: A list of rows, each a list of cell strings. A text assigned to
        several rows/columns is repeated in each of them; texts sharing a cell are
        joined with a space. Empty rows and columns are removed. An empty dataframe
        gives an empty list.
    """
    df = df.reset_index(drop=True)
    if df.empty:
        # max() below would raise on an empty sequence.
        return []

    n_cols = int(max(np.max(c) for c in df["col_ids"].values))
    n_rows = int(max(np.max(c) for c in df["row_ids"].values))

    # Explicit empty strings: np.empty() leaves its entries uninitialised, so it
    # cannot be relied upon to produce blank cells.
    mat = [[""] * (n_cols + 1) for _ in range(n_rows + 1)]

    for row_ids, col_ids, text in zip(df["row_ids"], df["col_ids"], df["text"]):
        if isinstance(row_ids, int) or isinstance(col_ids, int):
            continue  # Entry without a row or column assignment
        for r in row_ids:
            for c in col_ids:
                mat[r][c] = (mat[r][c] + " " + text).strip()

    # Remove empty rows, then empty columns (by filtering the rows of the transpose).
    mat = remove_empty_row(mat)
    columns = remove_empty_row([list(col) for col in zip(*mat)])
    mat = [list(row) for row in zip(*columns)]

    return mat
375
-
376
-
377
def merge_text_in_cell(df_cell):
    """
    Merges text from multiple rows into a single cell and recalculates its bounding box.
    Values are sorted by rounded (y, x) coordinates.

    The input dataframe is left untouched; all work happens on copies.

    Args:
        df_cell (pandas.DataFrame): DataFrame containing cells to merge. Uses the
            columns ``paddle_box`` (x0, y0, ... per entry), ``text`` and ``cell``.

    Returns:
        pandas.DataFrame: A single-row DataFrame (the first entry in reading order)
        whose ``text`` is the space-joined text of all entries and whose
        ``paddle_box`` is replaced by the entry's ``cell`` box.
    """
    paddle_boxes = np.stack(df_cell["paddle_box"].values)

    # Offsets from the top-left-most text, bucketed by 10 so that texts on roughly
    # the same line sort left-to-right. assign() returns a new frame, so the
    # caller's dataframe does not gain the temporary columns.
    ordered = df_cell.assign(
        x=(paddle_boxes[:, 0] - paddle_boxes[:, 0].min()) // 10,
        y=(paddle_boxes[:, 1] - paddle_boxes[:, 1].min()) // 10,
    ).sort_values(["y", "x"])

    text = " ".join(ordered["text"].values.tolist())

    # Explicit copy: assigning into a bare .head(1) slice is chained assignment.
    merged = ordered.head(1).copy()
    merged["text"] = text
    merged["paddle_box"] = merged["cell"]

    return merged.drop(columns=["x", "y"])
401
-
402
-
403
def remove_empty_row(mat):
    """
    Remove empty rows from a matrix.

    A row is empty when every cell in it has length 0; a row without any cells
    counts as empty too.

    Args:
        mat (list[list]): The matrix to remove empty rows from.

    Returns:
        list[list]: A new list holding the non-empty rows, in their original order.
    """
    # any() is False for a zero-length row, where max([]) would raise ValueError.
    return [row for row in mat if any(len(cell) for cell in row)]
418
-
419
-
420
def display_markdown(
    data: list[list[str]],
    use_header: bool = False,
) -> str:
    """
    Render a list of rows of strings as a markdown table.

    Parameters:
        data (list[list[str]]): The table rows. Shorter rows are padded with empty
            cells up to the length of the longest row.
        use_header (bool, optional): When True, the first row is rendered as the
            header and followed by a ``---`` separator line. Defaults to False.

    Returns:
        str: The markdown-formatted table, or ``"EMPTY TABLE"`` when `data` has no rows.
    """
    if len(data) == 0:
        return "EMPTY TABLE"

    width = max(len(row) for row in data)
    padded = [row + [""] * (width - len(row)) for row in data]

    def render(cells):
        # One table line: cells separated and enclosed by pipes.
        return "| " + " | ".join(cells) + " |"

    if not use_header:
        return "\n".join(render(row) for row in padded)

    lines = [render(padded[0]), render(["---"] * width)]
    lines.extend(render(row) for row in padded[1:])
    return "\n".join(lines)