nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. More details are available on the original page (the target of its "Click here" link was not preserved in this text export).

Files changed (153)
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -72
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -398
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +0 -152
  151. nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
@@ -1,344 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- from concurrent.futures import ThreadPoolExecutor
7
- from typing import Any, Union
8
- from typing import Dict
9
- from typing import List
10
- from typing import Optional
11
- from typing import Tuple
12
-
13
- import numpy as np
14
- import pandas as pd
15
-
16
- from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskTableExtraction
17
- from nv_ingest_api.internal.enums.common import TableFormatEnum
18
- from nv_ingest_api.internal.primitives.nim.model_interface.paddle import PaddleOCRModelInterface
19
- from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
20
- from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_table_structure_and_paddle_output
21
- from nv_ingest_api.util.image_processing.table_and_chart import convert_paddle_response_to_psuedo_markdown
22
- from nv_ingest_api.internal.primitives.nim import NimClient
23
- from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxTableStructureModelInterface
24
- from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
25
- from nv_ingest_api.util.nim import create_inference_client
26
-
27
- logger = logging.getLogger(__name__)
28
-
29
- PADDLE_MIN_WIDTH = 32
30
- PADDLE_MIN_HEIGHT = 32
31
-
32
-
33
- def _filter_valid_images(base64_images: List[str]) -> Tuple[List[str], List[np.ndarray], List[int]]:
34
- """
35
- Filter base64-encoded images by their dimensions.
36
-
37
- Returns three lists:
38
- - valid_images: The base64 strings that meet minimum size requirements.
39
- - valid_arrays: The corresponding numpy arrays.
40
- - valid_indices: The original indices in the input list.
41
- """
42
- valid_images: List[str] = []
43
- valid_arrays: List[np.ndarray] = []
44
- valid_indices: List[int] = []
45
-
46
- for i, img in enumerate(base64_images):
47
- array = base64_to_numpy(img)
48
- height, width = array.shape[0], array.shape[1]
49
- if width >= PADDLE_MIN_WIDTH and height >= PADDLE_MIN_HEIGHT:
50
- valid_images.append(img)
51
- valid_arrays.append(array)
52
- valid_indices.append(i)
53
- else:
54
- # Image is too small; skip it.
55
- continue
56
-
57
- return valid_images, valid_arrays, valid_indices
58
-
59
-
60
- def _run_inference(
61
- enable_yolox: bool,
62
- yolox_client: Any,
63
- paddle_client: Any,
64
- valid_arrays: List[np.ndarray],
65
- valid_images: List[str],
66
- trace_info: Optional[Dict] = None,
67
- ) -> Tuple[List[Any], List[Any]]:
68
- """
69
- Run inference concurrently for YOLOX (if enabled) and Paddle.
70
-
71
- Returns a tuple of (yolox_results, paddle_results).
72
- """
73
- data_paddle = {"base64_images": valid_images}
74
- if enable_yolox:
75
- data_yolox = {"images": valid_arrays}
76
-
77
- with ThreadPoolExecutor(max_workers=2) as executor:
78
- future_yolox = None
79
- if enable_yolox:
80
- future_yolox = executor.submit(
81
- yolox_client.infer,
82
- data=data_yolox,
83
- model_name="yolox",
84
- stage_name="table_data_extraction",
85
- max_batch_size=8,
86
- trace_info=trace_info,
87
- )
88
- future_paddle = executor.submit(
89
- paddle_client.infer,
90
- data=data_paddle,
91
- model_name="paddle",
92
- stage_name="table_data_extraction",
93
- max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
94
- trace_info=trace_info,
95
- )
96
-
97
- if enable_yolox:
98
- try:
99
- yolox_results = future_yolox.result()
100
- except Exception as e:
101
- logger.error(f"Error calling yolox_client.infer: {e}", exc_info=True)
102
- raise
103
- else:
104
- yolox_results = [None] * len(valid_images)
105
-
106
- try:
107
- paddle_results = future_paddle.result()
108
- except Exception as e:
109
- logger.error(f"Error calling paddle_client.infer: {e}", exc_info=True)
110
- raise
111
-
112
- return yolox_results, paddle_results
113
-
114
-
115
- def _validate_inference_results(
116
- yolox_results: Any,
117
- paddle_results: Any,
118
- valid_arrays: List[Any],
119
- valid_images: List[str],
120
- ) -> Tuple[List[Any], List[Any]]:
121
- """
122
- Validate that both inference results are lists and have the expected lengths.
123
-
124
- If not, default values are assigned. Raises a ValueError if the lengths do not match.
125
- """
126
- if not isinstance(yolox_results, list) or not isinstance(paddle_results, list):
127
- logger.warning(
128
- "Unexpected result types from inference clients: yolox_results=%s, paddle_results=%s. "
129
- "Proceeding with available results.",
130
- type(yolox_results).__name__,
131
- type(paddle_results).__name__,
132
- )
133
- if not isinstance(yolox_results, list):
134
- yolox_results = [None] * len(valid_arrays)
135
- if not isinstance(paddle_results, list):
136
- paddle_results = [(None, None)] * len(valid_images)
137
-
138
- if len(yolox_results) != len(valid_arrays):
139
- raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
140
- if len(paddle_results) != len(valid_images):
141
- raise ValueError(f"Expected {len(valid_images)} paddle results, got {len(paddle_results)}")
142
-
143
- return yolox_results, paddle_results
144
-
145
-
146
- def _update_table_metadata(
147
- base64_images: List[str],
148
- yolox_client: Any,
149
- paddle_client: Any,
150
- worker_pool_size: int = 8, # Not currently used
151
- enable_yolox: bool = False,
152
- trace_info: Optional[Dict] = None,
153
- ) -> List[Tuple[str, Any, Any, Any]]:
154
- """
155
- Given a list of base64-encoded images, this function filters out images that do not meet
156
- the minimum size requirements and then calls the PaddleOCR model via paddle_client.infer
157
- to extract table data.
158
-
159
- For each base64-encoded image, the result is a tuple:
160
- (base64_image, yolox_result, paddle_text_predictions, paddle_bounding_boxes)
161
-
162
- Images that do not meet the minimum size are skipped (resulting in placeholders).
163
- The paddle_client is expected to handle any necessary batching and concurrency.
164
- """
165
- logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")
166
-
167
- # Initialize the results list with default placeholders.
168
- results: List[Tuple[str, Any, Any, Any]] = [("", None, None, None)] * len(base64_images)
169
-
170
- # Filter valid images based on size requirements.
171
- valid_images, valid_arrays, valid_indices = _filter_valid_images(base64_images)
172
-
173
- if not valid_images:
174
- return results
175
-
176
- # Run inference concurrently.
177
- yolox_results, paddle_results = _run_inference(
178
- enable_yolox=enable_yolox,
179
- yolox_client=yolox_client,
180
- paddle_client=paddle_client,
181
- valid_arrays=valid_arrays,
182
- valid_images=valid_images,
183
- trace_info=trace_info,
184
- )
185
-
186
- # Validate that the inference results have the expected structure.
187
- yolox_results, paddle_results = _validate_inference_results(
188
- yolox_results, paddle_results, valid_arrays, valid_images
189
- )
190
-
191
- # Combine results with the original order.
192
- for idx, (yolox_res, paddle_res) in enumerate(zip(yolox_results, paddle_results)):
193
- original_index = valid_indices[idx]
194
- results[original_index] = (base64_images[original_index], yolox_res, paddle_res[0], paddle_res[1])
195
-
196
- return results
197
-
198
-
199
- def _create_clients(
200
- yolox_endpoints: Tuple[str, str],
201
- yolox_protocol: str,
202
- paddle_endpoints: Tuple[str, str],
203
- paddle_protocol: str,
204
- auth_token: str,
205
- ) -> Tuple[NimClient, NimClient]:
206
- yolox_model_interface = YoloxTableStructureModelInterface()
207
- paddle_model_interface = PaddleOCRModelInterface()
208
-
209
- logger.debug(f"Inference protocols: yolox={yolox_protocol}, paddle={paddle_protocol}")
210
-
211
- yolox_client = create_inference_client(
212
- endpoints=yolox_endpoints,
213
- model_interface=yolox_model_interface,
214
- auth_token=auth_token,
215
- infer_protocol=yolox_protocol,
216
- )
217
-
218
- paddle_client = create_inference_client(
219
- endpoints=paddle_endpoints,
220
- model_interface=paddle_model_interface,
221
- auth_token=auth_token,
222
- infer_protocol=paddle_protocol,
223
- )
224
-
225
- return yolox_client, paddle_client
226
-
227
-
228
- def extract_table_data_from_image_internal(
229
- df_extraction_ledger: pd.DataFrame,
230
- task_config: Union[IngestTaskTableExtraction, Dict[str, Any]],
231
- extraction_config: TableExtractorSchema,
232
- execution_trace_log: Optional[Dict] = None,
233
- ) -> Tuple[pd.DataFrame, Dict]:
234
- """
235
- Extracts table data from a DataFrame in a bulk fashion rather than row-by-row,
236
- following the chart extraction pattern.
237
-
238
- Parameters
239
- ----------
240
- df_extraction_ledger : pd.DataFrame
241
- DataFrame containing the content from which table data is to be extracted.
242
- task_config : Dict[str, Any]
243
- Dictionary containing task properties and configurations.
244
- extraction_config : Any
245
- The validated configuration object for table extraction.
246
- execution_trace_log : Optional[Dict], optional
247
- Optional trace information for debugging or logging. Defaults to None.
248
-
249
- Returns
250
- -------
251
- Tuple[pd.DataFrame, Dict]
252
- A tuple containing the updated DataFrame and the trace information.
253
- """
254
-
255
- _ = task_config # unused
256
-
257
- if execution_trace_log is None:
258
- execution_trace_log = {}
259
- logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
260
-
261
- if df_extraction_ledger.empty:
262
- return df_extraction_ledger, execution_trace_log
263
-
264
- endpoint_config = extraction_config.endpoint_config
265
- yolox_client, paddle_client = _create_clients(
266
- endpoint_config.yolox_endpoints,
267
- endpoint_config.yolox_infer_protocol,
268
- endpoint_config.paddle_endpoints,
269
- endpoint_config.paddle_infer_protocol,
270
- endpoint_config.auth_token,
271
- )
272
-
273
- try:
274
- # 1) Identify rows that meet criteria (structured, subtype=table, table_metadata != None, content not empty)
275
- def meets_criteria(row):
276
- m = row.get("metadata", {})
277
- if not m:
278
- return False
279
- content_md = m.get("content_metadata", {})
280
- if (
281
- content_md.get("type") == "structured"
282
- and content_md.get("subtype") == "table"
283
- and m.get("table_metadata") is not None
284
- and m.get("content") not in [None, ""]
285
- ):
286
- return True
287
- return False
288
-
289
- mask = df_extraction_ledger.apply(meets_criteria, axis=1)
290
- valid_indices = df_extraction_ledger[mask].index.tolist()
291
-
292
- # If no rows meet the criteria, just return
293
- if not valid_indices:
294
- return df_extraction_ledger, {"trace_info": execution_trace_log}
295
-
296
- # 2) Extract base64 images in the same order
297
- base64_images = []
298
- for idx in valid_indices:
299
- meta = df_extraction_ledger.at[idx, "metadata"]
300
- base64_images.append(meta["content"])
301
-
302
- # 3) Call our bulk _update_metadata to get all results
303
- table_content_format = (
304
- df_extraction_ledger.at[valid_indices[0], "metadata"]["table_metadata"].get("table_content_format")
305
- or TableFormatEnum.PSEUDO_MARKDOWN
306
- )
307
- enable_yolox = True if table_content_format in (TableFormatEnum.MARKDOWN,) else False
308
-
309
- bulk_results = _update_table_metadata(
310
- base64_images=base64_images,
311
- yolox_client=yolox_client,
312
- paddle_client=paddle_client,
313
- worker_pool_size=endpoint_config.workers_per_progress_engine,
314
- enable_yolox=enable_yolox,
315
- trace_info=execution_trace_log,
316
- )
317
-
318
- # 4) Write the results (bounding_boxes, text_predictions) back
319
- for row_id, idx in enumerate(valid_indices):
320
- # unpack (base64_image, (yolox_predictions, paddle_bounding boxes, paddle_text_predictions))
321
- _, cell_predictions, bounding_boxes, text_predictions = bulk_results[row_id]
322
-
323
- if table_content_format == TableFormatEnum.SIMPLE:
324
- table_content = " ".join(text_predictions)
325
- elif table_content_format == TableFormatEnum.PSEUDO_MARKDOWN:
326
- table_content = convert_paddle_response_to_psuedo_markdown(bounding_boxes, text_predictions)
327
- elif table_content_format == TableFormatEnum.MARKDOWN:
328
- table_content = join_yolox_table_structure_and_paddle_output(
329
- cell_predictions, bounding_boxes, text_predictions
330
- )
331
- else:
332
- raise ValueError(f"Unexpected table format: {table_content_format}")
333
-
334
- df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content"] = table_content
335
- df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content_format"] = table_content_format
336
-
337
- return df_extraction_ledger, {"trace_info": execution_trace_log}
338
-
339
- except Exception:
340
- logger.exception("Error occurred while extracting table data.", exc_info=True)
341
- raise
342
- finally:
343
- yolox_client.close()
344
- paddle_client.close()
nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
@@ -1,19 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- from .adobe import adobe_extractor
6
- from .llama import llama_parse_extractor
7
- from .nemoretriever import nemoretriever_parse_extractor
8
- from .pdfium import pdfium_extractor
9
- from .tika import tika_extractor
10
- from .unstructured_io import unstructured_io_extractor
11
-
12
- __all__ = [
13
- "adobe_extractor",
14
- "llama_parse_extractor",
15
- "nemoretriever_parse_extractor",
16
- "pdfium_extractor",
17
- "tika_extractor",
18
- "unstructured_io_extractor",
19
- ]