nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,84 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ import uuid
8
+ from typing import Optional, Dict, Any, Union, Tuple, List
9
+
10
+ import pandas as pd
11
+
12
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
13
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
14
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
15
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
16
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
17
+
18
+ from markitdown.converters import HtmlConverter
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@unified_exception_handler
def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
    """
    Convert the HTML held in a ledger row and package it with validated metadata.

    Parameters
    ----------
    row : pd.Series
        Ledger row; the "metadata" and "content" entries are read via ``row.get``.
    execution_trace_log : Optional[List[Any]], default=None
        Accepted for signature compatibility; not used in this function.

    Returns
    -------
    list
        A single-element list ``[[ContentTypeEnum.TEXT, metadata_dict, uuid_str]]`` where
        ``metadata_dict`` is the ``model_dump()`` of the metadata validated against
        ``MetadataSchema`` and ``uuid_str`` is a freshly generated UUID4 string.

    Notes
    -----
    When the row has truthy content, the row's metadata object is updated in place:
    its "content" key is overwritten with the converter's ``text_content``.
    """
    row_metadata = row.get("metadata")
    raw_html = row.get("content")

    if raw_html:
        conversion = HtmlConverter().convert_string(html_content=raw_html)
        # In-place update: the caller's metadata object receives the converted text.
        row_metadata["content"] = conversion.text_content

    validated_metadata = validate_schema(row_metadata, MetadataSchema).model_dump()
    return [[ContentTypeEnum.TEXT, validated_metadata, str(uuid.uuid4())]]
34
+
35
+
36
def extract_markdown_from_html_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: HtmlExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Union[Dict, None]]:
    """
    Processes a pandas DataFrame containing HTML file content, extracting html as text from
    each document and converting it to markdown.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing html files as raw text. Each row is passed to
        ``_convert_html``, which reads its "metadata" and "content" entries.
    task_config : Dict[str, Any]
        Configuration instructions for the document processing task. Not read by this function.
    extraction_config : HtmlExtractorSchema
        Configuration object for document extraction. Not read by this function.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Optional trace information; forwarded unchanged to ``_convert_html``.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A DataFrame with the columns "document_type", "metadata", and "uuid" (empty when the
        input is empty or no row produced a result), and an empty dictionary.

    Raises
    ------
    Exception
        Any exception escaping ``_convert_html`` for a row propagates to the caller.
    """
    output_columns = ["document_type", "metadata", "uuid"]

    # An empty ledger has nothing to convert. Returning here also avoids pandas' special
    # handling of ``apply`` on empty frames, which may hand back a DataFrame instead of a
    # Series and make the column-less ``explode()`` call below fail.
    if df_extraction_ledger.empty:
        return pd.DataFrame({name: [] for name in output_columns}), {}

    # Apply the _convert_html function to each row in the DataFrame.
    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)

    # Explode any list results and drop missing values.
    sr_extraction = sr_extraction.explode().dropna()

    # Convert the extraction results to a DataFrame if available.
    if not sr_extraction.empty:
        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=output_columns)
    else:
        extracted_df = pd.DataFrame({name: [] for name in output_columns})

    return extracted_df, {}
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,375 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from typing import Any, Union
8
+ from typing import Dict
9
+ from typing import List
10
+ from typing import Optional
11
+ from typing import Tuple
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
17
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskChartExtraction
18
+ from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_graphic_elements_and_ocr_output
19
+ from nv_ingest_api.util.image_processing.table_and_chart import process_yolox_graphic_elements
20
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import PaddleOCRModelInterface
21
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import NemoRetrieverOCRModelInterface
22
+ from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
23
+ from nv_ingest_api.internal.primitives.nim import NimClient
24
+ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxGraphicElementsModelInterface
25
+ from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
26
+ from nv_ingest_api.util.nim import create_inference_client
27
+
28
+ PADDLE_MIN_WIDTH = 32
29
+ PADDLE_MIN_HEIGHT = 32
30
+
31
+ logger = logging.getLogger(f"ray.{__name__}")
32
+
33
+
34
def _filter_valid_chart_images(
    base64_images: List[str],
) -> Tuple[List[str], List[np.ndarray], List[int], List[Tuple[str, Optional[Dict]]]]:
    """
    Split base64-encoded images into those large enough for chart extraction and those not.

    Every image is decoded with ``base64_to_numpy``; it is kept when its width is at least
    ``PADDLE_MIN_WIDTH`` and its height is at least ``PADDLE_MIN_HEIGHT``.

    Returns:
      - valid_images: Base64 strings meeting the size requirements.
      - valid_arrays: The decoded arrays for those strings, in the same order.
      - valid_indices: Positions of the kept images within ``base64_images``.
      - results: One entry per input image, pre-filled with ``("", None)``; entries for
        images that are too small are replaced by ``(img, None)``.
    """
    kept_images: List[str] = []
    kept_arrays: List[np.ndarray] = []
    kept_positions: List[int] = []
    outcome: List[Tuple[str, Optional[Dict]]] = [("", None) for _ in base64_images]

    for position, encoded in enumerate(base64_images):
        decoded = base64_to_numpy(encoded)
        img_height = decoded.shape[0]
        img_width = decoded.shape[1]

        if img_width < PADDLE_MIN_WIDTH or img_height < PADDLE_MIN_HEIGHT:
            # Image is too small; mark as skipped.
            outcome[position] = (encoded, None)
            continue

        kept_images.append(encoded)
        kept_arrays.append(decoded)
        kept_positions.append(position)

    return kept_images, kept_arrays, kept_positions, outcome
62
+
63
+
64
+ def _run_chart_inference(
65
+ yolox_client: Any,
66
+ ocr_client: Any,
67
+ ocr_model_name: str,
68
+ valid_arrays: List[np.ndarray],
69
+ valid_images: List[str],
70
+ trace_info: Dict,
71
+ ) -> Tuple[List[Any], List[Any]]:
72
+ """
73
+ Run concurrent inference for chart extraction using YOLOX and Paddle.
74
+
75
+ Returns a tuple of (yolox_results, ocr_results).
76
+ """
77
+ data_yolox = {"images": valid_arrays}
78
+ data_ocr = {"base64_images": valid_images}
79
+
80
+ future_yolox_kwargs = dict(
81
+ data=data_yolox,
82
+ model_name="yolox_ensemble",
83
+ stage_name="chart_extraction",
84
+ input_names=["INPUT_IMAGES", "THRESHOLDS"],
85
+ dtypes=["BYTES", "FP32"],
86
+ output_names=["OUTPUT"],
87
+ trace_info=trace_info,
88
+ max_batch_size=8,
89
+ )
90
+ future_ocr_kwargs = dict(
91
+ data=data_ocr,
92
+ stage_name="chart_extraction",
93
+ trace_info=trace_info,
94
+ )
95
+ if ocr_model_name == "paddle":
96
+ future_ocr_kwargs.update(
97
+ model_name="paddle",
98
+ max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
99
+ )
100
+ elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
101
+ future_ocr_kwargs.update(
102
+ model_name=ocr_model_name,
103
+ input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
104
+ output_names=["OUTPUT"],
105
+ dtypes=["BYTES", "BYTES"],
106
+ merge_level="paragraph",
107
+ )
108
+ else:
109
+ raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
110
+
111
+ with ThreadPoolExecutor(max_workers=2) as executor:
112
+ future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
113
+ future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
114
+
115
+ try:
116
+ yolox_results = future_yolox.result()
117
+ except Exception as e:
118
+ logger.error(f"Error calling yolox_client.infer: {e}", exc_info=True)
119
+ raise
120
+
121
+ try:
122
+ ocr_results = future_ocr.result()
123
+ except Exception as e:
124
+ logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
125
+ raise
126
+
127
+ return yolox_results, ocr_results
128
+
129
+
130
+ def _validate_chart_inference_results(
131
+ yolox_results: Any,
132
+ ocr_results: Any,
133
+ valid_arrays: List[Any],
134
+ valid_images: List[str],
135
+ ) -> Tuple[List[Any], List[Any]]:
136
+ """
137
+ Ensure inference results are lists and have expected lengths.
138
+
139
+ Raises:
140
+ ValueError if results do not match expected types or lengths.
141
+ """
142
+ if not (isinstance(yolox_results, list) and isinstance(ocr_results, list)):
143
+ raise ValueError("Expected list results from both yolox_client and ocr_client infer calls.")
144
+
145
+ if len(yolox_results) != len(valid_arrays):
146
+ raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
147
+ if len(ocr_results) != len(valid_images):
148
+ raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")
149
+ return yolox_results, ocr_results
150
+
151
+
152
def _merge_chart_results(
    base64_images: List[str],
    valid_indices: List[int],
    yolox_results: List[Any],
    ocr_results: List[Any],
    initial_results: List[Tuple[str, Optional[Dict]]],
) -> List[Tuple[str, Optional[Dict]]]:
    """
    Write per-image chart content into ``initial_results`` at the original image positions.

    YOLOX and OCR results are walked pairwise. Each OCR result is unpacked as a three-item
    sequence whose first two items (bounding boxes and text predictions) are joined with
    the YOLOX result and turned into chart content. The n-th pair is stored at
    ``valid_indices[n]`` as ``(base64_images[index], chart_content)``.

    ``initial_results`` is modified in place and also returned.
    """
    for position, pair in enumerate(zip(yolox_results, ocr_results)):
        yolox_output, ocr_output = pair

        # Only the boxes and texts are used; the third OCR element is discarded.
        boxes, texts, _unused = ocr_output
        joined_elements = join_yolox_graphic_elements_and_ocr_output(yolox_output, boxes, texts)
        content = process_yolox_graphic_elements(joined_elements)

        target = valid_indices[position]
        initial_results[target] = (base64_images[target], content)

    return initial_results
173
+
174
+
175
def _update_chart_metadata(
    base64_images: List[str],
    yolox_client: Any,
    ocr_client: Any,
    ocr_model_name: str,
    trace_info: Dict,
    worker_pool_size: int = 8,  # Not currently used.
) -> List[Tuple[str, Optional[Dict]]]:
    """
    Given a list of base64-encoded chart images, concurrently call both the YOLOX and OCR
    inference services to extract chart data.

    For each base64-encoded image, returns:
      (original_image_str, joined_chart_content_dict)

    Images that do not meet minimum size requirements are marked as skipped, i.e. returned
    as ``(original_image_str, None)``. When no image passes the size filter (including an
    empty input list) the inference services are not called at all.
    """
    logger.debug("Running chart extraction using updated concurrency handling.")

    # Initialize results with placeholders and filter valid images.
    valid_images, valid_arrays, valid_indices, results = _filter_valid_chart_images(base64_images)

    # Nothing to infer on: avoid sending empty payloads to the inference services.
    if not valid_images:
        return results

    # Run concurrent inference only for valid images.
    yolox_results, ocr_results = _run_chart_inference(
        yolox_client=yolox_client,
        ocr_client=ocr_client,
        ocr_model_name=ocr_model_name,
        valid_arrays=valid_arrays,
        valid_images=valid_images,
        trace_info=trace_info,
    )

    # Validate that the returned inference results are lists of the expected length.
    yolox_results, ocr_results = _validate_chart_inference_results(
        yolox_results, ocr_results, valid_arrays, valid_images
    )

    # Merge the inference results into the results list.
    return _merge_chart_results(base64_images, valid_indices, yolox_results, ocr_results, results)
214
+
215
+
216
def _create_yolox_client(
    yolox_endpoints: Tuple[str, str],
    yolox_protocol: str,
    auth_token: str,
) -> NimClient:
    """
    Build an inference client bound to a new ``YoloxGraphicElementsModelInterface``.

    The endpoints, protocol and auth token are passed straight through to
    ``create_inference_client``.
    """
    return create_inference_client(
        endpoints=yolox_endpoints,
        model_interface=YoloxGraphicElementsModelInterface(),
        auth_token=auth_token,
        infer_protocol=yolox_protocol,
    )
231
+
232
+
233
def _create_ocr_client(
    ocr_endpoints: Tuple[str, str],
    ocr_protocol: str,
    ocr_model_name: str,
    auth_token: str,
) -> NimClient:
    """
    Build an OCR inference client whose model interface depends on ``ocr_model_name``.

    The scene-text model names ("scene_text_ensemble", "scene_text_wrapper",
    "scene_text_python") select ``NemoRetrieverOCRModelInterface`` and enable dynamic
    batching; any other name selects ``PaddleOCRModelInterface`` with dynamic batching
    disabled. The dynamic batch memory budget is always passed as 32.
    """
    uses_scene_text = ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}

    if uses_scene_text:
        model_interface = NemoRetrieverOCRModelInterface()
    else:
        model_interface = PaddleOCRModelInterface()

    return create_inference_client(
        endpoints=ocr_endpoints,
        model_interface=model_interface,
        auth_token=auth_token,
        infer_protocol=ocr_protocol,
        enable_dynamic_batching=uses_scene_text,
        dynamic_batch_memory_budget_mb=32,
    )
257
+
258
+
259
def extract_chart_data_from_image_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[IngestTaskChartExtraction, Dict[str, Any]],
    extraction_config: ChartExtractorSchema,
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Extracts chart data from a DataFrame in a bulk fashion rather than row-by-row.

    Rows qualify when their "metadata" has ``content_metadata.type == "structured"``,
    ``content_metadata.subtype == "chart"``, a non-None "table_metadata" and a non-empty
    "content" (the base64 image). For qualifying rows the extracted chart content is written
    to ``metadata["table_metadata"]["table_content"]`` in place.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame containing the content from which chart data is to be extracted.
    task_config : Union[IngestTaskChartExtraction, Dict[str, Any]]
        Task properties and configurations. Not read by this function.
    extraction_config : ChartExtractorSchema
        The validated configuration object for chart extraction; its ``endpoint_config``
        supplies the YOLOX/OCR endpoints, protocols and auth token.
    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging. Defaults to None, in which
        case a new empty dictionary is used.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        The (in-place updated) DataFrame and ``{"trace_info": execution_trace_log}``. The
        same shape is returned on every path, including for an empty input DataFrame.

    Raises
    ------
    Exception
        If any error occurs during the chart data extraction process; it is logged and
        re-raised.
    """
    _ = task_config  # Unused variable

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    if df_extraction_ledger.empty:
        # Same return shape as every other path so callers can always read "trace_info".
        return df_extraction_ledger, {"trace_info": execution_trace_log}

    endpoint_config = extraction_config.endpoint_config

    # Get the grpc endpoint to determine the model if needed
    ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
    ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)

    try:
        # 1) Identify rows that meet criteria in a single pass
        #    - metadata exists
        #    - content_metadata.type == "structured"
        #    - content_metadata.subtype == "chart"
        #    - table_metadata not None
        #    - content (base64 image) not None or ""
        def meets_criteria(row):
            m = row.get("metadata", {})
            if not m:
                return False

            # "content_metadata" may be present but None; treat that as "no match".
            content_md = m.get("content_metadata") or {}
            return (
                content_md.get("type") == "structured"
                and content_md.get("subtype") == "chart"
                and m.get("table_metadata") is not None
                and m.get("content") not in [None, ""]
            )

        mask = df_extraction_ledger.apply(meets_criteria, axis=1)
        valid_indices = df_extraction_ledger[mask].index.tolist()

        # If no rows meet the criteria, just return.
        if not valid_indices:
            return df_extraction_ledger, {"trace_info": execution_trace_log}

        # 2) Extract base64 images + keep track of row -> image mapping.
        #    "content" is guaranteed present by meets_criteria.
        base64_images = [df_extraction_ledger.at[idx, "metadata"]["content"] for idx in valid_indices]

        # 3) Create the inference clients and call the bulk _update_chart_metadata.
        yolox_client = _create_yolox_client(
            endpoint_config.yolox_endpoints,
            endpoint_config.yolox_infer_protocol,
            endpoint_config.auth_token,
        )

        ocr_client = _create_ocr_client(
            endpoint_config.ocr_endpoints,
            endpoint_config.ocr_infer_protocol,
            ocr_model_name,
            endpoint_config.auth_token,
        )

        bulk_results = _update_chart_metadata(
            base64_images=base64_images,
            yolox_client=yolox_client,
            ocr_client=ocr_client,
            ocr_model_name=ocr_model_name,
            worker_pool_size=endpoint_config.workers_per_progress_engine,
            trace_info=execution_trace_log,
        )

        # 4) Write the results back to each row's table_metadata.
        #    bulk_results is ordered like base64_images, which is ordered like valid_indices.
        for row_id, idx in enumerate(valid_indices):
            _, chart_content = bulk_results[row_id]
            df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content"] = chart_content

        return df_extraction_ledger, {"trace_info": execution_trace_log}

    except Exception:
        logger.error("Error occurred while extracting chart data.", exc_info=True)

        raise
@@ -0,0 +1,208 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import base64
7
+ import functools
8
+ import io
9
+ import logging
10
+ from typing import Any, Union, Tuple
11
+ from typing import Dict
12
+ from typing import List
13
+ from typing import Optional
14
+
15
+ import pandas as pd
16
+ from pydantic import BaseModel
17
+
18
+ from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
19
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
20
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@unified_exception_handler
def _decode_and_extract_from_image(
    base64_row: pd.Series,
    task_config: Dict[str, Any],
    validated_extraction_config: ImageConfigSchema,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Decode the base64 image stored in a DataFrame row and run the image extractor on it.

    The row's "content" value is base64-decoded into a BytesIO stream. The task's "params" are
    copied, augmented with the remaining row data ("row_data"), the row's "document_type", the
    image extraction config and (when given) the trace log, and then handed to
    `unstructured_image_extractor` together with the individual extraction flags.

    The caller's `task_config` is never modified: the same dictionary is shared by every row of a
    DataFrame, so popping the extraction flags from it directly would make all rows after the
    first one see every flag as False.

    Parameters
    ----------
    base64_row : pd.Series
        A row holding the base64-encoded image under "content" and the document type under
        "document_type". A "source_id" entry is optional and only used in error messages.
    task_config : Dict[str, Any]
        Task properties. Only the "params" entry is read; it may carry the boolean flags
        "extract_text", "extract_images", "extract_tables", "extract_charts" and
        "extract_infographics" (each defaults to False when absent) plus any extra parameters
        to forward to the extractor.
    validated_extraction_config : ImageConfigSchema
        Either the image extraction config itself or an object exposing it through an
        `image_extraction_config` attribute. May be None.
    execution_trace_log : Optional[List[Any]], optional
        Trace information forwarded to the extractor (default is None).

    Returns
    -------
    Any
        Whatever `unstructured_image_extractor` returns for the decoded image.

    Raises
    ------
    KeyError
        If "document_type" or "content" is missing from `base64_row`.
    Exception
        Any error raised while decoding or extracting is logged and re-raised as the same
        exception type with a contextual message. If that type cannot be constructed from a
        single message, a RuntimeError is raised instead; the original is always chained.
    """

    # Retrieve document type and initialize source_id.
    document_type: Any = base64_row["document_type"]
    source_id: Optional[Any] = None

    try:
        base64_content: str = base64_row["content"]
    except KeyError as e:
        err_msg = f"decode_and_extract: Missing 'content' key in row: {base64_row}"
        logger.error(err_msg, exc_info=True)
        raise KeyError(err_msg) from e

    try:
        # Per-row shallow copy of the task parameters: the flags popped below must remain
        # available in task_config for the rows processed after this one.
        extract_params: Dict[str, Any] = dict(task_config.get("params") or {})

        # Additional row data (everything except "content") travels with the parameters.
        extract_params["row_data"] = base64_row.drop(labels=["content"], errors="ignore")
        extract_params["document_type"] = document_type

        # Retrieve source_id if available; it is only used for error reporting.
        source_id = base64_row.get("source_id", None)

        # Decode the base64 image content.
        image_bytes: bytes = base64.b64decode(base64_content)
        image_stream: io.BytesIO = io.BytesIO(image_bytes)

        # The flags are passed as explicit keyword arguments, so remove them from the
        # parameter dictionary that is forwarded as the extraction config.
        extract_text: bool = extract_params.pop("extract_text", False)
        extract_images: bool = extract_params.pop("extract_images", False)
        extract_tables: bool = extract_params.pop("extract_tables", False)
        extract_charts: bool = extract_params.pop("extract_charts", False)
        extract_infographics: bool = extract_params.pop("extract_infographics", False)

        logger.debug(
            "decode_and_extract: Extracting image content using image_extraction_config: %s",
            validated_extraction_config,
        )

        # Some callers provide the full extractor schema; use its inner image_extraction_config
        # when present, otherwise treat the object itself as the image config.
        if validated_extraction_config is not None:
            inner_cfg = getattr(validated_extraction_config, "image_extraction_config", validated_extraction_config)
            if inner_cfg is not None:
                extract_params["image_extraction_config"] = inner_cfg

        if execution_trace_log is not None:
            extract_params["trace_info"] = execution_trace_log

        extracted_data: Any = unstructured_image_extractor(
            image_stream=image_stream,
            extract_text=extract_text,
            extract_images=extract_images,
            extract_infographics=extract_infographics,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            extraction_config=extract_params,
            extraction_trace_log=execution_trace_log,
        )

        return extracted_data

    except Exception as e:
        err_msg = f"decode_and_extract: Unhandled exception for source '{source_id}'. Original error: {e}"
        logger.error(err_msg, exc_info=True)
        try:
            wrapped: Exception = type(e)(err_msg)
        except Exception:
            # Not every exception class accepts a single message argument; do not let the
            # re-wrapping itself hide the real failure behind an unrelated TypeError.
            wrapped = RuntimeError(err_msg)
        raise wrapped from e
139
+
140
+
141
@unified_exception_handler
def extract_primitives_from_image_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process a DataFrame of base64-encoded images and extract primitives from each image.

    `_decode_and_extract_from_image` is applied to every row of the input DataFrame. List results
    are exploded into separate entries, missing values are dropped, and the remaining entries are
    compiled into a new DataFrame with the columns "document_type", "metadata" and "uuid".

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        Input DataFrame containing image files in base64 encoding. Each row is expected to provide
        "content" and "document_type"; "source_id" is used for error reporting when present.
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model with instructions and parameters for the image processing
        task. A Pydantic model is converted with `model_dump()` before use.
    extraction_config : Any
        A configuration object validated for processing images (e.g., containing
        `image_extraction_config`).
    execution_trace_log : Optional[Dict[str, Any]], default=None
        An optional container for trace information; an empty dictionary is used when omitted.

    Returns
    -------
    Tuple[pd.DataFrame, Dict[str, Any]]
        The DataFrame of extracted primitives (columns "document_type", "metadata", "uuid"; empty
        when nothing was extracted or the input was empty) and a dictionary holding the trace log
        under the key "trace_info".

    Raises
    ------
    Exception
        Any error raised during processing is logged and re-raised as the same exception type with
        a contextual message. If that type cannot be constructed from a single message, a
        RuntimeError is raised instead; the original exception is always chained.
    """
    logger.debug("process_image: Processing image content")
    if execution_trace_log is None:
        execution_trace_log = {}

    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()

    result_columns = ["document_type", "metadata", "uuid"]

    try:
        logger.debug("process_image: Processing with method: %s", task_config.get("method", None))

        # A row-wise apply over an empty frame does not yield a Series of results that could be
        # exploded, so an empty ledger short-circuits to the empty result.
        if df_extraction_ledger.empty:
            empty_df = pd.DataFrame({column: [] for column in result_columns})
            return empty_df, {"trace_info": execution_trace_log}

        # Create a partial function to decode and extract image data for each row.
        _decode_and_extract = functools.partial(
            _decode_and_extract_from_image,
            task_config=task_config,
            validated_extraction_config=extraction_config,
            execution_trace_log=execution_trace_log,
        )
        sr_extraction = df_extraction_ledger.apply(_decode_and_extract, axis=1)

        # One entry per extracted primitive; rows that produced nothing are discarded.
        sr_extraction = sr_extraction.explode().dropna()

        if not sr_extraction.empty:
            extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=result_columns)
        else:
            extracted_df = pd.DataFrame({column: [] for column in result_columns})

        return extracted_df, {"trace_info": execution_trace_log}

    except Exception as e:
        err_msg = f"process_image: Unhandled exception in image extractor stage. Original error: {e}"
        logger.exception(err_msg)
        try:
            wrapped: Exception = type(e)(err_msg)
        except Exception:
            # Not every exception class accepts a single message argument; do not let the
            # re-wrapping itself hide the real failure behind an unrelated TypeError.
            wrapped = RuntimeError(err_msg)
        raise wrapped from e
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0