nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
nv_ingest_api/interface/extract.py
@@ -0,0 +1,972 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ from typing import Tuple, Optional, Dict, Any
+
+ import pandas as pd
+ from pandas import DataFrame
+
+ from . import extraction_interface_relay_constructor
+
+ from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+ from nv_ingest_api.internal.extract.docx.docx_extractor import extract_primitives_from_docx_internal
+ from nv_ingest_api.internal.extract.pptx.pptx_extractor import extract_primitives_from_pptx_internal
+ from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
+ from nv_ingest_api.internal.extract.image.image_extractor import extract_primitives_from_image_internal
+ from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
+ from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
+ from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import (
+     InfographicExtractorConfigSchema,
+     InfographicExtractorSchema,
+ )
+ from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
+ from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import (
+     IngestTaskChartExtraction,
+     IngestTaskTableExtraction,
+ )
+ from nv_ingest_api.internal.extract.audio.audio_extraction import extract_text_from_audio_internal
+ from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
+
+ logger = logging.getLogger(__name__)
+
+
+ @unified_exception_handler
+ @extraction_interface_relay_constructor(
+     api_fn=extract_primitives_from_pdf_internal,
+     task_keys=["extract_text", "extract_images", "extract_tables", "extract_charts", "extract_infographics"],
+ )
+ def extract_primitives_from_pdf(
+     *,
+     df_extraction_ledger: pd.DataFrame,  # Ledger (e.g., a pandas DataFrame)
+     extract_method: str = "pdfium",  # Determines which extraction schema to use
+     extract_text: bool = True,
+     extract_images: bool = True,
+     extract_infographics: bool = True,
+     extract_tables: bool = True,
+     extract_charts: bool = True,
+     text_depth: str = "page",
+     # Adobe-specific parameters:
+     adobe_client_id: Optional[str] = None,
+     adobe_client_secret: Optional[str] = None,
+     # Llama-specific parameters:
+     llama_api_key: Optional[str] = None,
+     # PDFium-specific parameters:
+     yolox_auth_token: Optional[str] = None,
+     yolox_endpoints: Optional[Tuple[Optional[str], Optional[str]]] = None,
+     yolox_infer_protocol: str = "http",
+     # NemoRetriever Parse parameters:
+     nemoretriever_parse_endpoints: Optional[Tuple[str, str]] = None,
+     nemoretriever_parse_protocol: str = "http",
+     nemoretriever_parse_model_name: Optional[str] = None,
+     # UnstructuredIO parameters:
+     unstructured_io_api_key: Optional[str] = None,
+     # Tika-specific parameter:
+     tika_server_url: Optional[str] = None,
+ ):
+     """
+     Extract text, images, tables, charts, and infographics from PDF documents.
+
+     This function serves as a unified interface for PDF primitive extraction, supporting multiple
+     extraction engines (pdfium, adobe, llama, nemoretriever_parse, unstructured_io, and tika).
+     It processes a DataFrame containing base64-encoded PDF data and returns a new DataFrame
+     with structured information about the extracted elements.
+
+     The function uses a decorator pattern to dynamically validate configuration parameters
+     and invoke the appropriate extraction pipeline. This design allows for flexible
+     engine-specific configuration while maintaining a consistent interface.
+
+     Parameters
+     ----------
+     df_extraction_ledger : pd.DataFrame
+         DataFrame containing PDF documents to process. Must include the following columns:
+         - "content" : str
+             Base64-encoded PDF data
+         - "source_id" : str
+             Unique identifier for the document
+         - "source_name" : str
+             Name of the document (filename or descriptive name)
+         - "document_type" : str or enum
+             Document type identifier (should be "pdf" or a related enum value)
+         - "metadata" : Dict[str, Any]
+             Dictionary containing additional metadata about the document
+
+     extract_method : str, default "pdfium"
+         The extraction engine to use. Valid options:
+         - "pdfium" : PDFium-based extraction (default)
+         - "adobe" : Adobe PDF Services API
+         - "llama" : LlamaParse extraction
+         - "nemoretriever_parse" : NVIDIA NemoRetriever Parse
+         - "unstructured_io" : Unstructured.io extraction
+         - "tika" : Apache Tika extraction
+
+     extract_text : bool, default True
+         Whether to extract text content from the PDFs.
+
+     extract_images : bool, default True
+         Whether to extract embedded images from the PDFs.
+
+     extract_infographics : bool, default True
+         Whether to extract infographics from the PDFs.
+
+     extract_tables : bool, default True
+         Whether to extract tables from the PDFs.
+
+     extract_charts : bool, default True
+         Whether to extract charts and graphs from the PDFs.
+
+     text_depth : str, default "page"
+         Level of text granularity to extract. Options:
+         - "page" : Text extracted at page level
+         - "block" : Text extracted at block level
+         - "paragraph" : Text extracted at paragraph level
+         - "line" : Text extracted at line level
+
+     adobe_client_id : str, optional
+         Client ID for Adobe PDF Services API. Required when extract_method="adobe".
+
+     adobe_client_secret : str, optional
+         Client secret for Adobe PDF Services API. Required when extract_method="adobe".
+
+     llama_api_key : str, optional
+         API key for LlamaParse service. Required when extract_method="llama".
+
+     yolox_auth_token : str, optional
+         Authentication token for YOLOX inference services.
+
+     yolox_endpoints : tuple of (str, str), optional
+         A tuple containing (gRPC endpoint, HTTP endpoint) for YOLOX services.
+         At least one endpoint must be non-empty.
+
+     yolox_infer_protocol : str, default "http"
+         Protocol to use for YOLOX inference. Options: "http" or "grpc".
+
+     nemoretriever_parse_endpoints : tuple of (str, str), optional
+         A tuple containing (gRPC endpoint, HTTP endpoint) for NemoRetriever Parse.
+         Required when extract_method="nemoretriever_parse".
+
+     nemoretriever_parse_protocol : str, default "http"
+         Protocol to use for NemoRetriever Parse. Options: "http" or "grpc".
+
+     nemoretriever_parse_model_name : str, optional
+         Model name for NemoRetriever Parse. Default is "nvidia/nemoretriever-parse".
+
+     unstructured_io_api_key : str, optional
+         API key for Unstructured.io services. Required when extract_method="unstructured_io".
+
+     tika_server_url : str, optional
+         URL for Apache Tika server. Required when extract_method="tika".
+
+     Returns
+     -------
+     pandas.DataFrame
+         A DataFrame containing the extracted primitives with the following columns:
+         - "document_type" : Type of the extracted element (e.g., "text", "image", "table")
+         - "metadata" : Dictionary containing detailed information about the extracted element
+         - "uuid" : Unique identifier for the extracted element
+
+     Raises
+     ------
+     ValueError
+         If an unsupported extraction method is specified.
+         If required parameters for the specified extraction method are missing.
+         If the input DataFrame does not have the required structure.
+
+     KeyError
+         If required columns are missing from the input DataFrame.
+
+     RuntimeError
+         If extraction fails due to processing errors.
+
+     Notes
+     -----
+     The function uses a decorator pattern through `extraction_interface_relay_constructor`,
+     which dynamically processes the parameters and validates them against the appropriate
+     configuration schema. The actual extraction work is delegated to the
+     `extract_primitives_from_pdf_internal` function.
+
+     For each extraction method, specific parameters are required:
+     - pdfium: yolox_endpoints
+     - adobe: adobe_client_id, adobe_client_secret
+     - llama: llama_api_key
+     - nemoretriever_parse: nemoretriever_parse_endpoints
+     - unstructured_io: unstructured_io_api_key
+     - tika: tika_server_url
+
+     Examples
+     --------
+     >>> import pandas as pd
+     >>> import base64
+     >>>
+     >>> # Read a PDF file and encode it as base64
+     >>> with open("document.pdf", "rb") as f:
+     ...     pdf_content = base64.b64encode(f.read()).decode("utf-8")
+     >>>
+     >>> # Create a DataFrame with the PDF content
+     >>> df = pd.DataFrame({
+     ...     "source_id": ["doc1"],
+     ...     "source_name": ["document.pdf"],
+     ...     "content": [pdf_content],
+     ...     "document_type": ["pdf"],
+     ...     "metadata": [{"content_metadata": {"type": "document"}}]
+     ... })
+     >>>
+     >>> # Extract primitives using PDFium
+     >>> result_df = extract_primitives_from_pdf(
+     ...     df_extraction_ledger=df,
+     ...     extract_method="pdfium",
+     ...     yolox_endpoints=(None, "http://localhost:8000/v1/infer")
+     ... )
+     >>>
+     >>> # Display the types of extracted elements
+     >>> print(result_df["document_type"].value_counts())
+     """
+     pass
+
+
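As a hedged usage sketch of the same unified interface with a non-default engine, the snippet below assumes the public import path nv_ingest_api.interface.extract, a local document.pdf, and an Apache Tika server at a placeholder URL; the ledger column values are illustrative.

import base64
import pandas as pd
from nv_ingest_api.interface.extract import extract_primitives_from_pdf

with open("document.pdf", "rb") as f:  # hypothetical input file
    pdf_b64 = base64.b64encode(f.read()).decode("utf-8")

ledger = pd.DataFrame(
    {
        "source_id": ["doc1"],
        "source_name": ["document.pdf"],
        "content": [pdf_b64],
        "document_type": ["pdf"],
        "metadata": [{"content_metadata": {"type": "document"}}],
    }
)

# Tika only needs a server URL; the address below is a placeholder.
result_df = extract_primitives_from_pdf(
    df_extraction_ledger=ledger,
    extract_method="tika",
    tika_server_url="http://localhost:9998/tika",
)
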
+ def extract_primitives_from_pdf_pdfium(
+     df_extraction_ledger: pd.DataFrame,
+     *,
+     extract_text: bool = True,
+     extract_images: bool = True,
+     extract_tables: bool = True,
+     extract_charts: bool = True,
+     extract_infographics: bool = True,
+     text_depth: str = "page",
+     yolox_auth_token: Optional[str] = None,
+     yolox_endpoints: Optional[Tuple[Optional[str], Optional[str]]] = None,
+     yolox_infer_protocol: str = "http",
+ ) -> pd.DataFrame:
+     """
+     Extract primitives from PDF documents using the PDFium extraction method.
+
+     A simplified wrapper around the general extract_primitives_from_pdf function
+     that defaults to using the PDFium extraction engine.
+
+     Parameters
+     ----------
+     df_extraction_ledger : pd.DataFrame
+         DataFrame containing PDF documents to process. Must include the following columns:
+         - "content" : str
+             Base64-encoded PDF data
+         - "source_id" : str
+             Unique identifier for the document
+         - "source_name" : str
+             Name of the document (filename or descriptive name)
+         - "document_type" : str or enum
+             Document type identifier (should be "pdf" or related enum value)
+         - "metadata" : Dict[str, Any]
+             Dictionary containing additional metadata about the document
+     extract_text : bool, default True
+         Whether to extract text content
+     extract_images : bool, default True
+         Whether to extract embedded images
+     extract_tables : bool, default True
+         Whether to extract tables
+     extract_charts : bool, default True
+         Whether to extract charts
+     extract_infographics : bool, default True
+         Whether to extract infographics
+     text_depth : str, default "page"
+         Level of text granularity (page, block, paragraph, line)
+     yolox_auth_token : str, optional
+         Authentication token for YOLOX inference services
+     yolox_endpoints : tuple of (str, str), optional
+         Tuple containing (gRPC endpoint, HTTP endpoint) for YOLOX services
+     yolox_infer_protocol : str, default "http"
+         Protocol to use for YOLOX inference ("http" or "grpc")
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame containing the extracted primitives
+     """
+     return extract_primitives_from_pdf(
+         df_extraction_ledger=df_extraction_ledger,
+         extract_method="pdfium",
+         extract_text=extract_text,
+         extract_images=extract_images,
+         extract_tables=extract_tables,
+         extract_charts=extract_charts,
+         extract_infographics=extract_infographics,
+         text_depth=text_depth,
+         yolox_auth_token=yolox_auth_token,
+         yolox_endpoints=yolox_endpoints,
+         yolox_infer_protocol=yolox_infer_protocol,
+     )
+
+
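A minimal sketch for the PDFium wrapper, assuming the `ledger` DataFrame built in the earlier sketch and a YOLOX page-element service at a placeholder HTTP endpoint:

from nv_ingest_api.interface.extract import extract_primitives_from_pdf_pdfium

# `ledger` is assumed to be shaped as documented above; the endpoint is a placeholder.
result_df = extract_primitives_from_pdf_pdfium(
    ledger,
    extract_tables=True,
    extract_charts=True,
    yolox_endpoints=(None, "http://localhost:8000/v1/infer"),
    yolox_infer_protocol="http",
)
print(result_df["document_type"].value_counts())
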
+ def extract_primitives_from_pdf_nemoretriever_parse(
+     df_extraction_ledger: pd.DataFrame,
+     *,
+     extract_text: bool = True,
+     extract_images: bool = True,
+     extract_tables: bool = True,
+     extract_charts: bool = True,
+     extract_infographics: bool = True,
+     text_depth: str = "page",
+     yolox_auth_token: Optional[str] = None,
+     yolox_endpoints: Optional[Tuple[Optional[str], Optional[str]]] = None,
+     yolox_infer_protocol: str = "http",
+     nemoretriever_parse_endpoints: Optional[Tuple[str, str]] = None,
+     nemoretriever_parse_protocol: str = "http",
+     nemoretriever_parse_model_name: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """
+     Extract primitives from PDF documents using the NemoRetriever Parse extraction method.
+
+     This function serves as a specialized wrapper around the general extract_primitives_from_pdf
+     function, pre-configured to use NemoRetriever Parse as the extraction engine. It processes
+     PDF documents to extract various content types including text, images, tables, charts, and
+     infographics, returning the results in a structured DataFrame.
+
+     Parameters
+     ----------
+     df_extraction_ledger : pd.DataFrame
+         DataFrame containing PDF documents to process. Must include the following columns:
+         - "content" : str
+             Base64-encoded PDF data
+         - "source_id" : str
+             Unique identifier for the document
+         - "source_name" : str
+             Name of the document (filename or descriptive name)
+         - "document_type" : str or enum
+             Document type identifier (should be "pdf" or related enum value)
+         - "metadata" : Dict[str, Any]
+             Dictionary containing additional metadata about the document
+
+     extract_text : bool, default True
+         Whether to extract text content from the PDFs. When True, the function will
+         attempt to extract and structure all textual content according to the
+         granularity specified by `text_depth`.
+
+     extract_images : bool, default True
+         Whether to extract embedded images from the PDFs. When True, the function
+         will identify, extract, and process images embedded within the document.
+
+     extract_tables : bool, default True
+         Whether to extract tables from the PDFs. When True, the function will
+         detect tabular structures and convert them into structured data.
+
+     extract_charts : bool, default True
+         Whether to extract charts and graphs from the PDFs. When True, the function
+         will detect and extract visual data representations.
+
+     extract_infographics : bool, default True
+         Whether to extract infographics from the PDFs. When True, the function will
+         identify and extract complex visual information displays.
+
+     text_depth : str, default "page"
+         Level of text granularity to extract. Options:
+         - "page" : Text extracted at page level (coarsest granularity)
+         - "block" : Text extracted at block level (groups of paragraphs)
+         - "paragraph" : Text extracted at paragraph level (semantic units)
+         - "line" : Text extracted at line level (finest granularity)
+
+     yolox_auth_token : Optional[str], default None
+         Authentication token for YOLOX inference services used for image processing.
+         Required if the YOLOX services need authentication.
+
+     yolox_endpoints : Optional[Tuple[Optional[str], Optional[str]]], default None
+         A tuple containing (gRPC endpoint, HTTP endpoint) for YOLOX services.
+         Used for image processing capabilities within the extraction pipeline.
+         Format: (grpc_endpoint, http_endpoint)
+         Example: (None, "http://localhost:8000/v1/infer")
+
+     yolox_infer_protocol : str, default "http"
+         Protocol to use for YOLOX inference. Options:
+         - "http" : Use HTTP protocol for YOLOX inference services
+         - "grpc" : Use gRPC protocol for YOLOX inference services
+
+     nemoretriever_parse_endpoints : Optional[Tuple[str, str]], default None
+         A tuple containing (gRPC endpoint, HTTP endpoint) for NemoRetriever Parse.
+         Format: (grpc_endpoint, http_endpoint)
+         Example: (None, "http://localhost:8015/v1/chat/completions")
+         Required for this extraction method.
+
+     nemoretriever_parse_protocol : str, default "http"
+         Protocol to use for NemoRetriever Parse. Options:
+         - "http" : Use HTTP protocol for NemoRetriever Parse services
+         - "grpc" : Use gRPC protocol for NemoRetriever Parse services
+
+     nemoretriever_parse_model_name : Optional[str], default None
+         Model name for NemoRetriever Parse.
+         Default is typically "nvidia/nemoretriever-parse" if None is provided.
+
+     Returns
+     -------
+     pd.DataFrame
+         A DataFrame containing the extracted primitives with the following columns:
+         - "document_type" : str
+             Type of the extracted element (e.g., "text", "image", "structured")
+         - "metadata" : Dict[str, Any]
+             Dictionary containing detailed information about the extracted element
+             including position, content, confidence scores, etc.
+         - "uuid" : str
+             Unique identifier for the extracted element
+
+     Raises
+     ------
+     ValueError
+         If `nemoretriever_parse_endpoints` is None or empty.
+         If the input DataFrame does not have the required structure.
+
+     KeyError
+         If required columns are missing from the input DataFrame.
+
+     RuntimeError
+         If extraction fails due to service unavailability or processing errors.
+
+     Examples
+     --------
+     >>> import pandas as pd
+     >>> import base64
+     >>>
+     >>> # Read a PDF file and encode it as base64
+     >>> with open("document.pdf", "rb") as f:
+     ...     pdf_content = base64.b64encode(f.read()).decode("utf-8")
+     >>>
+     >>> # Create a DataFrame with the PDF content
+     >>> df = pd.DataFrame({
+     ...     "source_id": ["doc1"],
+     ...     "source_name": ["document.pdf"],
+     ...     "content": [pdf_content],
+     ...     "document_type": ["pdf"],
+     ...     "metadata": [{"content_metadata": {"type": "document"}}]
+     ... })
+     >>>
+     >>> # Extract primitives using NemoRetriever Parse
+     >>> result_df = extract_primitives_from_pdf_nemoretriever_parse(
+     ...     df_extraction_ledger=df,
+     ...     nemoretriever_parse_endpoints=(None, "http://localhost:8015/v1/chat/completions")
+     ... )
+     >>>
+     >>> # Display the types of extracted elements
+     >>> print(result_df["document_type"].value_counts())
+
+     Notes
+     -----
+     - NemoRetriever Parse excels at extracting structured data like tables from PDFs
+     - For optimal results, ensure both NemoRetriever Parse and YOLOX services are
+       properly configured and accessible
+     - The extraction quality may vary depending on the complexity and quality of the input PDF
+     - This function wraps the more general `extract_primitives_from_pdf` function with
+       pre-configured parameters for NemoRetriever Parse extraction
+     """
+     return extract_primitives_from_pdf(
+         df_extraction_ledger=df_extraction_ledger,
+         extract_method="nemoretriever_parse",
+         extract_text=extract_text,
+         extract_images=extract_images,
+         extract_tables=extract_tables,
+         extract_charts=extract_charts,
+         extract_infographics=extract_infographics,
+         text_depth=text_depth,
+         yolox_endpoints=yolox_endpoints,
+         yolox_auth_token=yolox_auth_token,
+         yolox_infer_protocol=yolox_infer_protocol,
+         nemoretriever_parse_endpoints=nemoretriever_parse_endpoints,
+         nemoretriever_parse_protocol=nemoretriever_parse_protocol,
+         nemoretriever_parse_model_name=nemoretriever_parse_model_name,
+     )
+
+
+ @unified_exception_handler
+ def extract_primitives_from_audio(
+     *,
+     df_ledger: pd.DataFrame,
+     audio_endpoints: Tuple[str, str],
+     audio_infer_protocol: str = "grpc",
+     auth_token: Optional[str] = None,
+     use_ssl: bool = False,
+     ssl_cert: Optional[str] = None,
+ ) -> Any:
+     """
+     Extract audio primitives from a ledger DataFrame using the specified audio configuration.
+
+     This function builds an extraction configuration based on the provided audio endpoints,
+     inference protocol, authentication token, and SSL settings. It then delegates the extraction
+     work to the internal function ``extract_text_from_audio_internal`` using the constructed
+     configuration and ledger DataFrame.
+
+     Parameters
+     ----------
+     df_ledger : pandas.DataFrame
+         A DataFrame containing the ledger information required for audio extraction.
+     audio_endpoints : Tuple[str, str]
+         A tuple of two strings giving the (gRPC, HTTP) endpoints of the audio service.
+     audio_infer_protocol : str, optional
+         The protocol to use for audio inference (e.g., "grpc"). Default is "grpc".
+     auth_token : str, optional
+         Authentication token for the audio inference service. Default is None.
+     use_ssl : bool, optional
+         Flag indicating whether to use SSL for secure connections. Default is False.
+     ssl_cert : str, optional
+         Path to the SSL certificate file to use if ``use_ssl`` is True. Default is None.
+
+     Returns
+     -------
+     Any
+         The result of the audio extraction as returned by
+         ``extract_text_from_audio_internal``. The specific type depends on the internal implementation.
+
+     Raises
+     ------
+     Exception
+         Any exceptions raised during the extraction process will be handled by the
+         ``@unified_exception_handler`` decorator.
+
+     Examples
+     --------
+     >>> import pandas as pd
+     >>> # Create a sample DataFrame with ledger data
+     >>> df = pd.DataFrame({"audio_data": ["file1.wav", "file2.wav"]})
+     >>> result = extract_primitives_from_audio(
+     ...     df_ledger=df,
+     ...     audio_endpoints=("http://primary.endpoint", "http://secondary.endpoint"),
+     ...     audio_infer_protocol="grpc",
+     ...     auth_token="secret-token",
+     ...     use_ssl=True,
+     ...     ssl_cert="/path/to/cert.pem"
+     ... )
+     """
+     task_config: Dict[str, Any] = {"params": {"extract_audio_params": {}}}
+
+     extraction_config = AudioExtractorSchema(
+         **{
+             "audio_extraction_config": {
+                 "audio_endpoints": audio_endpoints,
+                 "audio_infer_protocol": audio_infer_protocol,
+                 "auth_token": auth_token,
+                 "ssl_cert": ssl_cert,
+                 "use_ssl": use_ssl,
+             }
+         }
+     )
+
+     result, _ = extract_text_from_audio_internal(
+         df_extraction_ledger=df_ledger,
+         task_config=task_config,
+         extraction_config=extraction_config,
+         execution_trace_log=None,
+     )
+
+     return result
+
+
+ @unified_exception_handler
+ def extract_primitives_from_pptx(
+     *,
+     df_ledger: pd.DataFrame,
+     extract_text: bool = True,
+     extract_images: bool = True,
+     extract_tables: bool = True,
+     extract_charts: bool = True,
+     extract_infographics: bool = True,
+     yolox_endpoints: Optional[Tuple[str, str]] = None,
+     yolox_infer_protocol: str = "grpc",
+     auth_token: str = "",
+ ) -> pd.DataFrame:
+     """
+     Extract primitives from PPTX files provided in a DataFrame.
+
+     This function configures the PPTX extraction task by assembling a task configuration
+     dictionary using the provided parameters. It then creates an extraction configuration
+     object (an instance of PPTXExtractorSchema) and delegates the actual extraction
+     process to the internal function `extract_primitives_from_pptx_internal`.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         A DataFrame containing base64-encoded PPTX files. The DataFrame is expected to include
+         columns such as "content" (with the base64-encoded PPTX) and "source_id".
+     extract_text : bool, default=True
+         Flag indicating whether text should be extracted from the PPTX files.
+     extract_images : bool, default=True
+         Flag indicating whether images should be extracted.
+     extract_tables : bool, default=True
+         Flag indicating whether tables should be extracted.
+     extract_charts : bool, default=True
+         Flag indicating whether charts should be extracted.
+     extract_infographics : bool, default=True
+         Flag indicating whether infographics should be extracted.
+     yolox_endpoints : Optional[Tuple[str, str]], default=None
+         Optional tuple containing endpoints for YOLOX inference, if needed for image analysis.
+     yolox_infer_protocol : str, default="grpc"
+         The protocol to use for YOLOX inference.
+     auth_token : str, default=""
+         Authentication token to be used with the PPTX extraction configuration.
+
+     Returns
+     -------
+     pd.DataFrame
+         A DataFrame containing the extracted primitives from the PPTX files. Expected columns include
+         "document_type", "metadata", and "uuid".
+
+     Notes
+     -----
+     This function is decorated with `@unified_exception_handler` to handle exceptions uniformly.
+     The task configuration is assembled with two main keys:
+     - "params": Contains boolean flags for controlling which primitives to extract.
+     - "pptx_extraction_config": Contains additional settings for PPTX extraction (e.g., YOLOX endpoints,
+       inference protocol, and auth token).
+     It then calls `extract_primitives_from_pptx_internal` with the DataFrame, the task configuration,
+     and the extraction configuration.
+     """
+     task_config: Dict[str, Any] = {
+         "params": {
+             "extract_text": extract_text,
+             "extract_images": extract_images,
+             "extract_tables": extract_tables,
+             "extract_charts": extract_charts,
+             "extract_infographics": extract_infographics,
+         },
+     }
+
+     extraction_config = PPTXExtractorSchema(
+         **{
+             "pptx_extraction_config": {
+                 "yolox_endpoints": yolox_endpoints,
+                 "yolox_infer_protocol": yolox_infer_protocol,
+                 "auth_token": auth_token,
+             },
+         }
+     )
+
+     return extract_primitives_from_pptx_internal(
+         df_extraction_ledger=df_ledger,
+         task_config=task_config,
+         extraction_config=extraction_config,
+         execution_trace_log=None,
+     )
+
+
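A minimal usage sketch for the PPTX interface, assuming a local slides.pptx, a YOLOX service at a placeholder endpoint, and illustrative ledger column values:

import base64
import pandas as pd
from nv_ingest_api.interface.extract import extract_primitives_from_pptx

with open("slides.pptx", "rb") as f:  # hypothetical input deck
    pptx_b64 = base64.b64encode(f.read()).decode("utf-8")

ledger = pd.DataFrame(
    {
        "source_id": ["deck1"],
        "source_name": ["slides.pptx"],
        "content": [pptx_b64],
        "document_type": ["pptx"],
        "metadata": [{"content_metadata": {"type": "document"}}],
    }
)

# The endpoint below is a placeholder for a YOLOX page-element service.
result_df = extract_primitives_from_pptx(
    df_ledger=ledger,
    yolox_endpoints=(None, "http://localhost:8000/v1/infer"),
    yolox_infer_protocol="http",
)
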
+ @unified_exception_handler
+ def extract_primitives_from_docx(
+     *,
+     df_ledger: pd.DataFrame,
+     extract_text: bool = True,
+     extract_images: bool = True,
+     extract_tables: bool = True,
+     extract_charts: bool = True,
+     extract_infographics: bool = True,
+     yolox_endpoints: Optional[Tuple[str, str]] = None,
+     yolox_infer_protocol: str = "grpc",
+     auth_token: str = "",
+ ) -> pd.DataFrame:
+     """
+     Extract primitives from DOCX documents in a DataFrame.
+
+     This function configures and invokes the DOCX extraction process. It builds a task configuration
+     using the provided extraction flags (for text, images, tables, charts, and infographics) and additional
+     settings for YOLOX endpoints, inference protocol, and authentication. It then creates a DOCX extraction
+     configuration (an instance of DocxExtractorSchema) and delegates the extraction to an internal function.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         The input DataFrame containing DOCX documents in base64 encoding. The DataFrame is expected to
+         include required columns such as "content" (with the base64-encoded DOCX) and optionally "source_id".
+     extract_text : bool, optional
+         Flag indicating whether to extract text content from the DOCX documents (default is True).
+     extract_images : bool, optional
+         Flag indicating whether to extract images from the DOCX documents (default is True).
+     extract_tables : bool, optional
+         Flag indicating whether to extract tables from the DOCX documents (default is True).
+     extract_charts : bool, optional
+         Flag indicating whether to extract charts from the DOCX documents (default is True).
+     extract_infographics : bool, optional
+         Flag indicating whether to extract infographics from the DOCX documents (default is True).
+     yolox_endpoints : Optional[Tuple[str, str]], optional
+         A tuple containing YOLOX inference endpoints. If None, the default endpoints defined in the
+         DOCX extraction configuration will be used.
+     yolox_infer_protocol : str, optional
+         The inference protocol to use with the YOLOX endpoints (default is "grpc").
+     auth_token : str, optional
+         The authentication token for accessing the YOLOX inference service (default is an empty string).
+
+     Returns
+     -------
+     pd.DataFrame
+         A DataFrame containing the extracted DOCX primitives. Typically, the resulting DataFrame contains
+         columns such as "document_type", "metadata", and "uuid".
+
+     Raises
+     ------
+     Exception
+         If an error occurs during the DOCX extraction process, the exception is logged and re-raised.
+     """
+     # Build the task configuration with parameters and DOCX-specific extraction settings.
+     task_config: Dict[str, Any] = {
+         "params": {
+             "extract_text": extract_text,
+             "extract_images": extract_images,
+             "extract_tables": extract_tables,
+             "extract_charts": extract_charts,
+             "extract_infographics": extract_infographics,
+         },
+     }
+
+     # Create the extraction configuration object (instance of DocxExtractorSchema).
+     extraction_config = DocxExtractorSchema(
+         **{
+             "docx_extraction_config": {
+                 "yolox_endpoints": yolox_endpoints,
+                 "yolox_infer_protocol": yolox_infer_protocol,
+                 "auth_token": auth_token,
+             },
+         }
+     )
+
+     # Delegate the actual extraction to the internal function.
+     return extract_primitives_from_docx_internal(
+         df_extraction_ledger=df_ledger,
+         task_config=task_config,
+         extraction_config=extraction_config,
+         execution_trace_log=None,
+     )
+
+
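The DOCX interface mirrors the PPTX one; a compact sketch under the same assumptions (placeholder endpoint, illustrative column values, hypothetical report.docx):

import base64
import pandas as pd
from nv_ingest_api.interface.extract import extract_primitives_from_docx

with open("report.docx", "rb") as f:  # hypothetical input document
    docx_b64 = base64.b64encode(f.read()).decode("utf-8")

ledger = pd.DataFrame(
    {
        "source_id": ["doc1"],
        "source_name": ["report.docx"],
        "content": [docx_b64],
        "document_type": ["docx"],
        "metadata": [{"content_metadata": {"type": "document"}}],
    }
)

result_df = extract_primitives_from_docx(
    df_ledger=ledger,
    yolox_endpoints=(None, "http://localhost:8000/v1/infer"),  # placeholder endpoint
    yolox_infer_protocol="http",
)
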
+ @unified_exception_handler
+ def extract_primitives_from_image(
+     *,
+     df_ledger: pd.DataFrame,
+     extract_text: bool = True,
+     extract_images: bool = True,
+     extract_tables: bool = True,
+     extract_charts: bool = True,
+     extract_infographics: bool = True,
+     yolox_endpoints: Optional[Tuple[str, str]] = None,
+     yolox_infer_protocol: str = "grpc",
+     auth_token: str = "",
+ ) -> pd.DataFrame:
+     """
+     Extract primitives from image files provided in a DataFrame.
+
+     Mirrors the PPTX and DOCX interfaces: the boolean flags are assembled into a task
+     configuration, an ImageExtractorSchema is built from the YOLOX endpoint settings, and the
+     work is delegated to `extract_primitives_from_image_internal`.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame containing base64-encoded images and associated metadata.
+     extract_text, extract_images, extract_tables, extract_charts, extract_infographics : bool, default=True
+         Flags controlling which primitive types are extracted.
+     yolox_endpoints : Optional[Tuple[str, str]], default=None
+         Tuple containing (gRPC endpoint, HTTP endpoint) for YOLOX inference services.
+     yolox_infer_protocol : str, default="grpc"
+         The protocol to use for YOLOX inference.
+     auth_token : str, default=""
+         Authentication token for the inference services.
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame containing the extracted primitives.
+     """
+     task_config: Dict[str, Any] = {
+         "params": {
+             "extract_text": extract_text,
+             "extract_images": extract_images,
+             "extract_tables": extract_tables,
+             "extract_charts": extract_charts,
+             "extract_infographics": extract_infographics,
+         },
+     }
+
+     extraction_config = ImageExtractorSchema(
+         **{
+             "image_extraction_config": {
+                 "yolox_endpoints": yolox_endpoints,
+                 "yolox_infer_protocol": yolox_infer_protocol,
+                 "auth_token": auth_token,
+             },
+         }
+     )
+
+     result, _ = extract_primitives_from_image_internal(
+         df_extraction_ledger=df_ledger,
+         task_config=task_config,
+         extraction_config=extraction_config,
+         execution_trace_log=None,
+     )
+
+     return result
+
+
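A minimal sketch for the image interface, assuming a local page.png, a placeholder YOLOX endpoint, and illustrative ledger column values:

import base64
import pandas as pd
from nv_ingest_api.interface.extract import extract_primitives_from_image

with open("page.png", "rb") as f:  # hypothetical input image
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

ledger = pd.DataFrame(
    {
        "source_id": ["img1"],
        "source_name": ["page.png"],
        "content": [image_b64],
        "document_type": ["png"],
        "metadata": [{"content_metadata": {"type": "image"}}],
    }
)

result_df = extract_primitives_from_image(
    df_ledger=ledger,
    yolox_endpoints=(None, "http://localhost:8000/v1/infer"),  # placeholder endpoint
    yolox_infer_protocol="http",
)
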
+ @unified_exception_handler
+ def extract_chart_data_from_image(
+     *,
+     df_ledger: pd.DataFrame,
+     yolox_endpoints: Tuple[str, str],
+     paddle_endpoints: Tuple[str, str],
+     yolox_protocol: str = "grpc",
+     paddle_protocol: str = "grpc",
+     auth_token: str = "",
+ ) -> DataFrame:
+     """
+     Public interface to extract chart data from a ledger DataFrame.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame containing metadata required for chart extraction.
+     yolox_endpoints : Tuple[str, str]
+         YOLOX inference server endpoints.
+     paddle_endpoints : Tuple[str, str]
+         PaddleOCR inference server endpoints.
+     yolox_protocol : str, optional
+         Protocol for YOLOX inference (default "grpc").
+     paddle_protocol : str, optional
+         Protocol for PaddleOCR inference (default "grpc").
+     auth_token : str, optional
+         Authentication token for inference services.
+
+     Returns
+     -------
+     pd.DataFrame
+         Updated DataFrame after chart extraction.
+
+     Raises
+     ------
+     Exception
+         If an error occurs during extraction.
+     """
+     task_config = IngestTaskChartExtraction()
+     extraction_config = ChartExtractorSchema(
+         **{
+             "endpoint_config": {
+                 "yolox_endpoints": yolox_endpoints,
+                 "paddle_endpoints": paddle_endpoints,
+                 "yolox_infer_protocol": yolox_protocol,
+                 "paddle_infer_protocol": paddle_protocol,
+                 "auth_token": auth_token,
+             }
+         }
+     )
+
+     result, _ = extract_chart_data_from_image_internal(
+         df_extraction_ledger=df_ledger,
+         task_config=task_config,
+         extraction_config=extraction_config,
+         execution_trace_log=None,
+     )
+
+     return result
+
+
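A hedged sketch of the chart interface, assuming `ledger` already holds chart elements produced by an earlier primitive-extraction step (for example, the output of extract_primitives_from_pdf) and that the YOLOX and PaddleOCR endpoints below are placeholders:

from nv_ingest_api.interface.extract import extract_chart_data_from_image

# `ledger` is assumed to come from a prior extraction step; endpoints are placeholders.
ledger_with_charts = extract_chart_data_from_image(
    df_ledger=ledger,
    yolox_endpoints=("localhost:8001", "http://localhost:8000/v1/infer"),
    paddle_endpoints=("localhost:8011", "http://localhost:8010/v1/infer"),
    yolox_protocol="http",
    paddle_protocol="http",
)
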
+ @unified_exception_handler
+ def extract_table_data_from_image(
+     *,
+     df_ledger: pd.DataFrame,
+     yolox_endpoints: Optional[Tuple[str, str]] = None,
+     paddle_endpoints: Optional[Tuple[str, str]] = None,
+     yolox_protocol: Optional[str] = None,
+     paddle_protocol: Optional[str] = None,
+     auth_token: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """
+     Public interface to extract table data from a ledger DataFrame.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame containing metadata required for table extraction.
+     yolox_endpoints : Optional[Tuple[str, str]], default=None
+         YOLOX inference server endpoints. If None, the default from the table extractor
+         endpoint configuration is used.
+     paddle_endpoints : Optional[Tuple[str, str]], default=None
+         PaddleOCR inference server endpoints. If None, the default from the table extractor
+         endpoint configuration is used.
+     yolox_protocol : Optional[str], default=None
+         Protocol for YOLOX inference. If None, the default from the table extractor
+         endpoint configuration is used.
+     paddle_protocol : Optional[str], default=None
+         Protocol for PaddleOCR inference. If None, the default from the table extractor
+         endpoint configuration is used.
+     auth_token : Optional[str], default=None
+         Authentication token for inference services. If None, the default from the table
+         extractor endpoint configuration is used.
+
+     Returns
+     -------
+     pd.DataFrame
+         The updated DataFrame after table extraction.
+
+     Raises
+     ------
+     Exception
+         If an error occurs during extraction.
+     """
+     task_config = IngestTaskTableExtraction()
+
+     config_kwargs = {
+         "endpoint_config": {
+             "yolox_endpoints": yolox_endpoints,
+             "paddle_endpoints": paddle_endpoints,
+             "yolox_infer_protocol": yolox_protocol,
+             "paddle_infer_protocol": paddle_protocol,
+             "auth_token": auth_token,
+         }
+     }
+     # Remove endpoint keys with None values so that the schema defaults are used.
+     config_kwargs["endpoint_config"] = {
+         k: v for k, v in config_kwargs["endpoint_config"].items() if v is not None
+     }
+
+     extraction_config = TableExtractorSchema(**config_kwargs)
+
+     result, _ = extract_table_data_from_image_internal(
+         df_extraction_ledger=df_ledger,
+         task_config=task_config,
+         extraction_config=extraction_config,
+         execution_trace_log=None,
+     )
+
+     return result
+
+
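A hedged sketch of the table interface, again assuming `ledger` holds table elements from an earlier extraction step; endpoints are placeholders and any None argument falls back to the schema defaults:

from nv_ingest_api.interface.extract import extract_table_data_from_image

# `ledger` is assumed to come from a prior extraction step; endpoints are placeholders.
ledger_with_tables = extract_table_data_from_image(
    df_ledger=ledger,
    yolox_endpoints=(None, "http://localhost:8000/v1/infer"),
    paddle_endpoints=(None, "http://localhost:8010/v1/infer"),
    yolox_protocol="http",
    paddle_protocol="http",
)
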
+ @unified_exception_handler
+ def extract_infographic_data_from_image(
+     *,
+     df_ledger: pd.DataFrame,
+     paddle_endpoints: Optional[Tuple[str, str]] = None,
+     paddle_protocol: Optional[str] = None,
+     auth_token: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """
+     Extract infographic data from a DataFrame using the configured infographic extraction pipeline.
+
+     This function creates a task configuration for infographic extraction, builds the extraction
+     configuration from the provided PaddleOCR endpoints, protocol, and authentication token (or uses
+     the default values from InfographicExtractorConfigSchema if None), and then calls the internal
+     extraction function to process the DataFrame. The unified exception handler decorator ensures
+     that any errors are appropriately logged and managed.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame containing the images and associated metadata from which infographic data is to be extracted.
+     paddle_endpoints : Optional[Tuple[str, str]], default=None
+         A tuple of PaddleOCR endpoint addresses (e.g., (gRPC_endpoint, HTTP_endpoint)) used for inference.
+         If None, the default endpoints from InfographicExtractorConfigSchema are used.
+     paddle_protocol : Optional[str], default=None
+         The protocol (e.g., "grpc" or "http") for PaddleOCR inference.
+         If None, the default protocol from InfographicExtractorConfigSchema is used.
+     auth_token : Optional[str], default=None
+         The authentication token required for secure access to PaddleOCR inference services.
+         If None, the default value from InfographicExtractorConfigSchema is used.
+
+     Returns
+     -------
+     pd.DataFrame
+         The updated DataFrame after infographic extraction has been performed.
+
+     Raises
+     ------
+     Exception
+         Propagates any exception raised during the extraction process, after being handled by the
+         unified exception handler.
+     """
+     task_config = {}
+
+     endpoint_config_kwargs = {
+         "paddle_endpoints": paddle_endpoints,
+         "paddle_infer_protocol": paddle_protocol,
+         "auth_token": auth_token,
+     }
+     # Remove keys with None values so that InfographicExtractorConfigSchema's defaults are used.
+     endpoint_config_kwargs = {k: v for k, v in endpoint_config_kwargs.items() if v is not None}
+
+     extraction_config = InfographicExtractorSchema(
+         endpoint_config=InfographicExtractorConfigSchema(**endpoint_config_kwargs)
+     )
+
+     result, _ = extract_infographic_data_from_image_internal(
+         df_extraction_ledger=df_ledger,
+         task_config=task_config,
+         extraction_config=extraction_config,
+         execution_trace_log=None,
+     )
+
+     return result
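
A hedged sketch of the infographic interface, assuming `ledger` holds infographic elements from an earlier extraction step; the PaddleOCR endpoint is a placeholder and None arguments fall back to the schema defaults:

from nv_ingest_api.interface.extract import extract_infographic_data_from_image

# `ledger` is assumed to come from a prior extraction step; the endpoint is a placeholder.
ledger_with_infographics = extract_infographic_data_from_image(
    df_ledger=ledger,
    paddle_endpoints=(None, "http://localhost:8010/v1/infer"),
    paddle_protocol="http",
)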