nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-api has been flagged as potentially problematic.
- nv_ingest_api/__init__.py +0 -3
- nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
- nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
- nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
- nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api/util/service_clients/client_base.py +0 -72
- nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +0 -398
- nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +0 -152
- /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
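The `{internal/primitives → primitives}` renames listed above change the import path of those modules for downstream code. A minimal import-update sketch follows; it assumes the moved module still exposes a class named `IngestControlMessage` (inferred from the module name, not confirmed by this diff):

# Hypothetical compatibility shim for the module moves shown above.
# The class name IngestControlMessage is an assumption based on the
# module name "ingest_control_message" and may differ in practice.
try:
    # Layout in 2025.4.19.dev20250419: primitives promoted to a top-level subpackage.
    from nv_ingest_api.primitives.ingest_control_message import IngestControlMessage
except ImportError:
    # Layout in 2025.4.17.dev20250417 and earlier.
    from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage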
nv_ingest_api/interface/extract.py
@@ -1,972 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import logging
-from typing import Tuple, Optional, Dict, Any
-
-import pandas as pd
-from pandas import DataFrame
-
-from . import extraction_interface_relay_constructor
-
-from nv_ingest_api.internal.extract.pdf.pdf_extractor import extract_primitives_from_pdf_internal
-from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
-from nv_ingest_api.internal.extract.docx.docx_extractor import extract_primitives_from_docx_internal
-from nv_ingest_api.internal.extract.pptx.pptx_extractor import extract_primitives_from_pptx_internal
-from nv_ingest_api.internal.extract.image.chart_extractor import extract_chart_data_from_image_internal
-from nv_ingest_api.internal.extract.image.image_extractor import extract_primitives_from_image_internal
-from nv_ingest_api.internal.extract.image.table_extractor import extract_table_data_from_image_internal
-from nv_ingest_api.internal.extract.image.infographic_extractor import extract_infographic_data_from_image_internal
-from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
-from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
-from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
-from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import (
-    InfographicExtractorConfigSchema,
-    InfographicExtractorSchema,
-)
-from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
-from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
-from nv_ingest_api.internal.schemas.meta.ingest_job_schema import (
-    IngestTaskChartExtraction,
-    IngestTaskTableExtraction,
-)
-from nv_ingest_api.internal.extract.audio.audio_extraction import extract_text_from_audio_internal
-from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
-
-logger = logging.getLogger(__name__)
-
-
-@unified_exception_handler
-@extraction_interface_relay_constructor(
-    api_fn=extract_primitives_from_pdf_internal,
-    task_keys=["extract_text", "extract_images", "extract_tables", "extract_charts", "extract_infographics"],
-)
-def extract_primitives_from_pdf(
-    *,
-    df_extraction_ledger: pd.DataFrame,  # Ledger (e.g., a pandas DataFrame)
-    extract_method: str = "pdfium",  # Determines which extraction schema to use
-    extract_text: bool = True,
-    extract_images: bool = True,
-    extract_infographics: bool = True,
-    extract_tables: bool = True,
-    extract_charts: bool = True,
-    text_depth: str = "page",
-    # Adobe-specific parameters:
-    adobe_client_id: Optional[str] = None,
-    adobe_client_secret: Optional[str] = None,
-    # LLama
-    llama_api_key: Optional[str] = None,
-    # PDFium-specific parameters:
-    yolox_auth_token: Optional[str] = None,
-    yolox_endpoints: Optional[Tuple[Optional[str], Optional[str]]] = None,
-    yolox_infer_protocol: str = "http",
-    # Nemoretriver Parse parameters:
-    nemoretriever_parse_endpoints: Optional[Tuple[str, str]] = None,
-    nemoretriever_parse_protocol: str = "http",
-    nemoretriever_parse_model_name: str = None,
-    # UnstructuredIO parameters:
-    unstructured_io_api_key: Optional[str] = None,
-    # Tika-specific parameter:
-    tika_server_url: Optional[str] = None,
-):
-    """
-    Extract text, images, tables, charts, and infographics from PDF documents.
-
-    This function serves as a unified interface for PDF primitive extraction, supporting multiple
-    extraction engines (pdfium, adobe, llama, nemoretriever_parse, unstructured_io, and tika).
-    It processes a DataFrame containing base64-encoded PDF data and returns a new DataFrame
-    with structured information about the extracted elements.
-
-    The function uses a decorator pattern to dynamically validate configuration parameters
-    and invoke the appropriate extraction pipeline. This design allows for flexible
-    engine-specific configuration while maintaining a consistent interface.
-
-    Parameters
-    ----------
-    df_extraction_ledger : pd.DataFrame
-        DataFrame containing PDF documents to process. Must include the following columns:
-        - "content" : str
-            Base64-encoded PDF data
-        - "source_id" : str
-            Unique identifier for the document
-        - "source_name" : str
-            Name of the document (filename or descriptive name)
-        - "document_type" : str or enum
-            Document type identifier (should be "pdf" or related enum value)
-        - "metadata" : Dict[str, Any]
-            Dictionary containing additional metadata about the document
-
-    extract_method : str, default "pdfium"
-        The extraction engine to use. Valid options:
-        - "pdfium" : PDFium-based extraction (default)
-        - "adobe" : Adobe PDF Services API
-        - "llama" : LlamaParse extraction
-        - "nemoretriever_parse" : NVIDIA NemoRetriever Parse
-        - "unstructured_io" : Unstructured.io extraction
-        - "tika" : Apache Tika extraction
-
-    extract_text : bool, default True
-        Whether to extract text content from the PDFs.
-
-    extract_images : bool, default True
-        Whether to extract embedded images from the PDFs.
-
-    extract_infographics : bool, default True
-        Whether to extract infographics from the PDFs.
-
-    extract_tables : bool, default True
-        Whether to extract tables from the PDFs.
-
-    extract_charts : bool, default True
-        Whether to extract charts and graphs from the PDFs.
-
-    text_depth : str, default "page"
-        Level of text granularity to extract. Options:
-        - "page" : Text extracted at page level
-        - "block" : Text extracted at block level
-        - "paragraph" : Text extracted at paragraph level
-        - "line" : Text extracted at line level
-
-    adobe_client_id : str, optional
-        Client ID for Adobe PDF Services API. Required when extract_method="adobe".
-
-    adobe_client_secret : str, optional
-        Client secret for Adobe PDF Services API. Required when extract_method="adobe".
-
-    llama_api_key : str, optional
-        API key for LlamaParse service. Required when extract_method="llama".
-
-    yolox_auth_token : str, optional
-        Authentication token for YOLOX inference services.
-
-    yolox_endpoints : tuple of (str, str), optional
-        A tuple containing (gRPC endpoint, HTTP endpoint) for YOLOX services.
-        At least one endpoint must be non-empty.
-
-    yolox_infer_protocol : str, default "http"
-        Protocol to use for YOLOX inference. Options: "http" or "grpc".
-
-    nemoretriever_parse_endpoints : tuple of (str, str), optional
-        A tuple containing (gRPC endpoint, HTTP endpoint) for NemoRetriever Parse.
-        Required when extract_method="nemoretriever_parse".
-
-    nemoretriever_parse_protocol : str, default "http"
-        Protocol to use for NemoRetriever Parse. Options: "http" or "grpc".
-
-    nemoretriever_parse_model_name : str, optional
-        Model name for NemoRetriever Parse. Default is "nvidia/nemoretriever-parse".
-
-    unstructured_io_api_key : str, optional
-        API key for Unstructured.io services. Required when extract_method="unstructured_io".
-
-    tika_server_url : str, optional
-        URL for Apache Tika server. Required when extract_method="tika".
-
-    Returns
-    -------
-    pandas.DataFrame
-        A DataFrame containing the extracted primitives with the following columns:
-        - "document_type" : Type of the extracted element (e.g., "text", "image", "table")
-        - "metadata" : Dictionary containing detailed information about the extracted element
-        - "uuid" : Unique identifier for the extracted element
-
-    Raises
-    ------
-    ValueError
-        If an unsupported extraction method is specified.
-        If required parameters for the specified extraction method are missing.
-        If the input DataFrame does not have the required structure.
-
-    KeyError
-        If required columns are missing from the input DataFrame.
-
-    RuntimeError
-        If extraction fails due to processing errors.
-
-    Notes
-    -----
-    The function uses a decorator pattern through `extraction_interface_relay_constructor`
-    which dynamically processes the parameters and validates them against the appropriate
-    configuration schema. The actual extraction work is delegated to the
-    `extract_primitives_from_pdf_internal` function.
-
-    For each extraction method, specific parameters are required:
-    - pdfium: yolox_endpoints
-    - adobe: adobe_client_id, adobe_client_secret
-    - llama: llama_api_key
-    - nemoretriever_parse: nemoretriever_parse_endpoints
-    - unstructured_io: unstructured_io_api_key
-    - tika: tika_server_url
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> import base64
-    >>>
-    >>> # Read a PDF file and encode it as base64
-    >>> with open("document.pdf", "rb") as f:
-    >>>     pdf_content = base64.b64encode(f.read()).decode("utf-8")
-    >>>
-    >>> # Create a DataFrame with the PDF content
-    >>> df = pd.DataFrame({
-    >>>     "source_id": ["doc1"],
-    >>>     "source_name": ["document.pdf"],
-    >>>     "content": [pdf_content],
-    >>>     "document_type": ["pdf"],
-    >>>     "metadata": [{"content_metadata": {"type": "document"}}]
-    >>> })
-    >>>
-    >>> # Extract primitives using PDFium
-    >>> result_df = extract_primitives_from_pdf(
-    >>>     df_extraction_ledger=df,
-    >>>     extract_method="pdfium",
-    >>>     yolox_endpoints=(None, "http://localhost:8000/v1/infer")
-    >>> )
-    >>>
-    >>> # Display the types of extracted elements
-    >>> print(result_df["document_type"].value_counts())
-    """
-    pass
-
-
-def extract_primitives_from_pdf_pdfium(
-    df_extraction_ledger: pd.DataFrame,
-    *,
-    extract_text: bool = True,
-    extract_images: bool = True,
-    extract_tables: bool = True,
-    extract_charts: bool = True,
-    extract_infographics: bool = True,
-    text_depth: str = "page",
-    yolox_auth_token: Optional[str] = None,
-    yolox_endpoints: Optional[Tuple[Optional[str], Optional[str]]] = None,
-    yolox_infer_protocol: str = "http",
-) -> pd.DataFrame:
-    """
-    Extract primitives from PDF documents using the PDFium extraction method.
-
-    A simplified wrapper around the general extract_primitives_from_pdf function
-    that defaults to using the PDFium extraction engine.
-
-    Parameters
-    ----------
-    df_extraction_ledger : pd.DataFrame
-        DataFrame containing PDF documents to process. Must include the following columns:
-        - "content" : str
-            Base64-encoded PDF data
-        - "source_id" : str
-            Unique identifier for the document
-        - "source_name" : str
-            Name of the document (filename or descriptive name)
-        - "document_type" : str or enum
-            Document type identifier (should be "pdf" or related enum value)
-        - "metadata" : Dict[str, Any]
-            Dictionary containing additional metadata about the document
-    extract_text : bool, default True
-        Whether to extract text content
-    extract_images : bool, default True
-        Whether to extract embedded images
-    extract_tables : bool, default True
-        Whether to extract tables
-    extract_charts : bool, default True
-        Whether to extract charts
-    extract_infographics : bool, default True
-        Whether to extract infographics
-    text_depth : str, default "page"
-        Level of text granularity (page, block, paragraph, line)
-    yolox_auth_token : str, optional
-        Authentication token for YOLOX inference services
-    yolox_endpoints : tuple of (str, str), optional
-        Tuple containing (gRPC endpoint, HTTP endpoint) for YOLOX services
-    yolox_infer_protocol : str, default "http"
-        Protocol to use for YOLOX inference ("http" or "grpc")
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame containing the extracted primitives
-    """
-    return extract_primitives_from_pdf(
-        df_extraction_ledger=df_extraction_ledger,
-        extract_method="pdfium",
-        extract_text=extract_text,
-        extract_images=extract_images,
-        extract_tables=extract_tables,
-        extract_charts=extract_charts,
-        extract_infographics=extract_infographics,
-        text_depth=text_depth,
-        yolox_auth_token=yolox_auth_token,
-        yolox_endpoints=yolox_endpoints,
-        yolox_infer_protocol=yolox_infer_protocol,
-    )
-
-
-def extract_primitives_from_pdf_nemoretriever_parse(
-    df_extraction_ledger: pd.DataFrame,
-    *,
-    extract_text: bool = True,
-    extract_images: bool = True,
-    extract_tables: bool = True,
-    extract_charts: bool = True,
-    extract_infographics: bool = True,
-    text_depth: str = "page",
-    yolox_auth_token: Optional[str] = None,
-    yolox_endpoints: Optional[Tuple[Optional[str], Optional[str]]] = None,
-    yolox_infer_protocol: str = "http",
-    nemoretriever_parse_endpoints: Optional[Tuple[str, str]] = None,
-    nemoretriever_parse_protocol: str = "http",
-    nemoretriever_parse_model_name: Optional[str] = None,
-) -> pd.DataFrame:
-    """
-    Extract primitives from PDF documents using the NemoRetriever Parse extraction method.
-
-    This function serves as a specialized wrapper around the general extract_primitives_from_pdf
-    function, pre-configured to use NemoRetriever Parse as the extraction engine. It processes
-    PDF documents to extract various content types including text, images, tables, charts, and
-    infographics, returning the results in a structured DataFrame.
-
-    Parameters
-    ----------
-    df_extraction_ledger : pd.DataFrame
-        DataFrame containing PDF documents to process. Must include the following columns:
-        - "content" : str
-            Base64-encoded PDF data
-        - "source_id" : str
-            Unique identifier for the document
-        - "source_name" : str
-            Name of the document (filename or descriptive name)
-        - "document_type" : str or enum
-            Document type identifier (should be "pdf" or related enum value)
-        - "metadata" : Dict[str, Any]
-            Dictionary containing additional metadata about the document
-
-    extract_text : bool, default True
-        Whether to extract text content from the PDFs. When True, the function will
-        attempt to extract and structure all textual content according to the
-        granularity specified by `text_depth`.
-
-    extract_images : bool, default True
-        Whether to extract embedded images from the PDFs. When True, the function
-        will identify, extract, and process images embedded within the document.
-
-    extract_tables : bool, default True
-        Whether to extract tables from the PDFs. When True, the function will
-        detect tabular structures and convert them into structured data.
-
-    extract_charts : bool, default True
-        Whether to extract charts and graphs from the PDFs. When True, the function
-        will detect and extract visual data representations.
-
-    extract_infographics : bool, default True
-        Whether to extract infographics from the PDFs. When True, the function will
-        identify and extract complex visual information displays.
-
-    text_depth : str, default "page"
-        Level of text granularity to extract. Options:
-        - "page" : Text extracted at page level (coarsest granularity)
-        - "block" : Text extracted at block level (groups of paragraphs)
-        - "paragraph" : Text extracted at paragraph level (semantic units)
-        - "line" : Text extracted at line level (finest granularity)
-
-    yolox_auth_token : Optional[str], default None
-        Authentication token for YOLOX inference services used for image processing.
-        Required if the YOLOX services need authentication.
-
-    yolox_endpoints : Optional[Tuple[Optional[str], Optional[str]]], default None
-        A tuple containing (gRPC endpoint, HTTP endpoint) for YOLOX services.
-        Used for image processing capabilities within the extraction pipeline.
-        Format: (grpc_endpoint, http_endpoint)
-        Example: (None, "http://localhost:8000/v1/infer")
-
-    yolox_infer_protocol : str, default "http"
-        Protocol to use for YOLOX inference. Options:
-        - "http" : Use HTTP protocol for YOLOX inference services
-        - "grpc" : Use gRPC protocol for YOLOX inference services
-
-    nemoretriever_parse_endpoints : Optional[Tuple[str, str]], default None
-        A tuple containing (gRPC endpoint, HTTP endpoint) for NemoRetriever Parse.
-        Format: (grpc_endpoint, http_endpoint)
-        Example: (None, "http://localhost:8015/v1/chat/completions")
-        Required for this extraction method.
-
-    nemoretriever_parse_protocol : str, default "http"
-        Protocol to use for NemoRetriever Parse. Options:
-        - "http" : Use HTTP protocol for NemoRetriever Parse services
-        - "grpc" : Use gRPC protocol for NemoRetriever Parse services
-
-    nemoretriever_parse_model_name : Optional[str], default None
-        Model name for NemoRetriever Parse.
-        Default is typically "nvidia/nemoretriever-parse" if None is provided.
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing the extracted primitives with the following columns:
-        - "document_type" : str
-            Type of the extracted element (e.g., "text", "image", "structured")
-        - "metadata" : Dict[str, Any]
-            Dictionary containing detailed information about the extracted element
-            including position, content, confidence scores, etc.
-        - "uuid" : str
-            Unique identifier for the extracted element
-
-    Raises
-    ------
-    ValueError
-        If `nemoretriever_parse_endpoints` is None or empty
-        If the input DataFrame does not have the required structure
-
-    KeyError
-        If required columns are missing from the input DataFrame
-
-    RuntimeError
-        If extraction fails due to service unavailability or processing errors
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> import base64
-    >>>
-    >>> # Read a PDF file and encode it as base64
-    >>> with open("document.pdf", "rb") as f:
-    >>>     pdf_content = base64.b64encode(f.read()).decode("utf-8")
-    >>>
-    >>> # Create a DataFrame with the PDF content
-    >>> df = pd.DataFrame({
-    >>>     "source_id": ["doc1"],
-    >>>     "source_name": ["document.pdf"],
-    >>>     "content": [pdf_content],
-    >>>     "document_type": ["pdf"],
-    >>>     "metadata": [{"content_metadata": {"type": "document"}}]
-    >>> })
-    >>>
-    >>> # Extract primitives using NemoRetriever Parse
-    >>> result_df = extract_primitives_from_pdf_nemoretriever_parse(
-    >>>     df_extraction_ledger=df,
-    >>>     nemoretriever_parse_endpoints=(None, "http://localhost:8015/v1/chat/completions")
-    >>> )
-    >>>
-    >>> # Display the types of extracted elements
-    >>> print(result_df["document_type"].value_counts())
-
-    Notes
-    -----
-    - NemoRetriever Parse excels at extracting structured data like tables from PDFs
-    - For optimal results, ensure both NemoRetriever Parse and YOLOX services are
-      properly configured and accessible
-    - The extraction quality may vary depending on the complexity and quality of the input PDF
-    - This function wraps the more general `extract_primitives_from_pdf` function with
-      pre-configured parameters for NemoRetriever Parse extraction
-    """
-    return extract_primitives_from_pdf(
-        df_extraction_ledger=df_extraction_ledger,
-        extract_method="nemoretriever_parse",
-        extract_text=extract_text,
-        extract_images=extract_images,
-        extract_tables=extract_tables,
-        extract_charts=extract_charts,
-        extract_infographics=extract_infographics,
-        text_depth=text_depth,
-        yolox_endpoints=yolox_endpoints,
-        yolox_auth_token=yolox_auth_token,
-        yolox_infer_protocol=yolox_infer_protocol,
-        nemoretriever_parse_endpoints=nemoretriever_parse_endpoints,
-        nemoretriever_parse_protocol=nemoretriever_parse_protocol,
-        nemoretriever_parse_model_name=nemoretriever_parse_model_name,
-    )
-
-
-@unified_exception_handler
-def extract_primitives_from_audio(
-    *,
-    df_ledger: pd.DataFrame,
-    audio_endpoints: Tuple[str, str],
-    audio_infer_protocol: str = "grpc",
-    auth_token: str = None,
-    use_ssl: bool = False,
-    ssl_cert: str = None,
-) -> Any:
-    """
-    Extract audio primitives from a ledger DataFrame using the specified audio configuration.
-
-    This function builds an extraction configuration based on the provided audio endpoints,
-    inference protocol, authentication token, and SSL settings. It then delegates the extraction
-    work to the internal function ``extract_text_from_audio_internal`` using the constructed
-    configuration and ledger DataFrame.
-
-    Parameters
-    ----------
-    df_ledger : pandas.DataFrame
-        A DataFrame containing the ledger information required for audio extraction.
-    audio_endpoints : Tuple[str, str]
-        A tuple of two strings representing the audio service endpoints gRPC and HTTP services.
-    audio_infer_protocol : str, optional
-        The protocol to use for audio inference (e.g., "grpc"). Default is "grpc".
-    auth_token : str, optional
-        Authentication token for the audio inference service. Default is an empty string.
-    use_ssl : bool, optional
-        Flag indicating whether to use SSL for secure connections. Default is False.
-    ssl_cert : str, optional
-        Path to the SSL certificate file to use if ``use_ssl`` is True. Default is an empty string.
-
-    Returns
-    -------
-    Any
-        The result of the audio extraction as returned by
-        ``extract_text_from_audio_internal``. The specific type depends on the internal implementation.
-
-    Raises
-    ------
-    Exception
-        Any exceptions raised during the extraction process will be handled by the
-        ``@unified_exception_handler`` decorator.
-
-    Examples
-    --------
-    >>> import pandas as pd
-    >>> # Create a sample DataFrame with ledger data
-    >>> df = pd.DataFrame({"audio_data": ["file1.wav", "file2.wav"]})
-    >>> result = extract_primitives_from_audio(
-    ...     df_ledger=df,
-    ...     audio_endpoints=("http://primary.endpoint", "http://secondary.endpoint"),
-    ...     audio_infer_protocol="grpc",
-    ...     auth_token="secret-token",
-    ...     use_ssl=True,
-    ...     ssl_cert="/path/to/cert.pem"
-    ... )
-    """
-    task_config: Dict[str, Any] = {"params": {"extract_audio_params": {}}}
-
-    extraction_config = AudioExtractorSchema(
-        **{
-            "audio_extraction_config": {
-                "audio_endpoints": audio_endpoints,
-                "audio_infer_protocol": audio_infer_protocol,
-                "auth_token": auth_token,
-                "ssl_cert": ssl_cert,
-                "use_ssl": use_ssl,
-            }
-        }
-    )
-
-    result, _ = extract_text_from_audio_internal(
-        df_extraction_ledger=df_ledger,
-        task_config=task_config,
-        extraction_config=extraction_config,
-        execution_trace_log=None,
-    )
-
-    return result
-
-
-@unified_exception_handler
-def extract_primitives_from_pptx(
-    *,
-    df_ledger: pd.DataFrame,
-    extract_text: bool = True,
-    extract_images: bool = True,
-    extract_tables: bool = True,
-    extract_charts: bool = True,
-    extract_infographics: bool = True,
-    yolox_endpoints: Optional[Tuple[str, str]] = None,
-    yolox_infer_protocol: str = "grpc",
-    auth_token: str = "",
-) -> pd.DataFrame:
-    """
-    Extract primitives from PPTX files provided in a DataFrame.
-
-    This function configures the PPTX extraction task by assembling a task configuration
-    dictionary using the provided parameters. It then creates an extraction configuration
-    object (e.g., an instance of PPTXExtractorSchema) and delegates the actual extraction
-    process to the internal function `extract_primitives_from_pptx_internal`.
-
-    Parameters
-    ----------
-    df_ledger : pd.DataFrame
-        A DataFrame containing base64-encoded PPTX files. The DataFrame is expected to include
-        columns such as "content" (with the base64-encoded PPTX) and "source_id".
-    extract_text : bool, default=True
-        Flag indicating whether text should be extracted from the PPTX files.
-    extract_images : bool, default=True
-        Flag indicating whether images should be extracted.
-    extract_tables : bool, default=True
-        Flag indicating whether tables should be extracted.
-    extract_charts : bool, default=True
-        Flag indicating whether charts should be extracted.
-    extract_infographics : bool, default=True
-        Flag indicating whether infographics should be extracted.
-    yolox_endpoints : Optional[Tuple[str, str]], default=None
-        Optional tuple containing endpoints for YOLOX inference, if needed for image analysis.
-    yolox_infer_protocol : str, default="grpc"
-        The protocol to use for YOLOX inference.
-    auth_token : str, default=""
-        Authentication token to be used with the PPTX extraction configuration.
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing the extracted primitives from the PPTX files. Expected columns include
-        "document_type", "metadata", and "uuid".
-
-    Notes
-    -----
-    This function is decorated with `@unified_exception_handler` to handle exceptions uniformly.
-    The task configuration is assembled with two main keys:
-    - "params": Contains boolean flags for controlling which primitives to extract.
-    - "pptx_extraction_config": Contains additional settings for PPTX extraction (e.g., YOLOX endpoints,
-      inference protocol, and auth token).
-    It then calls `extract_primitives_from_pptx_internal` with the DataFrame, the task configuration,
-    and the extraction configuration.
-    """
-    task_config: Dict[str, Any] = {
-        "params": {
-            "extract_text": extract_text,
-            "extract_images": extract_images,
-            "extract_tables": extract_tables,
-            "extract_charts": extract_charts,
-            "extract_infographics": extract_infographics,
-        },
-    }
-
-    extraction_config = PPTXExtractorSchema(
-        **{
-            "pptx_extraction_config": {
-                "yolox_endpoints": yolox_endpoints,
-                "yolox_infer_protocol": yolox_infer_protocol,
-                "auth_token": auth_token,
-            },
-        }
-    )  # Assuming PPTXExtractorSchema is defined and imported
-
-    return extract_primitives_from_pptx_internal(
-        df_extraction_ledger=df_ledger,
-        task_config=task_config,
-        extraction_config=extraction_config,
-        execution_trace_log=None,
-    )
-
-
-@unified_exception_handler
-def extract_primitives_from_docx(
-    *,
-    df_ledger: pd.DataFrame,
-    extract_text: bool = True,
-    extract_images: bool = True,
-    extract_tables: bool = True,
-    extract_charts: bool = True,
-    extract_infographics: bool = True,
-    yolox_endpoints: Optional[Tuple[str, str]] = None,
-    yolox_infer_protocol: str = "grpc",
-    auth_token: str = "",
-) -> pd.DataFrame:
-    """
-    Extract primitives from DOCX documents in a DataFrame.
-
-    This function configures and invokes the DOCX extraction process. It builds a task configuration
-    using the provided extraction flags (for text, images, tables, charts, and infographics) and additional
-    settings for YOLOX endpoints, inference protocol, and authentication. It then creates a DOCX extraction
-    configuration (an instance of DocxExtractorSchema) and delegates the extraction to an internal function.
-
-    Parameters
-    ----------
-    df_ledger : pd.DataFrame
-        The input DataFrame containing DOCX documents in base64 encoding. The DataFrame is expected to
-        include required columns such as "content" (with the base64-encoded DOCX) and optionally "source_id".
-    extract_text : bool, optional
-        Flag indicating whether to extract text content from the DOCX documents (default is True).
-    extract_images : bool, optional
-        Flag indicating whether to extract images from the DOCX documents (default is True).
-    extract_tables : bool, optional
-        Flag indicating whether to extract tables from the DOCX documents (default is True).
-    extract_charts : bool, optional
-        Flag indicating whether to extract charts from the DOCX documents (default is True).
-    extract_infographics : bool, optional
-        Flag indicating whether to extract infographics from the DOCX documents (default is True).
-    yolox_endpoints : Optional[Tuple[str, str]], optional
-        A tuple containing YOLOX inference endpoints. If None, the default endpoints defined in the
-        DOCX extraction configuration will be used.
-    yolox_infer_protocol : str, optional
-        The inference protocol to use with the YOLOX endpoints (default is "grpc").
-    auth_token : str, optional
-        The authentication token for accessing the YOLOX inference service (default is an empty string).
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing the extracted DOCX primitives. Typically, the resulting DataFrame contains
-        columns such as "document_type", "metadata", and "uuid".
-
-    Raises
-    ------
-    Exception
-        If an error occurs during the DOCX extraction process, the exception is logged and re-raised.
-    """
-    # Build the task configuration with parameters and DOCX-specific extraction settings.
-    task_config: Dict[str, Any] = {
-        "params": {
-            "extract_text": extract_text,
-            "extract_images": extract_images,
-            "extract_tables": extract_tables,
-            "extract_charts": extract_charts,
-            "extract_infographics": extract_infographics,
-        },
-    }
-
-    # Create the extraction configuration object (instance of DocxExtractorSchema).
-    extraction_config = DocxExtractorSchema(
-        **{
-            "docx_extraction_config": {
-                "yolox_endpoints": yolox_endpoints,
-                "yolox_infer_protocol": yolox_infer_protocol,
-                "auth_token": auth_token,
-            },
-        }
-    )
-
-    # Delegate the actual extraction to the internal function.
-    return extract_primitives_from_docx_internal(
-        df_extraction_ledger=df_ledger,
-        task_config=task_config,
-        extraction_config=extraction_config,
-        execution_trace_log=None,
-    )
-
-
-@unified_exception_handler
-def extract_primitives_from_image(
-    *,
-    df_ledger: pd.DataFrame,
-    extract_text: bool = True,
-    extract_images: bool = True,
-    extract_tables: bool = True,
-    extract_charts: bool = True,
-    extract_infographics: bool = True,
-    yolox_endpoints: Optional[Tuple[str, str]] = None,
-    yolox_infer_protocol: str = "grpc",
-    auth_token: str = "",
-) -> pd.DataFrame:
-    task_config: Dict[str, Any] = {
-        "params": {
-            "extract_text": extract_text,
-            "extract_images": extract_images,
-            "extract_tables": extract_tables,
-            "extract_charts": extract_charts,
-            "extract_infographics": extract_infographics,
-        },
-    }
-
-    extraction_config = ImageExtractorSchema(
-        **{
-            "image_extraction_config": {
-                "yolox_endpoints": yolox_endpoints,
-                "yolox_infer_protocol": yolox_infer_protocol,
-                "auth_token": auth_token,
-            },
-        }
-    )
-
-    result, _ = extract_primitives_from_image_internal(
-        df_extraction_ledger=df_ledger,
-        task_config=task_config,
-        extraction_config=extraction_config,
-        execution_trace_log=None,
-    )
-
-    return result
-
-
-@unified_exception_handler
-def extract_chart_data_from_image(
-    *,
-    df_ledger: pd.DataFrame,
-    yolox_endpoints: Tuple[str, str],
-    paddle_endpoints: Tuple[str, str],
-    yolox_protocol: str = "grpc",
-    paddle_protocol: str = "grpc",
-    auth_token: str = "",
-) -> DataFrame:
-    """
-    Public interface to extract chart data from ledger DataFrame.
-
-    Parameters
-    ----------
-    df_ledger : pd.DataFrame
-        DataFrame containing metadata required for chart extraction.
-    yolox_endpoints : Tuple[str, str]
-        YOLOX inference server endpoints.
-    paddle_endpoints : Tuple[str, str]
-        PaddleOCR inference server endpoints.
-    yolox_protocol : str, optional
-        Protocol for YOLOX inference (default "grpc").
-    paddle_protocol : str, optional
-        Protocol for PaddleOCR inference (default "grpc").
-    auth_token : str, optional
-        Authentication token for inference services.
-    execution_trace_log : list, optional
-        Execution trace logs.
-
-    Returns
-    -------
-    pd.DataFrame
-        Updated DataFrame after chart extraction.
-
-    Raises
-    ------
-    Exception
-        If an error occurs during extraction.
-    """
-    task_config = IngestTaskChartExtraction()
-    extraction_config = ChartExtractorSchema(
-        **{
-            "endpoint_config": {
-                "yolox_endpoints": yolox_endpoints,
-                "paddle_endpoints": paddle_endpoints,
-                "yolox_infer_protocol": yolox_protocol,
-                "paddle_infer_protocol": paddle_protocol,
-                "auth_token": auth_token,
-            }
-        }
-    )
-
-    result, _ = extract_chart_data_from_image_internal(
-        df_extraction_ledger=df_ledger,
-        task_config=task_config,
-        extraction_config=extraction_config,
-        execution_trace_log=None,
-    )
-
-    return result
-
-
-@unified_exception_handler
-def extract_table_data_from_image(
-    *,
-    df_ledger: pd.DataFrame,
-    yolox_endpoints: Optional[Tuple[str, str]] = None,
-    paddle_endpoints: Optional[Tuple[str, str]] = None,
-    yolox_protocol: Optional[str] = None,
-    paddle_protocol: Optional[str] = None,
-    auth_token: Optional[str] = None,
-) -> pd.DataFrame:
-    """
-    Public interface to extract chart data from a ledger DataFrame.
-
-    Parameters
-    ----------
-    df_ledger : pd.DataFrame
-        DataFrame containing metadata required for chart extraction.
-    yolox_endpoints : Optional[Tuple[str, str]], default=None
-        YOLOX inference server endpoints. If None, the default defined in ChartExtractorConfigSchema is used.
-    paddle_endpoints : Optional[Tuple[str, str]], default=None
-        PaddleOCR inference server endpoints. If None, the default defined in ChartExtractorConfigSchema is used.
-    yolox_protocol : Optional[str], default=None
-        Protocol for YOLOX inference. If None, the default defined in ChartExtractorConfigSchema is used.
-    paddle_protocol : Optional[str], default=None
-        Protocol for PaddleOCR inference. If None, the default defined in ChartExtractorConfigSchema is used.
-    auth_token : Optional[str], default=None
-        Authentication token for inference services. If None, the default defined in ChartExtractorConfigSchema is used.
-
-    Returns
-    -------
-    pd.DataFrame
-        - The updated DataFrame after chart extraction.
-
-    Raises
-    ------
-    Exception
-        If an error occurs during extraction.
-    """
-    task_config = IngestTaskTableExtraction()
-
-    config_kwargs = {
-        "endpoint_config": {
-            "yolox_endpoints": yolox_endpoints,
-            "paddle_endpoints": paddle_endpoints,
-            "yolox_infer_protocol": yolox_protocol,
-            "paddle_infer_protocol": paddle_protocol,
-            "auth_token": auth_token,
-        }
-    }
-    # Remove keys with None values so that ChartExtractorConfigSchema's defaults are used.
-    config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}
-
-    extraction_config = TableExtractorSchema(**config_kwargs)
-
-    result, _ = extract_table_data_from_image_internal(
-        df_extraction_ledger=df_ledger,
-        task_config=task_config,
-        extraction_config=extraction_config,
-        execution_trace_log=None,
-    )
-
-    return result
-
-
-@unified_exception_handler
-def extract_infographic_data_from_image(
-    *,
-    df_ledger: pd.DataFrame,
-    paddle_endpoints: Optional[Tuple[str, str]] = None,
-    paddle_protocol: Optional[str] = None,
-    auth_token: Optional[str] = None,
-) -> pd.DataFrame:
-    """
-    Extract infographic data from a DataFrame using the configured infographic extraction pipeline.
-
-    This function creates a task configuration for infographic extraction, builds the extraction
-    configuration from the provided PaddleOCR endpoints, protocol, and authentication token (or uses
-    the default values from InfographicExtractorConfigSchema if None), and then calls the internal
-    extraction function to process the DataFrame. The unified exception handler decorator ensures
-    that any errors are appropriately logged and managed.
-
-    Parameters
-    ----------
-    df_extraction_ledger : pd.DataFrame
-        DataFrame containing the images and associated metadata from which infographic data is to be extracted.
-    paddle_endpoints : Optional[Tuple[str, str]], default=None
-        A tuple of PaddleOCR endpoint addresses (e.g., (gRPC_endpoint, HTTP_endpoint)) used for inference.
-        If None, the default endpoints from InfographicExtractorConfigSchema are used.
-    paddle_protocol : Optional[str], default=None
-        The protocol (e.g., "grpc" or "http") for PaddleOCR inference.
-        If None, the default protocol from InfographicExtractorConfigSchema is used.
-    auth_token : Optional[str], default=None
-        The authentication token required for secure access to PaddleOCR inference services.
-        If None, the default value from InfographicExtractorConfigSchema is used.
-
-    Returns
-    -------
-    pd.DataFrame
-        The updated DataFrame after infographic extraction has been performed.
-
-    Raises
-    ------
-    Exception
-        Propagates any exception raised during the extraction process, after being handled by the
-        unified exception handler.
-    """
-
-    task_config = {}
-
-    extractor_config_kwargs = {
-        "endpoint_config": InfographicExtractorConfigSchema(
-            **{
-                "paddle_endpoints": paddle_endpoints,
-                "paddle_infer_protocol": paddle_protocol,
-                "auth_token": auth_token,
-            }
-        )
-    }
-    # Remove keys with None values so that InfographicExtractorConfigSchema's defaults are used.
-    extractor_config_kwargs = {k: v for k, v in extractor_config_kwargs.items() if v is not None}
-
-    extraction_config = InfographicExtractorSchema(**extractor_config_kwargs)
-
-    result, _ = extract_infographic_data_from_image_internal(
-        df_extraction_ledger=df_ledger,
-        task_config=task_config,
-        extraction_config=extraction_config,
-        execution_trace_log=None,
-    )
-
-    return result