nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import uuid
|
|
8
|
+
from typing import Optional, Dict, Any, Union, Tuple, List
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
13
|
+
from nv_ingest_api.internal.schemas.meta.metadata_schema import MetadataSchema
|
|
14
|
+
from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
|
|
15
|
+
from nv_ingest_api.util.schema.schema_validator import validate_schema
|
|
16
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
17
|
+
|
|
18
|
+
from markitdown.converters import HtmlConverter
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@unified_exception_handler
def _convert_html(row: pd.Series, execution_trace_log: Optional[List[Any]] = None):
    """
    Convert the HTML held in one ledger row and wrap it as a validated text entry.

    Parameters
    ----------
    row : pd.Series
        A ledger row; its "content" entry is read as the raw HTML and its "metadata"
        entry as the metadata mapping to update.
    execution_trace_log : Optional[List[Any]], default=None
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    list
        A single-element list holding ``[ContentTypeEnum.TEXT, metadata_dict, uuid_str]``,
        where ``metadata_dict`` is the metadata validated against ``MetadataSchema`` and
        dumped back to a plain dict, and ``uuid_str`` is a freshly generated UUID4 string.
    """
    metadata = row.get("metadata")
    raw_html = row.get("content")

    # Only rows that actually carry HTML get their metadata content replaced;
    # the metadata object is updated in place.
    if raw_html:
        conversion = HtmlConverter().convert_string(html_content=raw_html)
        metadata["content"] = conversion.text_content

    validated_metadata = validate_schema(metadata, MetadataSchema).model_dump()
    entry = [ContentTypeEnum.TEXT, validated_metadata, str(uuid.uuid4())]
    return [entry]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_markdown_from_html_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    extraction_config: HtmlExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Union[Dict, None]]:
    """
    Processes a pandas DataFrame containing HTML file content, extracting html as text from
    each document and converting it to markdown.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing html files as raw text. Each row is handed to
        ``_convert_html``, which reads its 'content' and 'metadata' entries.
    task_config : Dict[str, Any]
        Configuration instructions for the document processing task. Not used by this
        function.
    extraction_config : HtmlExtractorSchema
        A configuration object for document extraction. Not used by this function.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Optional trace information; forwarded unchanged to ``_convert_html``.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A DataFrame with the columns "document_type", "metadata", and "uuid" (one row per
        extracted entry, or no rows when nothing was extracted), and an empty dict.

    Raises
    ------
    Exception
        Whatever ``_convert_html`` (through its exception-handling decorator) lets propagate.
    """
    result_columns = ["document_type", "metadata", "uuid"]

    # An empty ledger has nothing to convert. Guard explicitly: DataFrame.apply(axis=1) on an
    # empty frame may hand back a DataFrame instead of a Series, and DataFrame.explode()
    # without a column argument would then raise.
    if df_extraction_ledger.empty:
        return pd.DataFrame({column: [] for column in result_columns}), {}

    # Apply the _convert_html function to each row in the DataFrame.
    sr_extraction = df_extraction_ledger.apply(lambda row: _convert_html(row, execution_trace_log), axis=1)

    # Explode any list results and drop missing values.
    sr_extraction = sr_extraction.explode().dropna()

    # Convert the extraction results to a DataFrame if available.
    if not sr_extraction.empty:
        extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=result_columns)
    else:
        extracted_df = pd.DataFrame({column: [] for column in result_columns})

    return extracted_df, {}
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from typing import Any, Union
|
|
8
|
+
from typing import Dict
|
|
9
|
+
from typing import List
|
|
10
|
+
from typing import Optional
|
|
11
|
+
from typing import Tuple
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
17
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskChartExtraction
|
|
18
|
+
from nv_ingest_api.util.image_processing.table_and_chart import join_yolox_graphic_elements_and_ocr_output
|
|
19
|
+
from nv_ingest_api.util.image_processing.table_and_chart import process_yolox_graphic_elements
|
|
20
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.ocr import PaddleOCRModelInterface
|
|
21
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.ocr import NemoRetrieverOCRModelInterface
|
|
22
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.ocr import get_ocr_model_name
|
|
23
|
+
from nv_ingest_api.internal.primitives.nim import NimClient
|
|
24
|
+
from nv_ingest_api.internal.primitives.nim.model_interface.yolox import YoloxGraphicElementsModelInterface
|
|
25
|
+
from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
|
|
26
|
+
from nv_ingest_api.util.nim import create_inference_client
|
|
27
|
+
|
|
28
|
+
PADDLE_MIN_WIDTH = 32
|
|
29
|
+
PADDLE_MIN_HEIGHT = 32
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(f"ray.{__name__}")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _filter_valid_chart_images(
|
|
35
|
+
base64_images: List[str],
|
|
36
|
+
) -> Tuple[List[str], List[np.ndarray], List[int], List[Tuple[str, Optional[Dict]]]]:
|
|
37
|
+
"""
|
|
38
|
+
Filter base64-encoded images based on minimum dimensions for chart extraction.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
- valid_images: Base64 strings meeting size requirements.
|
|
42
|
+
- valid_arrays: Corresponding numpy arrays.
|
|
43
|
+
- valid_indices: Original indices of valid images.
|
|
44
|
+
- results: Initial results list where invalid images are set to (img, None).
|
|
45
|
+
"""
|
|
46
|
+
results: List[Tuple[str, Optional[Dict]]] = [("", None)] * len(base64_images)
|
|
47
|
+
valid_images: List[str] = []
|
|
48
|
+
valid_arrays: List[np.ndarray] = []
|
|
49
|
+
valid_indices: List[int] = []
|
|
50
|
+
|
|
51
|
+
for i, img in enumerate(base64_images):
|
|
52
|
+
array = base64_to_numpy(img)
|
|
53
|
+
height, width = array.shape[0], array.shape[1]
|
|
54
|
+
if width >= PADDLE_MIN_WIDTH and height >= PADDLE_MIN_HEIGHT:
|
|
55
|
+
valid_images.append(img)
|
|
56
|
+
valid_arrays.append(array)
|
|
57
|
+
valid_indices.append(i)
|
|
58
|
+
else:
|
|
59
|
+
# Image is too small; mark as skipped.
|
|
60
|
+
results[i] = (img, None)
|
|
61
|
+
return valid_images, valid_arrays, valid_indices, results
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _run_chart_inference(
|
|
65
|
+
yolox_client: Any,
|
|
66
|
+
ocr_client: Any,
|
|
67
|
+
ocr_model_name: str,
|
|
68
|
+
valid_arrays: List[np.ndarray],
|
|
69
|
+
valid_images: List[str],
|
|
70
|
+
trace_info: Dict,
|
|
71
|
+
) -> Tuple[List[Any], List[Any]]:
|
|
72
|
+
"""
|
|
73
|
+
Run concurrent inference for chart extraction using YOLOX and Paddle.
|
|
74
|
+
|
|
75
|
+
Returns a tuple of (yolox_results, ocr_results).
|
|
76
|
+
"""
|
|
77
|
+
data_yolox = {"images": valid_arrays}
|
|
78
|
+
data_ocr = {"base64_images": valid_images}
|
|
79
|
+
|
|
80
|
+
future_yolox_kwargs = dict(
|
|
81
|
+
data=data_yolox,
|
|
82
|
+
model_name="yolox_ensemble",
|
|
83
|
+
stage_name="chart_extraction",
|
|
84
|
+
input_names=["INPUT_IMAGES", "THRESHOLDS"],
|
|
85
|
+
dtypes=["BYTES", "FP32"],
|
|
86
|
+
output_names=["OUTPUT"],
|
|
87
|
+
trace_info=trace_info,
|
|
88
|
+
max_batch_size=8,
|
|
89
|
+
)
|
|
90
|
+
future_ocr_kwargs = dict(
|
|
91
|
+
data=data_ocr,
|
|
92
|
+
stage_name="chart_extraction",
|
|
93
|
+
trace_info=trace_info,
|
|
94
|
+
)
|
|
95
|
+
if ocr_model_name == "paddle":
|
|
96
|
+
future_ocr_kwargs.update(
|
|
97
|
+
model_name="paddle",
|
|
98
|
+
max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
|
|
99
|
+
)
|
|
100
|
+
elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
|
|
101
|
+
future_ocr_kwargs.update(
|
|
102
|
+
model_name=ocr_model_name,
|
|
103
|
+
input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
|
|
104
|
+
output_names=["OUTPUT"],
|
|
105
|
+
dtypes=["BYTES", "BYTES"],
|
|
106
|
+
merge_level="paragraph",
|
|
107
|
+
)
|
|
108
|
+
else:
|
|
109
|
+
raise ValueError(f"Unknown OCR model name: {ocr_model_name}")
|
|
110
|
+
|
|
111
|
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
|
112
|
+
future_yolox = executor.submit(yolox_client.infer, **future_yolox_kwargs)
|
|
113
|
+
future_ocr = executor.submit(ocr_client.infer, **future_ocr_kwargs)
|
|
114
|
+
|
|
115
|
+
try:
|
|
116
|
+
yolox_results = future_yolox.result()
|
|
117
|
+
except Exception as e:
|
|
118
|
+
logger.error(f"Error calling yolox_client.infer: {e}", exc_info=True)
|
|
119
|
+
raise
|
|
120
|
+
|
|
121
|
+
try:
|
|
122
|
+
ocr_results = future_ocr.result()
|
|
123
|
+
except Exception as e:
|
|
124
|
+
logger.error(f"Error calling ocr_client.infer: {e}", exc_info=True)
|
|
125
|
+
raise
|
|
126
|
+
|
|
127
|
+
return yolox_results, ocr_results
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _validate_chart_inference_results(
|
|
131
|
+
yolox_results: Any,
|
|
132
|
+
ocr_results: Any,
|
|
133
|
+
valid_arrays: List[Any],
|
|
134
|
+
valid_images: List[str],
|
|
135
|
+
) -> Tuple[List[Any], List[Any]]:
|
|
136
|
+
"""
|
|
137
|
+
Ensure inference results are lists and have expected lengths.
|
|
138
|
+
|
|
139
|
+
Raises:
|
|
140
|
+
ValueError if results do not match expected types or lengths.
|
|
141
|
+
"""
|
|
142
|
+
if not (isinstance(yolox_results, list) and isinstance(ocr_results, list)):
|
|
143
|
+
raise ValueError("Expected list results from both yolox_client and ocr_client infer calls.")
|
|
144
|
+
|
|
145
|
+
if len(yolox_results) != len(valid_arrays):
|
|
146
|
+
raise ValueError(f"Expected {len(valid_arrays)} yolox results, got {len(yolox_results)}")
|
|
147
|
+
if len(ocr_results) != len(valid_images):
|
|
148
|
+
raise ValueError(f"Expected {len(valid_images)} ocr results, got {len(ocr_results)}")
|
|
149
|
+
return yolox_results, ocr_results
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _merge_chart_results(
|
|
153
|
+
base64_images: List[str],
|
|
154
|
+
valid_indices: List[int],
|
|
155
|
+
yolox_results: List[Any],
|
|
156
|
+
ocr_results: List[Any],
|
|
157
|
+
initial_results: List[Tuple[str, Optional[Dict]]],
|
|
158
|
+
) -> List[Tuple[str, Optional[Dict]]]:
|
|
159
|
+
"""
|
|
160
|
+
Merge inference results into the initial results list using the original indices.
|
|
161
|
+
|
|
162
|
+
For each valid image, processes the results from both inference calls and updates the
|
|
163
|
+
corresponding entry in the results list.
|
|
164
|
+
"""
|
|
165
|
+
for idx, (yolox_res, ocr_res) in enumerate(zip(yolox_results, ocr_results)):
|
|
166
|
+
# Unpack ocr result into bounding boxes and text predictions.
|
|
167
|
+
bounding_boxes, text_predictions, _ = ocr_res
|
|
168
|
+
yolox_elements = join_yolox_graphic_elements_and_ocr_output(yolox_res, bounding_boxes, text_predictions)
|
|
169
|
+
chart_content = process_yolox_graphic_elements(yolox_elements)
|
|
170
|
+
original_index = valid_indices[idx]
|
|
171
|
+
initial_results[original_index] = (base64_images[original_index], chart_content)
|
|
172
|
+
return initial_results
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _update_chart_metadata(
    base64_images: List[str],
    yolox_client: Any,
    ocr_client: Any,
    ocr_model_name: str,
    trace_info: Dict,
    worker_pool_size: int = 8,  # Not currently used.
) -> List[Tuple[str, Optional[Dict]]]:
    """
    Given a list of base64-encoded chart images, concurrently call both the YOLOX and OCR
    inference services to extract chart data.

    For each base64-encoded image, returns:
      (original_image_str, joined_chart_content_dict)

    Images that do not meet minimum size requirements are marked as skipped, i.e. their
    entry is ``(original_image_str, None)``. When no image passes the size filter, no
    inference request is issued at all.

    Raises:
      ValueError if the inference results are not lists of the expected length, or if the
      OCR model name is not recognised by the inference step.
    """
    logger.debug("Running chart extraction using updated concurrency handling.")

    # Initialize results with placeholders and filter valid images.
    valid_images, valid_arrays, valid_indices, results = _filter_valid_chart_images(base64_images)

    # Nothing survived the size filter: every slot is already marked as skipped, so avoid
    # spinning up the thread pool and sending empty payloads to both inference services.
    if not valid_images:
        return results

    # Run concurrent inference only for valid images.
    yolox_results, ocr_results = _run_chart_inference(
        yolox_client=yolox_client,
        ocr_client=ocr_client,
        ocr_model_name=ocr_model_name,
        valid_arrays=valid_arrays,
        valid_images=valid_images,
        trace_info=trace_info,
    )

    # Validate that the returned inference results are lists of the expected length.
    yolox_results, ocr_results = _validate_chart_inference_results(
        yolox_results, ocr_results, valid_arrays, valid_images
    )

    # Merge the inference results into the results list.
    return _merge_chart_results(base64_images, valid_indices, yolox_results, ocr_results, results)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _create_yolox_client(
    yolox_endpoints: Tuple[str, str],
    yolox_protocol: str,
    auth_token: str,
) -> NimClient:
    """
    Build the inference client used for YOLOX graphic-element detection.

    A fresh ``YoloxGraphicElementsModelInterface`` is created and passed to
    ``create_inference_client`` together with the given endpoints, auth token and
    inference protocol; the resulting client is returned.
    """
    return create_inference_client(
        endpoints=yolox_endpoints,
        model_interface=YoloxGraphicElementsModelInterface(),
        auth_token=auth_token,
        infer_protocol=yolox_protocol,
    )
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _create_ocr_client(
    ocr_endpoints: Tuple[str, str],
    ocr_protocol: str,
    ocr_model_name: str,
    auth_token: str,
) -> NimClient:
    """
    Build the OCR inference client that matches the given OCR model name.

    The "scene_text_ensemble", "scene_text_wrapper" and "scene_text_python" models use
    ``NemoRetrieverOCRModelInterface`` with dynamic batching enabled; any other model name
    uses ``PaddleOCRModelInterface`` with dynamic batching disabled. The dynamic-batch
    memory budget passed to ``create_inference_client`` is 32 MB in both cases.
    """
    is_scene_text_model = ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}

    if is_scene_text_model:
        model_interface = NemoRetrieverOCRModelInterface()
    else:
        model_interface = PaddleOCRModelInterface()

    return create_inference_client(
        endpoints=ocr_endpoints,
        model_interface=model_interface,
        auth_token=auth_token,
        infer_protocol=ocr_protocol,
        enable_dynamic_batching=is_scene_text_model,
        dynamic_batch_memory_budget_mb=32,
    )
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def extract_chart_data_from_image_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[IngestTaskChartExtraction, Dict[str, Any]],
    extraction_config: ChartExtractorSchema,
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Extracts chart data from a DataFrame in a bulk fashion rather than row-by-row.

    Rows qualify when their "metadata" marks them as structured chart content, carries a
    non-None "table_metadata" entry and a non-empty "content" value (the base64 image).
    The extracted chart content is written into each qualifying row's
    ``metadata["table_metadata"]["table_content"]``; the DataFrame is modified in place.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        DataFrame containing the content from which chart data is to be extracted.
    task_config : Union[IngestTaskChartExtraction, Dict[str, Any]]
        Task properties and configurations. Not used by this function.
    extraction_config : ChartExtractorSchema
        The validated configuration object for chart extraction; its ``endpoint_config``
        supplies the YOLOX/OCR endpoints, protocols and auth token.
    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging. Defaults to None, in which
        case an empty dict is created.

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        The updated DataFrame and ``{"trace_info": execution_trace_log}``. The same shape
        is returned on every path, including for an empty input DataFrame.

    Raises
    ------
    Exception
        If any error occurs during the chart data extraction process; it is logged and
        re-raised.
    """
    _ = task_config  # Unused variable

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    # Keep the second return value in the same {"trace_info": ...} shape as every other
    # exit of this function so callers do not have to special-case empty ledgers.
    if df_extraction_ledger.empty:
        return df_extraction_ledger, {"trace_info": execution_trace_log}

    endpoint_config = extraction_config.endpoint_config

    # Get the grpc endpoint to determine the model if needed
    ocr_grpc_endpoint = endpoint_config.ocr_endpoints[0]
    ocr_model_name = get_ocr_model_name(ocr_grpc_endpoint)

    try:
        # 1) Identify rows that meet criteria in a single pass
        #    - metadata exists
        #    - content_metadata.type == "structured"
        #    - content_metadata.subtype == "chart"
        #    - table_metadata not None
        #    - content (the base64 image) not None or ""
        def meets_criteria(row):
            m = row.get("metadata", {})
            if not m:
                return False

            # Treat an explicit None the same as a missing content_metadata entry.
            content_md = m.get("content_metadata") or {}
            return (
                content_md.get("type") == "structured"
                and content_md.get("subtype") == "chart"
                and m.get("table_metadata") is not None
                and m.get("content") not in [None, ""]
            )

        mask = df_extraction_ledger.apply(meets_criteria, axis=1)
        valid_indices = df_extraction_ledger[mask].index.tolist()

        # If no rows meet the criteria, just return.
        if not valid_indices:
            return df_extraction_ledger, {"trace_info": execution_trace_log}

        # 2) Extract base64 images + keep track of row -> image mapping.
        #    "content" is guaranteed to be present by meets_criteria.
        base64_images = [df_extraction_ledger.at[idx, "metadata"]["content"] for idx in valid_indices]

        # 3) Call our bulk _update_chart_metadata to get all results.
        yolox_client = _create_yolox_client(
            endpoint_config.yolox_endpoints,
            endpoint_config.yolox_infer_protocol,
            endpoint_config.auth_token,
        )

        ocr_client = _create_ocr_client(
            endpoint_config.ocr_endpoints,
            endpoint_config.ocr_infer_protocol,
            ocr_model_name,
            endpoint_config.auth_token,
        )

        bulk_results = _update_chart_metadata(
            base64_images=base64_images,
            yolox_client=yolox_client,
            ocr_client=ocr_client,
            ocr_model_name=ocr_model_name,
            worker_pool_size=endpoint_config.workers_per_progress_engine,
            trace_info=execution_trace_log,
        )

        # 4) Write the results back to each row's table_metadata.
        #    bulk_results is ordered like base64_images, which was built by walking
        #    valid_indices, so position k belongs to valid_indices[k].
        for row_id, idx in enumerate(valid_indices):
            _, chart_content = bulk_results[row_id]
            df_extraction_ledger.at[idx, "metadata"]["table_metadata"]["table_content"] = chart_content

        return df_extraction_ledger, {"trace_info": execution_trace_log}

    except Exception:
        logger.error("Error occurred while extracting chart data.", exc_info=True)

        raise
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import functools
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any, Union, Tuple
|
|
11
|
+
from typing import Dict
|
|
12
|
+
from typing import List
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
from pydantic import BaseModel
|
|
17
|
+
|
|
18
|
+
from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
|
|
19
|
+
from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
|
|
20
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@unified_exception_handler
def _decode_and_extract_from_image(
    base64_row: pd.Series,
    task_config: Dict[str, Any],
    validated_extraction_config: ImageConfigSchema,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Decode base64-encoded image content from a DataFrame row and run the image extractor on it.

    The function reads the base64 string stored under "content", decodes it into an in-memory
    byte stream, assembles the extraction parameters (task "params", the remaining row data under
    "row_data", the row's "document_type", the image extraction config and optional trace info)
    and forwards everything to ``unstructured_image_extractor``.

    The parameters are assembled on a per-call copy of ``task_config["params"]``. This function is
    applied row by row with a single shared ``task_config``; popping the ``extract_*`` flags from
    the shared dictionary would leave every row after the first with all flags defaulting to
    ``False``.

    Parameters
    ----------
    base64_row : pd.Series
        A row holding the base64-encoded image under "content" and its "document_type".
        "source_id" is optional and is only used in error messages.
    task_config : Dict[str, Any]
        Task properties. The optional "params" entry may carry the boolean flags
        "extract_text", "extract_images", "extract_tables", "extract_charts" and
        "extract_infographics" (each defaults to False) plus any additional extractor parameters.
        This dictionary is not modified.
    validated_extraction_config : ImageConfigSchema
        Either the image configuration itself or an object exposing it through an
        ``image_extraction_config`` attribute; in the latter case the inner object is used.
    execution_trace_log : Optional[List[Any]], optional
        Trace information forwarded to the extractor (default is None).

    Returns
    -------
    Any
        Whatever ``unstructured_image_extractor`` returns for the decoded image.

    Raises
    ------
    KeyError
        If "document_type" or "content" is missing from `base64_row`.
    Exception
        Any other failure is logged and re-raised as the same exception type with a message that
        includes the source id.
    """

    # "document_type" is mandatory: it is indexed directly, so a missing key raises KeyError here.
    document_type: Any = base64_row["document_type"]
    source_id: Optional[Any] = None

    try:
        base64_content: str = base64_row["content"]
    except KeyError as e:
        err_msg = f"decode_and_extract: Missing 'content' key in row: {base64_row}"
        logger.error(err_msg, exc_info=True)
        raise KeyError(err_msg) from e

    try:
        # Resolve the source id first so that any later failure can be attributed to it.
        source_id = base64_row.get("source_id", None)

        # Shallow-copy the task parameters: the flags are popped below and row-specific values are
        # injected, neither of which may leak into the task_config shared across rows.
        extract_params: Dict[str, Any] = dict(task_config.get("params") or {})

        # Expose the remaining row data (everything except the raw content) to the extractor.
        extract_params["row_data"] = base64_row.drop(labels=["content"], errors="ignore")
        extract_params["document_type"] = document_type

        # Decode the base64 image content into a seekable in-memory stream.
        image_bytes: bytes = base64.b64decode(base64_content)
        image_stream: io.BytesIO = io.BytesIO(image_bytes)

        # The flags are passed as explicit keyword arguments, so remove them from the parameter
        # dictionary. pop() with a default never raises, hence no KeyError handling is needed.
        extract_text: bool = extract_params.pop("extract_text", False)
        extract_images: bool = extract_params.pop("extract_images", False)
        extract_tables: bool = extract_params.pop("extract_tables", False)
        extract_charts: bool = extract_params.pop("extract_charts", False)
        extract_infographics: bool = extract_params.pop("extract_infographics", False)

        logger.debug(
            "decode_and_extract: Extracting image content using image_extraction_config: %s",
            validated_extraction_config,
        )

        # Some callers provide a wrapper schema; unwrap its inner image_extraction_config so the
        # helper always receives the nested configuration object.
        if validated_extraction_config is not None:
            inner_cfg = getattr(validated_extraction_config, "image_extraction_config", validated_extraction_config)
            if inner_cfg is not None:
                extract_params["image_extraction_config"] = inner_cfg

        if execution_trace_log is not None:
            extract_params["trace_info"] = execution_trace_log

        extracted_data: Any = unstructured_image_extractor(
            image_stream=image_stream,
            extract_text=extract_text,
            extract_images=extract_images,
            extract_infographics=extract_infographics,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            extraction_config=extract_params,
            extraction_trace_log=execution_trace_log,
        )

        return extracted_data

    except Exception as e:
        err_msg = f"decode_and_extract: Unhandled exception for source '{source_id}'. Original error: {e}"
        logger.error(err_msg, exc_info=True)
        # Preserve the original exception type so callers can keep catching it.
        raise type(e)(err_msg) from e
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@unified_exception_handler
def extract_primitives_from_image_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Extract primitives from every base64-encoded image in a DataFrame.

    `_decode_and_extract_from_image` is applied to each row of the input DataFrame. The per-row
    results are exploded into individual entries, missing values are dropped, and the remaining
    entries are assembled into a DataFrame with the columns "document_type", "metadata" and
    "uuid".

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        Input DataFrame with one image per row. Each row is handed to
        `_decode_and_extract_from_image`, which reads "content" and "document_type" and,
        if present, "source_id".
    task_config : Union[Dict[str, Any], BaseModel]
        Instructions and parameters for the image task. A Pydantic model is converted to a
        dictionary with ``model_dump()``.
    extraction_config : Any
        Configuration object forwarded to the per-row extraction routine.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Optional trace container forwarded to the per-row routine; an empty dictionary is used
        when None is given.

    Returns
    -------
    Tuple[pd.DataFrame, Dict[str, Any]]
        The DataFrame of extracted primitives (columns "document_type", "metadata", "uuid") and
        a dictionary holding the trace container under the key "trace_info". When the input is
        empty, or nothing is extracted, the DataFrame has those columns and no rows.

    Raises
    ------
    Exception
        Any failure during processing is logged and re-raised as the same exception type with a
        message identifying the image extractor stage.
    """
    logger.debug("process_image: Processing image content")
    if execution_trace_log is None:
        execution_trace_log = {}

    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()

    try:
        # On an empty frame, DataFrame.apply(axis=1) probes the function with a dummy row and,
        # if that fails, returns a DataFrame copy instead of a Series; the subsequent explode()
        # would then raise. There is nothing to extract, so return the empty result directly.
        if df_extraction_ledger.empty:
            empty_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
            return empty_df, {"trace_info": execution_trace_log}

        # Bind the shared arguments once; apply() then supplies each row as the first argument.
        _decode_and_extract = functools.partial(
            _decode_and_extract_from_image,
            task_config=task_config,
            validated_extraction_config=extraction_config,
            execution_trace_log=execution_trace_log,
        )
        logger.debug("process_image: Processing with method: %s", task_config.get("method", None))
        sr_extraction = df_extraction_ledger.apply(_decode_and_extract, axis=1)

        # Each row may yield a list of primitives: flatten to one entry per primitive and discard
        # rows that produced nothing.
        sr_extraction = sr_extraction.explode().dropna()

        if not sr_extraction.empty:
            extracted_df = pd.DataFrame(sr_extraction.to_list(), columns=["document_type", "metadata", "uuid"])
        else:
            extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

        return extracted_df, {"trace_info": execution_trace_log}

    except Exception as e:
        err_msg = f"process_image: Unhandled exception in image extractor stage. Original error: {e}"
        logger.exception(err_msg)
        # Preserve the original exception type so callers can keep catching it.
        raise type(e)(err_msg) from e
|