nv-ingest-api 2025.4.16.dev20250416__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.16.dev20250416.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import io
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
import logging
|
|
12
|
+
|
|
13
|
+
from nv_ingest_api.internal.extract.pdf.engines import (
|
|
14
|
+
adobe_extractor,
|
|
15
|
+
llama_parse_extractor,
|
|
16
|
+
nemoretriever_parse_extractor,
|
|
17
|
+
pdfium_extractor,
|
|
18
|
+
tika_extractor,
|
|
19
|
+
unstructured_io_extractor,
|
|
20
|
+
)
|
|
21
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
22
|
+
|
|
23
|
+
# Import extraction functions for different engines.
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)

# Registry of the available PDF extraction engines, keyed by the method name
# that appears in the extractor configuration ("extract_method").
EXTRACTOR_LOOKUP: Dict[str, Any] = {
    "adobe": adobe_extractor,
    "llama": llama_parse_extractor,
    "nemoretriever_parse": nemoretriever_parse_extractor,
    "pdfium": pdfium_extractor,
    "tika": tika_extractor,
    "unstructured_io": unstructured_io_extractor,
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _work_extract_pdf(
    *,
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extractor_config: dict,
    execution_trace_log=None,
) -> Any:
    """
    Perform PDF extraction on a decoded PDF stream using the given extraction parameters.

    The extraction engine is selected from ``EXTRACTOR_LOOKUP`` using
    ``extractor_config["extract_method"]``. When the method name is not
    registered, the pdfium engine is used and a warning is logged.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        In-memory stream holding the PDF bytes.
    extract_text : bool
        Flag forwarded to the engine.
    extract_images : bool
        Flag forwarded to the engine.
    extract_infographics : bool
        Flag forwarded to the engine.
    extract_tables : bool
        Flag forwarded to the engine.
    extract_charts : bool
        Flag forwarded to the engine.
    extractor_config : dict
        Engine configuration; must contain the key ``"extract_method"``.
    execution_trace_log : optional
        Trace container forwarded unchanged to the engine.

    Returns
    -------
    Any
        Whatever the selected extractor function returns.

    Raises
    ------
    KeyError
        If ``extractor_config`` has no ``"extract_method"`` entry.
    """

    extract_method = extractor_config["extract_method"]

    extractor_fn = EXTRACTOR_LOOKUP.get(extract_method)
    if extractor_fn is None:
        # Keep the historical fallback, but make it visible: a misspelled
        # method name would otherwise be processed by pdfium without any hint.
        logger.warning(
            "Unknown PDF extract_method %r; falling back to 'pdfium'.",
            extract_method,
        )
        extractor_fn = pdfium_extractor

    # Engines are invoked positionally, in this exact argument order.
    return extractor_fn(
        pdf_stream,
        extract_text,
        extract_images,
        extract_infographics,
        extract_tables,
        extract_charts,
        extractor_config,
        execution_trace_log,
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@unified_exception_handler
def _orchestrate_row_extraction(
    row: pd.Series,
    task_config: Dict[str, Any],
    extractor_config: Any,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Orchestrate extraction for a single DataFrame row by decoding the PDF stream,
    building an extractor_config, and then delegating to the work function.

    Parameters
    ----------
    row : pd.Series
        Row to process. Must contain a ``"content"`` entry holding the
        base64-encoded PDF; all other entries are passed along as ``row_data``.
    task_config : Dict[str, Any]
        Task description. ``task_config["params"]`` (optional) supplies the
        ``extract_*`` flags (each defaulting to ``False``) and any additional
        engine parameters; ``task_config["method"]`` (optional) overrides
        ``params["extract_method"]``, which itself defaults to ``"pdfium"``.
    extractor_config : Any
        Either an object exposing ``<method>_config`` attributes or a dict with
        ``<method>_config`` keys. A missing entry is logged and skipped.
    execution_trace_log : Optional[List[Any]]
        Trace container forwarded unchanged to the work function.

    Returns
    -------
    Any
        The result returned by ``_work_extract_pdf``.

    Raises
    ------
    KeyError
        If ``row`` has no ``"content"`` entry.
    Exception
        If base64 decoding fails, an exception of the same type as the
        decoding error is raised with a descriptive message.
    """
    if "content" not in row:
        err_msg = f"Missing 'content' key in row: {row}"
        logger.error(err_msg)
        raise KeyError(err_msg)

    try:
        pdf_stream = io.BytesIO(base64.b64decode(row["content"]))
    except Exception as e:
        err_msg = f"Error decoding base64 content: {e}"
        logger.error(err_msg, exc_info=True)
        # Re-raise with the same exception type so callers can still
        # discriminate on it, while keeping the original as the cause.
        raise type(e)(err_msg) from e

    # Begin with a copy of the task parameters so the caller's dict is not mutated.
    params = task_config.get("params", {}).copy()

    # Pull the boolean flags out of params; each defaults to False when absent.
    # (pop/get with a default never raise KeyError, so no guard is needed.)
    extract_text = params.pop("extract_text", False)
    extract_images = params.pop("extract_images", False)
    extract_tables = params.pop("extract_tables", False)
    extract_charts = params.pop("extract_charts", False)
    extract_infographics = params.pop("extract_infographics", False)

    # A task-level "method" takes precedence over params["extract_method"].
    extract_method = task_config.get("method", params.get("extract_method", "pdfium"))
    params["extract_method"] = extract_method

    # Add row metadata (all columns except 'content') into the config.
    params["row_data"] = row.drop("content")

    # Construct the config key based on the extraction method.
    config_key = f"{extract_method}_config"

    # Handle both object and dictionary cases for extractor_config.
    if hasattr(extractor_config, config_key):
        # Object case: attribute access.
        method_config = getattr(extractor_config, config_key)
    elif isinstance(extractor_config, dict) and config_key in extractor_config:
        # Dictionary case: key access.
        method_config = extractor_config[config_key]
    else:
        # If no matching config is found, log a warning but don't fail.
        logger.warning("No %s found in extractor_config: %s", config_key, extractor_config)
        method_config = None

    # Add the method-specific config to the parameters if available.
    if method_config is not None:
        params[config_key] = method_config
        logger.debug("Added %s to extraction parameters", config_key)

    # The resulting parameters constitute the complete extractor_config.
    extractor_config = params
    # Lazy %-formatting: the config (including the row Series) is only
    # stringified when DEBUG logging is actually enabled.
    logger.debug("Final extractor_config: %s", extractor_config)

    return _work_extract_pdf(
        pdf_stream=pdf_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extractor_config=extractor_config,
        execution_trace_log=execution_trace_log,
    )
|