nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import inspect
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import Dict
|
|
12
|
+
from typing import List
|
|
13
|
+
from typing import Optional
|
|
14
|
+
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
from nv_ingest_api.internal.extract.pdf.engines import adobe_extractor
|
|
18
|
+
from nv_ingest_api.internal.extract.pdf.engines import llama_parse_extractor
|
|
19
|
+
from nv_ingest_api.internal.extract.pdf.engines import nemotron_parse_extractor
|
|
20
|
+
from nv_ingest_api.internal.extract.pdf.engines import pdfium_extractor
|
|
21
|
+
from nv_ingest_api.internal.extract.pdf.engines import tika_extractor
|
|
22
|
+
from nv_ingest_api.internal.extract.pdf.engines import unstructured_io_extractor
|
|
23
|
+
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
24
|
+
|
|
25
|
+
# Import extraction functions for different engines.
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# Lookup table mapping extraction method names to extractor functions.
# Note that "pdfium", "pdfium_hybrid" and "ocr" are all served by the same
# `pdfium_extractor` function; the method name itself is forwarded to the
# extractor inside `extractor_config["extract_method"]`.
EXTRACTOR_LOOKUP = {
    "adobe": adobe_extractor,
    "llama": llama_parse_extractor,
    "nemotron_parse": nemotron_parse_extractor,
    "pdfium": pdfium_extractor,
    "pdfium_hybrid": pdfium_extractor,  # Uses pdfium for native text and switches to OCR pipeline only for scanned pages.  # noqa: E501
    "tika": tika_extractor,
    "unstructured_io": unstructured_io_extractor,
    "ocr": pdfium_extractor,  # Ignores pdfium's text entirely and processes every single page through the full OCR pipeline.  # noqa: E501
}
|
|
40
|
+
|
|
41
|
+
# Overrides for the name of the method-specific configuration entry.
# Methods not listed here use the default key "<extract_method>_config";
# the methods below are served by the pdfium extractor and therefore read
# their settings from "pdfium_config".
METHOD_TO_CONFIG_KEY_MAP = {
    "pdfium_hybrid": "pdfium_config",
    "ocr": "pdfium_config",
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _work_extract_pdf(
    *,
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extract_page_as_image: bool,
    extractor_config: dict,
    execution_trace_log=None,
) -> Any:
    """
    Dispatch a decoded PDF stream to the extractor selected by the config.

    The extractor is chosen from ``EXTRACTOR_LOOKUP`` using
    ``extractor_config["extract_method"]``; method names that are not in the
    table fall back to ``pdfium_extractor``.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        The decoded PDF content.
    extract_text, extract_images, extract_infographics, extract_tables, extract_charts : bool
        Extraction flags forwarded unchanged to the extractor.
    extract_page_as_image : bool
        Forwarded only when the selected extractor declares a parameter of
        that name; otherwise a warning is logged if the flag is True.
    extractor_config : dict
        Must contain the key ``"extract_method"``; forwarded to the extractor.
    execution_trace_log : optional
        Forwarded unchanged to the extractor.

    Returns
    -------
    Any
        Whatever the selected extractor function returns.

    Raises
    ------
    KeyError
        If ``extractor_config`` has no ``"extract_method"`` entry.
    """
    extract_method = extractor_config["extract_method"]
    engine = EXTRACTOR_LOOKUP.get(extract_method, pdfium_extractor)

    call_kwargs = {
        "pdf_stream": pdf_stream,
        "extract_text": extract_text,
        "extract_images": extract_images,
        "extract_infographics": extract_infographics,
        "extract_tables": extract_tables,
        "extract_charts": extract_charts,
        "extractor_config": extractor_config,
        "execution_trace_log": execution_trace_log,
    }

    # Not every engine accepts `extract_page_as_image`, so check the signature
    # before passing it along.
    supports_page_images = "extract_page_as_image" in inspect.signature(engine).parameters
    if supports_page_images:
        call_kwargs["extract_page_as_image"] = extract_page_as_image
    if extract_page_as_image and not supports_page_images:
        logger.warning(f"`extract_page_as_image` is set to True, but {extract_method} does not support it.")

    return engine(**call_kwargs)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@unified_exception_handler
def _orchestrate_row_extraction(
    row: pd.Series,
    task_config: Dict[str, Any],
    extractor_config: Any,
    execution_trace_log: Optional[List[Any]] = None,
) -> Any:
    """
    Orchestrate extraction for a single DataFrame row.

    Decodes the row's base64 ``content`` into a PDF byte stream, merges the
    task parameters, the remaining row columns and the method-specific
    extractor configuration into one config dict, and delegates the actual
    work to ``_work_extract_pdf``.

    Parameters
    ----------
    row : pd.Series
        Must contain a ``content`` entry holding the base64-encoded document.
        All other entries are forwarded to the extractor as ``row_data``.
    task_config : Dict[str, Any]
        May contain ``"params"`` (a mapping of extraction flags and options;
        missing or ``None`` is treated as empty) and ``"method"`` (which takes
        precedence over ``params["extract_method"]``; the default is
        ``"pdfium"``). The caller's mapping is not mutated.
    extractor_config : Any
        Either an object exposing ``<method>_config`` as an attribute or a
        dict containing it as a key. If neither is found a warning is logged
        and extraction proceeds without a method-specific config.
    execution_trace_log : Optional[List[Any]]
        Forwarded unchanged to the extractor.

    Returns
    -------
    Any
        Whatever ``_work_extract_pdf`` returns.

    Raises
    ------
    KeyError
        If ``row`` has no ``content`` entry.
    Exception
        Base64 decoding failures are re-raised as the same exception type
        with a descriptive message.

    Notes
    -----
    The function is wrapped by ``unified_exception_handler``, whose handling
    of the exceptions above is not visible from this module.
    """
    if "content" not in row:
        err_msg = f"Missing 'content' key in row: {row}"
        logger.error(err_msg)
        raise KeyError(err_msg)

    try:
        pdf_stream = io.BytesIO(base64.b64decode(row["content"]))
    except Exception as e:
        err_msg = f"Error decoding base64 content: {e}"
        logger.error(err_msg, exc_info=True)
        # Preserve the original exception type while adding context.
        raise type(e)(err_msg) from e

    # Begin with a copy of the task parameters so the caller's task_config is
    # never mutated. An explicit `"params": None` is treated like a missing key.
    params = (task_config.get("params") or {}).copy()

    # Pull the boolean flags out of params; absent flags default to False.
    # (`dict.pop` with a default never raises KeyError, so no handler is needed.)
    extract_text = params.pop("extract_text", False)
    extract_images = params.pop("extract_images", False)
    extract_tables = params.pop("extract_tables", False)
    extract_charts = params.pop("extract_charts", False)
    extract_infographics = params.pop("extract_infographics", False)
    extract_page_as_image = params.pop("extract_page_as_image", False)

    # Method precedence: task_config["method"], then params["extract_method"],
    # then the "pdfium" default.
    extract_method = task_config.get("method", params.get("extract_method", "pdfium"))

    # Add row metadata (all columns except 'content') into the config.
    params["row_data"] = row.drop("content")
    params["extract_method"] = extract_method

    # Construct the config key based on the extraction method.
    config_key = METHOD_TO_CONFIG_KEY_MAP.get(extract_method, f"{extract_method}_config")

    # Handle both object and dictionary cases for extractor_config.
    if hasattr(extractor_config, config_key):
        # Object case: extractor_config exposes the config via attribute access.
        method_config = getattr(extractor_config, config_key)
    elif isinstance(extractor_config, dict) and config_key in extractor_config:
        # Dictionary case: extractor_config is a dict with key access.
        method_config = extractor_config[config_key]
    else:
        # If no matching config is found, log a warning but don't fail.
        logger.warning(f"No {config_key} found in extractor_config: {sanitize_for_logging(extractor_config)}")
        method_config = None

    # Add the method-specific config to the parameters if available.
    if method_config is not None:
        params[config_key] = method_config
        logger.debug(f"Added {config_key} to extraction parameters")

    # The resulting parameters constitute the complete extractor_config.
    final_config = params
    # Sanitizing the whole config is wasted work per row unless DEBUG is on.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f"Final extractor_config: {sanitize_for_logging(final_config)}")

    return _work_extract_pdf(
        pdf_stream=pdf_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_page_as_image=extract_page_as_image,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extractor_config=final_config,
        execution_trace_log=execution_trace_log,
    )
|