nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/transform/split_text.py
@@ -0,0 +1,157 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import os
+import copy
+import logging
+import uuid
+from typing import Any, Optional, Dict
+from typing import List
+
+import pandas as pd
+from transformers import AutoTokenizer
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
+from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+logger = logging.getLogger(__name__)
+
+
+def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]:
+    """Build documents from text chunks"""
+    documents: List[dict] = []
+
+    for i, text in enumerate(chunks):
+        if text is None or not text.strip():
+            continue
+
+        metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {}
+        metadata = copy.deepcopy(metadata)
+
+        metadata["content"] = text
+
+        documents.append({"document_type": ContentTypeEnum.TEXT.value, "metadata": metadata, "uuid": str(uuid.uuid4())})
+
+    return documents
+
+
+def _split_into_chunks(text, tokenizer, chunk_size=1024, chunk_overlap=20):
+    # Tokenize the text into token IDs
+    encoding = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True)
+
+    # Get the token IDs and offsets for splitting
+    offsets = encoding["offset_mapping"]
+
+    # Split the tokens into chunks of the desired size with the desired overlap
+    chunks = [offsets[i : i + chunk_size] for i in range(0, len(offsets), chunk_size - chunk_overlap)]
+
+    # Convert token chunks back to text while preserving original spacing and case
+    text_chunks = []
+    for chunk in chunks:
+        text_chunk = text[chunk[0][0] : chunk[-1][0]]
+        text_chunks.append(text_chunk)
+
+    return text_chunks
+
+
+@unified_exception_handler
+def transform_text_split_and_tokenize_internal(
+    df_transform_ledger: pd.DataFrame,
+    task_config: Dict[str, Any],
+    transform_config: TextSplitterSchema,
+    execution_trace_log: Optional[Dict[str, Any]],
+) -> pd.DataFrame:
+    """
+    Internal function to split and tokenize text in a ledger DataFrame.
+
+    This function extracts text from documents that match a filter criteria based on source types,
+    splits the text into chunks using the specified tokenizer, and rebuilds document records with the
+    split text. The resulting DataFrame contains both split and unsplit documents.
+
+    Parameters
+    ----------
+    df_transform_ledger : pd.DataFrame
+        DataFrame containing documents to be processed. Expected to have columns 'document_type' and
+        'metadata', where 'metadata' includes a 'content' field and nested source information.
+    task_config : dict
+        Dictionary with task-specific configuration. Expected keys include:
+            - "tokenizer": Tokenizer identifier or path.
+            - "chunk_size": Maximum number of tokens per chunk.
+            - "chunk_overlap": Number of tokens to overlap between chunks.
+            - "params": A sub-dictionary that may contain:
+                - "hf_access_token": Hugging Face access token.
+                - "split_source_types": List of source types to filter for splitting.
+    transform_config : TextSplitterSchema
+        Configuration object providing default values for text splitting parameters.
+    execution_trace_log : Optional[dict]
+        Optional dictionary for logging execution trace information; may be None.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with processed documents. Documents with text matching the filter are split into chunks,
+        and then merged with those that do not match the filter.
+
+    Raises
+    ------
+    ValueError
+        If the text splitting or tokenization process fails.
+    """
+    _ = execution_trace_log  # Placeholder for potential execution trace logging.
+
+    # Override parameters using task_config, with fallback to transform_config.
+    tokenizer_identifier: Optional[str] = task_config.get("tokenizer", transform_config.tokenizer)
+    chunk_size: int = task_config.get("chunk_size", transform_config.chunk_size)
+    chunk_overlap: int = task_config.get("chunk_overlap", transform_config.chunk_overlap)
+    params: Dict[str, Any] = task_config.get("params", {})
+
+    hf_access_token: Optional[str] = params.get("hf_access_token", None)
+    split_source_types: List[str] = params.get("split_source_types", ["text"])
+
+    logger.debug(
+        f"Splitting text with tokenizer: {tokenizer_identifier}, "
+        f"chunk_size: {chunk_size} tokens, "
+        f"chunk_overlap: {chunk_overlap}"
+    )
+
+    # Filter to documents with text content.
+    bool_index = (df_transform_ledger["document_type"] == ContentTypeEnum.TEXT) & (
+        pd.json_normalize(df_transform_ledger["metadata"])["source_metadata.source_type"].isin(split_source_types)
+    )
+    df_filtered: pd.DataFrame = df_transform_ledger.loc[bool_index]
+
+    if df_filtered.empty:
+        return df_transform_ledger
+
+    model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")
+
+    if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+        tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+    ):
+        tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+    elif os.path.exists(os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")) and (
+        tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"
+    ):
+        tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+    tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)
+
+    split_docs: List[Dict[str, Any]] = []
+    for _, row in df_filtered.iterrows():
+        content: str = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
+        chunks: List[str] = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap)
+        split_docs.extend(_build_split_documents(row, chunks))
+
+    split_docs_df: pd.DataFrame = pd.DataFrame(split_docs)
+
+    # Merge split documents with unsplit documents.
+    merged_df: pd.DataFrame = pd.concat([split_docs_df, df_transform_ledger[~bool_index]], axis=0).reset_index(
+        drop=True
+    )
+
+    result, execution_trace_log = merged_df, {}
+
+    return result
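
As a point of reference, a minimal usage sketch of the new splitter (not part of the diff). The ledger columns mirror what the function above reads; the tokenizer identifier, the bare TextSplitterSchema() defaults, and the MODEL_PREDOWNLOAD_PATH value are assumptions.

# Hypothetical usage sketch for the new split_text module; not part of the release diff.
import os
import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal

# The function consults MODEL_PREDOWNLOAD_PATH for pre-downloaded tokenizers; pointing it at an
# empty directory (assumed path) simply falls through to the Hugging Face hub identifier.
os.environ.setdefault("MODEL_PREDOWNLOAD_PATH", "/tmp/models")

ledger = pd.DataFrame(
    [
        {
            "document_type": ContentTypeEnum.TEXT,  # the filter compares against the TEXT enum member
            "metadata": {
                "content": "A long passage of text that should be chunked into smaller pieces ...",
                "source_metadata": {"source_type": "text"},  # default split_source_types is ["text"]
            },
        }
    ]
)

result = transform_text_split_and_tokenize_internal(
    df_transform_ledger=ledger,
    task_config={
        "tokenizer": "intfloat/e5-large-unsupervised",  # assumed identifier; any HF tokenizer id should work
        "chunk_size": 512,
        "chunk_overlap": 20,
        "params": {},
    },
    transform_config=TextSplitterSchema(),  # assumed to provide usable defaults
    execution_trace_log=None,
)
print(result["document_type"].value_counts())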
nv_ingest_api/util/__init__.py: File without changes
nv_ingest_api/util/control_message/__init__.py: File without changes
nv_ingest_api/util/control_message/validators.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
+
+
+def cm_ensure_payload_not_null(control_message: IngestControlMessage):
+    """
+    Ensures that the payload of a IngestControlMessage is not None.
+
+    Parameters
+    ----------
+    control_message : IngestControlMessage
+        The IngestControlMessage to check.
+
+    Raises
+    ------
+    ValueError
+        If the payload is None.
+    """
+
+    if control_message.payload() is None:
+        raise ValueError("Payload cannot be None")
+
+
+def cm_set_failure(control_message: IngestControlMessage, reason: str) -> IngestControlMessage:
+    """
+    Sets the failure metadata on a IngestControlMessage.
+
+    Parameters
+    ----------
+    control_message : IngestControlMessage
+        The IngestControlMessage to set the failure metadata on.
+    reason : str
+        The reason for the failure.
+
+    Returns
+    -------
+    control_message : IngestControlMessage
+        The modified IngestControlMessage with the failure metadata set.
+    """
+
+    control_message.set_metadata("cm_failed", True)
+    control_message.set_metadata("cm_failed_reason", reason)
+
+    return control_message
nv_ingest_api/util/converters/__init__.py: File without changes
nv_ingest_api/util/converters/bytetools.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import base64
+
+
+def bytesfromhex(hex_input):
+    """
+    Function to convert hex to bytes.
+
+    Parameters
+    ----------
+    hex_input : hex
+        Hex string to store bytes in cuDF.
+
+    Returns
+    -------
+    bytes
+        Hex encoded object converted to bytes.
+    """
+
+    return bytes.fromhex(hex_input)
+
+
+def hexfrombytes(bytes_input):
+    """
+    Function to bytes to hex string.
+
+    Parameters
+    ----------
+    bytes_input : bytes
+        Raw bytes of object.
+
+    Returns
+    -------
+    hex
+        Hex string to store bytes in cuDF.
+    """
+
+    return bytes_input.hex()
+
+
+def bytesfrombase64(base64_input):
+    """
+    Function to convert base64 encoded string to bytes.
+
+    Parameters
+    ----------
+    base64_input : hex
+        Base64 encoded string to store bytes in cuDF.
+
+    Returns
+    -------
+    bytes
+        Base64 encoded string converted to bytes.
+    """
+
+    return base64.b64decode(base64_input)
+
+
+def base64frombytes(bytes_input, encoding="utf-8"):
+    """
+    Function to bytes to base64 string.
+
+    Parameters
+    ----------
+    bytes_input : bytes
+        Raw bytes of object.
+
+    Returns
+    -------
+    base64
+        base64 encoded string to store bytes in cuDF.
+    """
+
+    return base64.b64encode(bytes_input).decode(encoding)
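
A quick round-trip check of the four helpers above (not part of the diff):

# Round-trip sanity check for the new bytetools helpers; not part of the release diff.
from nv_ingest_api.util.converters.bytetools import (
    base64frombytes,
    bytesfrombase64,
    bytesfromhex,
    hexfrombytes,
)

raw = b"nv-ingest"
assert bytesfromhex(hexfrombytes(raw)) == raw        # bytes -> hex string -> bytes
assert bytesfrombase64(base64frombytes(raw)) == raw  # bytes -> base64 string -> bytes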
nv_ingest_api/util/converters/containers.py
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+from typing import Any
+from typing import Dict
+
+logger = logging.getLogger(__name__)
+
+
+def merge_dict(defaults: Dict[str, Any], overrides: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Recursively merges two dictionaries, with values from the `overrides` dictionary taking precedence.
+
+    This function merges the `overrides` dictionary into the `defaults` dictionary. If a key in both dictionaries
+    has a dictionary as its value, the function will recursively merge those dictionaries. Otherwise, the value
+    from the `overrides` dictionary will overwrite the value in the `defaults` dictionary.
+
+    Parameters
+    ----------
+    defaults : dict of {str: Any}
+        The default dictionary that will be updated with values from the `overrides` dictionary.
+    overrides : dict of {str: Any}
+        The dictionary containing values that will override or extend those in the `defaults` dictionary.
+
+    Returns
+    -------
+    dict of {str: Any}
+        The merged dictionary, with values from the `overrides` dictionary taking precedence.
+
+    Examples
+    --------
+    >>> defaults = {
+    ...     "a": 1,
+    ...     "b": {
+    ...         "c": 3,
+    ...         "d": 4
+    ...     },
+    ...     "e": 5
+    ... }
+    >>> overrides = {
+    ...     "b": {
+    ...         "c": 30
+    ...     },
+    ...     "f": 6
+    ... }
+    >>> result = merge_dict(defaults, overrides)
+    >>> result
+    {'a': 1, 'b': {'c': 30, 'd': 4}, 'e': 5, 'f': 6}
+
+    Notes
+    -----
+    - The `merge_dict` function modifies the `defaults` dictionary in place. If you need to preserve the original
+      `defaults` dictionary, consider passing a copy instead.
+    - This function is particularly useful when combining configuration dictionaries where certain settings should
+      override defaults.
+    """
+    for key, value in overrides.items():
+        if isinstance(value, dict) and value:
+            defaults[key] = merge_dict(defaults.get(key, {}), value)
+        else:
+            defaults[key] = overrides[key]
+    return defaults
nv_ingest_api/util/converters/datetools.py
@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+from datetime import datetime
+from datetime import timezone
+
+from dateutil.parser import parse
+
+from nv_ingest_api.util.exception_handlers.converters import datetools_exception_handler
+
+
+@datetools_exception_handler
+def datetimefrompdfmeta(pdf_formated_date: str, keep_tz: bool = False) -> str:
+    """
+    Convert PDF metadata formatted date string to a datetime object.
+
+    Parameters
+    ----------
+    pdf_formated_date : str
+        A date string in standard PDF metadata format.
+        Example: `str("D:20211222141131-07'00'")`
+    keep_tz : bool, optional
+        Keep or remove the timezone attribute of the parsed datetime object. If `False` (necessary for arrow format),
+        the timezone offset will be added to the datetime. Parsed datetimes will be in the same local time.
+
+    Returns
+    -------
+    str
+        A datetime object parsed from the input date string in ISO 8601 format.
+
+    """
+
+    try:
+        # standard pdf date format
+        pattern = "D:%Y%m%d%H%M%S%z"
+        # clean up date string
+        cleaned_date_string = pdf_formated_date[:-1].replace("'", ":")
+        parsed_dt_tz = datetime.strptime(cleaned_date_string, pattern)
+    except ValueError:
+        parsed_dt_tz = parse(pdf_formated_date, fuzzy=True)
+
+    if not keep_tz:
+        return remove_tz(parsed_dt_tz).isoformat()
+
+    return parsed_dt_tz.isoformat()
+
+
+def remove_tz(datetime_obj: datetime) -> datetime:
+    """
+    Remove timezone and add offset to a datetime object.
+
+    Parameters
+    ----------
+    datetime_obj : datetime.datetime
+        A datetime object with or without the timezone attribute set.
+
+    Returns
+    -------
+    datetime.datetime
+        A datetime object with the timezone offset added and the timezone attribute removed.
+
+    """
+
+    if datetime_obj.tzinfo is not None:  # If timezone info is present
+        # Convert to UTC
+        datetime_obj = datetime_obj.astimezone(timezone.utc)
+        # Remove timezone information
+        datetime_obj = datetime_obj.replace(tzinfo=None)
+
+    return datetime_obj
+
+
+def validate_iso8601(date_string: str) -> None:
+    """
+    Verify that the given date string is in ISO 8601 format.
+
+    Parameters
+    ----------
+    date_string : str
+        A date string in human-readable format, ideally ISO 8601.
+
+    Raises
+    ------
+    ValueError
+        If the date string is not in a valid ISO 8601 format.
+    """
+
+    assert datetime.fromisoformat(date_string)
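
A small usage sketch of the date helpers (not part of the diff), using the PDF date from the docstring example above; the printed value assumes the offset is folded into UTC as described and that the exception-handler decorator passes the result through on success:

# Usage sketch for the new datetools helpers; not part of the release diff.
from nv_ingest_api.util.converters.datetools import datetimefrompdfmeta, validate_iso8601

iso = datetimefrompdfmeta("D:20211222141131-07'00'")  # keep_tz defaults to False
validate_iso8601(iso)                                  # raises if the string is not valid ISO 8601
print(iso)  # expected: 2021-12-22T21:11:31 (14:11:31 at -07:00 shifted to UTC, tzinfo dropped)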
nv_ingest_api/util/converters/dftools.py
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import io
+import json
+
+import fastparquet
+import pandas as pd
+
+import cudf
+
+
+class MemoryFiles:
+    def __init__(self):
+        self.output = {}
+
+    def open(self, fn, mode="rb"):
+        if mode != "wb":
+            try:
+                self.output[fn].seek(0)
+            except KeyError:
+                raise FileNotFoundError
+            return self.output[fn]
+
+        i = io.BytesIO()
+        self.output[fn] = i
+        self.output[fn].close = lambda: None
+
+        return i
+
+
+def pandas_to_cudf(
+    df: pd.DataFrame,
+    deserialize_cols: list = [],
+    default_cols: dict = {"document_type": str, "metadata": str},
+    default_type: type = str,
+) -> cudf.DataFrame:
+    """
+    Helper function to convert from pandas to cudf until https://github.com/apache/arrow/pull/40412 is resolved.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        A pandas dataframe.
+    Returns
+    -------
+    cudf.DataFrame
+        A cuDF dataframe.
+    """
+
+    if not df.empty:
+        files = MemoryFiles()
+        for col in deserialize_cols:
+            df[col] = df[col].apply(lambda x: json.loads(x))
+        df = pd.concat([df, df.iloc[0:1]], axis=0)
+
+        fastparquet.write("_", df, open_with=files.open, compression="UNCOMPRESSED", object_encoding="json")
+
+        with files.output["_"] as bytes_buf:
+            gdf = cudf.read_parquet(bytes_buf).iloc[:-1]
+            gdf.index.name = None
+
+        return gdf
+    else:
+        gdf = cudf.DataFrame({col: [] for col in default_cols})
+        for col in df.columns:
+            field_type = default_cols.get(col, default_type)
+            gdf[col] = gdf[col].astype(field_type)
+
+        return gdf
+
+
+def cudf_to_pandas(gdf: cudf.DataFrame, deserialize_cols: list = []) -> pd.DataFrame:
+    """
+    Helper function to convert from cudf to pandas until https://github.com/apache/arrow/pull/40412 is resolved.
+
+    Parameters
+    ----------
+    gdf : cudf.DataFrame
+        A cuDF dataframe.
+    nested_cols : list
+        A list of columns containing nested data.
+    Returns
+    -------
+    pd.DataFrame
+        A pandas dataframe.
+    """
+
+    with io.BytesIO() as bytes_buf:
+        gdf.to_parquet(bytes_buf)
+        df = pd.read_parquet(bytes_buf, engine="fastparquet", index=None)
+
+    for col in deserialize_cols:
+        if col in df.columns:
+            df[col] = df[col].apply(lambda x: json.loads(x))
+
+    return df
+
+
+def cudf_to_json(gdf: cudf.DataFrame, deserialize_cols: list = []) -> str:
+    """
+    Helper function to convert from cudf to json until https://github.com/apache/arrow/pull/40412 is resolved.
+
+    Parameters
+    ----------
+    gdf : cudf.DataFrame
+        A cuDF dataframe.
+    nested_cols : list
+        A list of columns containing nested data.
+    Returns
+    -------
+    str
+        A JSON formated string.
+    """
+
+    records = []
+    dict_vals = cudf_to_pandas(gdf).to_dict(orient="records")
+    for d in dict_vals:
+        temp = {}
+        for key, val in d.items():
+            if key in deserialize_cols:
+                val = json.loads(val)
+            temp[key] = val
+        records.append(temp)
+
+    return records
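
A hedged sketch of the pandas/cuDF converters above (not part of the diff). It assumes a GPU environment with cudf and fastparquet installed; the column names follow the module's defaults, and exact round-trip fidelity of the JSON column is not guaranteed here.

# Hypothetical round trip through the new dftools helpers; requires cuDF (GPU) and fastparquet.
import json

import pandas as pd

from nv_ingest_api.util.converters.dftools import cudf_to_pandas, pandas_to_cudf

df = pd.DataFrame(
    {
        "document_type": ["text"],
        "metadata": [json.dumps({"content": "hello"})],  # JSON-serialized, as the defaults suggest
    }
)

gdf = pandas_to_cudf(df, deserialize_cols=["metadata"])       # pandas -> cuDF via an in-memory parquet file
df_back = cudf_to_pandas(gdf, deserialize_cols=["metadata"])  # cuDF -> pandas, re-parsing JSON columns
print(df_back.head())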
nv_ingest_api/util/converters/formats.py
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# pylint: skip-file
+
+import json
+
+
+def ingest_json_results_to_blob(result_content):
+    """
+    Parse a JSON string or BytesIO object, combine and sort entries, and create a blob string.
+
+    Returns:
+        str: The generated blob string.
+    """
+    try:
+        # Load the JSON data
+        data = json.loads(result_content) if isinstance(result_content, str) else json.loads(result_content)
+        data = data["data"]
+
+        # Smarter sorting: by page, then structured objects by x0, y0
+        def sorting_key(entry):
+            page = entry["metadata"]["content_metadata"]["page_number"]
+            if entry["document_type"] == "structured":
+                # Use table location's x0 and y0 as secondary keys
+                x0 = entry["metadata"]["table_metadata"]["table_location"][0]
+                y0 = entry["metadata"]["table_metadata"]["table_location"][1]
+            else:
+                # Non-structured objects are sorted after structured ones
+                x0 = float("inf")
+                y0 = float("inf")
+            return page, x0, y0
+
+        data.sort(key=sorting_key)
+
+        # Initialize the blob string
+        blob = []
+
+        for entry in data:
+            document_type = entry.get("document_type", "")
+
+            if document_type == "structured":
+                # Add table content to the blob
+                blob.append(entry["metadata"]["table_metadata"]["table_content"])
+                blob.append("\n")
+
+            elif document_type == "text":
+                # Add content to the blob
+                blob.append(entry["metadata"]["content"])
+                blob.append("\n")
+
+            elif document_type == "image":
+                # Add image caption to the blob
+                caption = entry["metadata"]["image_metadata"].get("caption", "")
+                blob.append(f"image_caption:[{caption}]")
+                blob.append("\n")
+
+        # Join all parts of the blob into a single string
+        return "".join(blob)
+
+    except Exception as e:
+        print(f"[ERROR] An error occurred while processing JSON content: {e}")
+        return ""
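
A minimal payload for the blob builder above (not part of the diff); the field layout mirrors exactly what the function reads:

# Usage sketch for ingest_json_results_to_blob; not part of the release diff.
import json

from nv_ingest_api.util.converters.formats import ingest_json_results_to_blob

payload = json.dumps(
    {
        "data": [
            {
                "document_type": "text",
                "metadata": {
                    "content": "Hello world.",
                    "content_metadata": {"page_number": 0},
                },
            }
        ]
    }
)

print(ingest_json_results_to_blob(payload))  # -> "Hello world.\n"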
nv_ingest_api/util/converters/type_mappings.py
@@ -0,0 +1,27 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+
+DOC_TO_CONTENT_MAP = {
+    DocumentTypeEnum.BMP: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.DOCX: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.HTML: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.JPEG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.MP3: ContentTypeEnum.AUDIO,
+    DocumentTypeEnum.PDF: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.PNG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.PPTX: ContentTypeEnum.STRUCTURED,
+    DocumentTypeEnum.SVG: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.TIFF: ContentTypeEnum.IMAGE,
+    DocumentTypeEnum.TXT: ContentTypeEnum.TEXT,
+    DocumentTypeEnum.WAV: ContentTypeEnum.AUDIO,
+}
+
+
+def doc_type_to_content_type(doc_type: DocumentTypeEnum) -> ContentTypeEnum:
+    """
+    Convert DocumentTypeEnum to ContentTypeEnum
+    """
+    return DOC_TO_CONTENT_MAP[doc_type]
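
And a one-line check of the new mapping (not part of the diff):

# Usage sketch for doc_type_to_content_type; not part of the release diff.
from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type

assert doc_type_to_content_type(DocumentTypeEnum.PDF) == ContentTypeEnum.STRUCTURED
# Document types missing from DOC_TO_CONTENT_MAP raise KeyError, since the lookup is a plain dict access.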