nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +0 -3
- nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
- nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
- nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
- nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api/util/service_clients/client_base.py +0 -72
- nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +0 -398
- nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +0 -152
- /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
|
@@ -1,187 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import base64
|
|
6
|
-
import functools
|
|
7
|
-
import io
|
|
8
|
-
import logging
|
|
9
|
-
from typing import Any, Optional, Dict, Union, Tuple
|
|
10
|
-
|
|
11
|
-
import pandas as pd
|
|
12
|
-
from pydantic import BaseModel
|
|
13
|
-
|
|
14
|
-
from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import python_pptx
|
|
15
|
-
from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
|
|
16
|
-
|
|
17
|
-
logger = logging.getLogger(__name__)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def _prepare_task_properties(
    base64_row: pd.Series, task_props: Union[Dict[str, Any], BaseModel]
) -> Tuple[Dict[str, Any], Optional[str]]:
    """
    Prepare and return the task properties dictionary and source identifier from a DataFrame row.

    The task properties are normalised to a plain dictionary (a Pydantic model is dumped with
    ``model_dump()``), the row data without its "content" field is stored under
    ``task_props["params"]["row_data"]``, and the row's "source_id" is looked up.

    The caller's ``task_props`` object is never modified: both the top-level mapping and the
    nested "params" mapping are copied before "row_data" is written.

    Parameters
    ----------
    base64_row : pd.Series
        A pandas Series representing a row containing base64-encoded content under the key "content"
        and optionally a "source_id".
    task_props : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing extraction instructions and parameters.

    Returns
    -------
    Tuple[Dict[str, Any], Optional[str]]
        A tuple where the first element is the prepared task properties dictionary (with "row_data" added
        under "params") and the second element is the source_id if present; otherwise, None.
    """
    # Normalise the task properties to a dictionary we own.
    if isinstance(task_props, BaseModel):
        task_props = task_props.model_dump()
    else:
        task_props = dict(task_props)

    # dict(...) above is only a shallow copy, so the nested "params" mapping would still be the
    # caller's object. The same task configuration is reused for every row of a ledger, so writing
    # into it (or popping from it later) would leak state from one row into the next. Copy it.
    # A missing or None "params" entry becomes an empty dict.
    params = task_props.get("params")
    task_props["params"] = dict(params) if params else {}

    # Store the row data, excluding the "content" field, in the parameters.
    task_props["params"]["row_data"] = base64_row.drop(labels=["content"], errors="ignore")

    # Retrieve the source identifier if available.
    source_id = base64_row.get("source_id", None)
    return task_props, source_id
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
@unified_exception_handler
def _decode_and_extract_from_pptx(
    base64_row: pd.Series,
    task_props: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    trace_info: Dict[str, Any],
) -> Any:
    """
    Decode base64-encoded PPTX content from a DataFrame row and extract data with `python_pptx`.

    The function prepares task properties (using `_prepare_task_properties`), decodes the base64 content
    into a byte stream, reads the boolean extraction flags from the task parameters, and calls
    `python_pptx` with those flags. The remaining parameters (plus the optional PPTX configuration and
    trace information) are forwarded as `extraction_config`.

    Error handling is delegated to the `unified_exception_handler` decorator, which is defined
    outside this module.

    Parameters
    ----------
    base64_row : pd.Series
        A Series containing base64-encoded PPTX content under the key "content" and optionally a "source_id".
    task_props : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing extraction instructions (may include "params").
    extraction_config : Any
        A configuration object; its `pptx_extraction_config` attribute is forwarded when it is not None.
    trace_info : Dict[str, Any]
        A dictionary with trace information; forwarded under "trace_info" when it is not None.

    Returns
    -------
    Any
        Whatever `python_pptx` returns for the decoded presentation.
    """
    # Prepare task properties; the source_id it also returns is not needed here.
    prepared_task_props, _ = _prepare_task_properties(base64_row, task_props)

    # Decode base64 content into bytes and wrap it in a BytesIO stream.
    base64_content: str = base64_row["content"]
    pptx_stream: io.BytesIO = io.BytesIO(base64.b64decode(base64_content))

    # Take a private copy of the parameters before consuming the boolean flags. The task
    # configuration is shared by every row, so popping from the shared mapping would make all
    # rows after the first see the flags as missing (i.e. False).
    extract_params: Dict[str, Any] = dict(prepared_task_props.get("params") or {})

    # pop() with a default never raises KeyError, so no exception handling is needed here.
    extract_text: bool = extract_params.pop("extract_text", False)
    extract_images: bool = extract_params.pop("extract_images", False)
    extract_tables: bool = extract_params.pop("extract_tables", False)
    extract_charts: bool = extract_params.pop("extract_charts", False)
    extract_infographics: bool = extract_params.pop("extract_infographics", False)

    # Inject additional configuration and trace information.
    pptx_extraction_config = getattr(extraction_config, "pptx_extraction_config", None)
    if pptx_extraction_config is not None:
        extract_params["pptx_extraction_config"] = pptx_extraction_config
    if trace_info is not None:
        extract_params["trace_info"] = trace_info

    # Call the PPTX extraction function.
    return python_pptx(
        pptx_stream=pptx_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extraction_config=extract_params,
        execution_trace_log=None,
    )
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
@unified_exception_handler
def extract_primitives_from_pptx_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,  # Assuming PPTXExtractorSchema or similar type
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Process a DataFrame containing base64-encoded PPTX files and extract primitive data.

    This function applies `_decode_and_extract_from_pptx` to each row of the DataFrame, explodes any
    list results into separate rows, drops missing values, and compiles the extracted entries into a
    new DataFrame with the columns "document_type", "metadata", and "uuid".

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        Input DataFrame with PPTX files in base64 encoding. Expected to include columns 'source_id' and 'content'.
    task_config : Union[Dict[str, Any], BaseModel]
        Configuration for the PPTX extraction task, as a dict or Pydantic model.
    extraction_config : Any
        Configuration object for PPTX extraction (e.g. PPTXExtractorSchema).
    execution_trace_log : Optional[Dict[str, Any]], optional
        Optional dictionary containing trace information; passed to the per-row routine as `trace_info`.

    Returns
    -------
    pd.DataFrame
        DataFrame with extracted PPTX content containing columns:
        "document_type", "metadata", and "uuid". Empty (with those columns) when nothing was extracted.

    Notes
    -----
    Error handling is delegated to the `unified_exception_handler` decorator, which is defined
    outside this module.
    """
    # An empty ledger must be handled up front: DataFrame.apply(axis=1) on an empty frame hands
    # back a DataFrame rather than a Series, and the argument-less .explode() below would then fail.
    if df_extraction_ledger.empty:
        return pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

    # Bind the per-task arguments so that apply() only has to supply the row.
    decode_and_extract_partial = functools.partial(
        _decode_and_extract_from_pptx,
        task_props=task_config,
        extraction_config=extraction_config,
        trace_info=execution_trace_log,
    )

    # Apply the decoding and extraction to each row.
    extraction_series = df_extraction_ledger.apply(decode_and_extract_partial, axis=1)

    # Explode list results into separate rows and remove missing values.
    extraction_series = extraction_series.explode().dropna()

    # Convert the series into a DataFrame with defined columns.
    if extraction_series.empty:
        return pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})

    return pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
|
|
@@ -1,110 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import logging
|
|
6
|
-
import hashlib
|
|
7
|
-
from typing import Any, Dict, Optional, List
|
|
8
|
-
|
|
9
|
-
import pandas as pd
|
|
10
|
-
|
|
11
|
-
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
12
|
-
from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
|
|
13
|
-
|
|
14
|
-
logger = logging.getLogger(__name__)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def _hash_content(x: Any, algorithm: str = "md5") -> bytes:
    """
    Compute a hash of the content using the specified algorithm.

    Parameters
    ----------
    x : dict
        A mapping holding a string under the key "content". The string is encoded with
        ``str.encode()`` before hashing.
    algorithm : str, optional
        Algorithm name passed to ``hashlib.new`` (default "md5").

    Returns
    -------
    bytes
        The raw digest of the encoded content.

    Raises
    ------
    Exception
        Any failure is logged and re-raised. When the original exception class can be built from a
        single message, a new instance of the same class carrying a contextual message is raised
        (chained to the original); otherwise the original exception is re-raised unchanged.
    """
    try:
        return hashlib.new(algorithm, x["content"].encode()).digest()
    except Exception as e:
        msg = f"hash_content: Error computing hash: {e}"
        logger.error(msg, exc_info=True)
        try:
            wrapped = type(e)(msg)
        except TypeError:
            # Some exception classes cannot be constructed from a single message, e.g. the
            # UnicodeEncodeError that str.encode() raises needs five arguments. Surface the
            # original error instead of masking it with a constructor TypeError.
            raise e from None
        raise wrapped from e
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def deduplicate_images_internal(
    df_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    mutate_config: ImageDedupSchema = ImageDedupSchema(),
    execution_trace_log: Optional[List[Any]] = None,
) -> pd.DataFrame:
    """
    Deduplicate images in a DataFrame based on content hashes.

    Rows whose 'document_type' equals ``ContentTypeEnum.IMAGE`` are hashed via `_hash_content`
    (applied to their 'metadata' value); for each distinct hash only the first row is kept. The
    result contains the surviving image rows followed by all non-image rows, with the columns in
    the same order as in `df_ledger`.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        DataFrame containing at least 'document_type' and 'metadata' columns.
    task_config : dict
        Configuration parameters. Only "hash_algorithm" (str, default "md5") is read here.
        NOTE(review): earlier documentation also described a "filter" flag (remove vs. mark
        duplicates), but this implementation never reads it; duplicates are always removed.
    mutate_config : ImageDedupSchema, optional
        Currently unused.
    execution_trace_log : Optional[List[Any]], optional
        Currently unused.

    Returns
    -------
    pd.DataFrame
        The DataFrame with duplicate images removed.

    Raises
    ------
    ValueError
        If the required columns are missing.
    Exception
        Any other error is logged and re-raised as the same exception type with added context.
    """

    _ = mutate_config  # Unused variable
    _ = execution_trace_log  # TODO(Devin): Implement trace logging

    try:
        # Verify required columns exist.
        for col in ("document_type", "metadata"):
            if col not in df_ledger.columns:
                raise ValueError(f"Missing required column '{col}'.")

        # Select image rows; with none present there is nothing to deduplicate.
        image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
        if not image_mask.any():
            return df_ledger[~image_mask]

        df_images = df_ledger.loc[image_mask]
        hash_algorithm = task_config.get("hash_algorithm", "md5")

        # Compute a content hash per image and keep the first row for every distinct hash.
        # Selecting with a positional boolean array (instead of re-looking up index labels) stays
        # correct when index labels repeat, and avoids a temporary hash column, so the original
        # column order is preserved.
        content_hashes = df_images["metadata"].apply(_hash_content, args=(hash_algorithm,))
        deduped_images = df_images.loc[~content_hashes.duplicated().to_numpy()]

        non_image_rows = df_ledger.loc[~image_mask]

        return pd.concat([deduped_images, non_image_rows], axis=0)
    except Exception as e:
        msg = f"deduplicate_images_internal: Error applying deduplication filter: {e}"
        logger.error(msg, exc_info=True)
        raise type(e)(msg) from e
|
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import logging
|
|
6
|
-
from typing import Dict, Optional, List, Any
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
9
|
-
|
|
10
|
-
from nv_ingest_api.internal.enums.common import TaskTypeEnum
|
|
11
|
-
from nv_ingest_api.internal.schemas.meta.metadata_schema import (
|
|
12
|
-
ContentTypeEnum,
|
|
13
|
-
InfoMessageMetadataSchema,
|
|
14
|
-
StatusEnum,
|
|
15
|
-
)
|
|
16
|
-
from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
|
|
17
|
-
from nv_ingest_api.util.schema.schema_validator import validate_schema
|
|
18
|
-
|
|
19
|
-
logger = logging.getLogger(__name__)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _add_info_message(x, info_msg):
    """
    Attach an info message to a metadata mapping.

    Parameters
    ----------
    x : dict
        Metadata mapping; it is modified in place.
    info_msg : Any
        Value stored under the key "info_message_metadata".

    Returns
    -------
    dict
        The same `x` object, so the function can be used with ``Series.apply``.
    """
    x["info_message_metadata"] = info_msg
    return x
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def _calculate_average_image_size(x):
    """
    Return the mean of an image's width and height.

    Parameters
    ----------
    x : dict
        Metadata mapping with "width" and "height" under the key "image_metadata".

    Returns
    -------
    float
        ``(width + height) / 2``.
    """
    width = x["image_metadata"]["width"]
    height = x["image_metadata"]["height"]
    return (width + height) / 2
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def _calculate_aspect_ratio(x):
    """
    Return the width-to-height ratio of an image.

    Parameters
    ----------
    x : dict
        Metadata mapping with "width" and "height" under the key "image_metadata".

    Returns
    -------
    float
        ``width / height``, with the divisor clamped to at least 1e-9 so that a zero height
        does not raise ZeroDivisionError.
    """
    width = x["image_metadata"]["width"]
    divisor = max(x["image_metadata"]["height"], 1e-9)
    return width / divisor
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def filter_images_internal(
    df_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    mutate_config: ImageFilterSchema = ImageFilterSchema(),
    execution_trace_log: Optional[List[Any]] = None,
) -> pd.DataFrame:
    """
    Apply an image filtering operation to a DataFrame based on average image size and aspect ratio.

    An image row passes when its average size is strictly greater than "min_size" and its aspect
    ratio lies strictly between "min_aspect_ratio" and "max_aspect_ratio". Rows that fail are either
    dropped or converted into info-message rows, depending on the "filter" flag.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        DataFrame to be filtered. Must contain 'document_type' and 'metadata' columns.
        When at least one image row is present, this DataFrame is modified in place and returned.
    task_config : dict
        Dictionary with the following keys:
            - "min_size": Minimum average image size threshold (non-negative number).
            - "max_aspect_ratio": Maximum allowed aspect ratio (positive number).
            - "min_aspect_ratio": Minimum allowed aspect ratio (positive number).
            - "filter": If True (default), rows failing the criteria are dropped; if False, they are flagged.
    mutate_config : ImageFilterSchema
        Currently unused.
    execution_trace_log : Optional[List[Any]], optional
        Currently unused.

    Returns
    -------
    pd.DataFrame
        The updated DataFrame after applying the image filter. A copy of `df_ledger` is returned
        when it contains no image rows.

    Raises
    ------
    ValueError
        If required columns are missing or if parameters are invalid.
    Exception
        Any other error is logged and re-raised as the same exception type with added context.
    """

    _ = mutate_config  # Unused variable
    _ = execution_trace_log  # TODO(Devin)

    try:
        required_columns = {"document_type", "metadata"}
        if not required_columns.issubset(df_ledger.columns):
            raise ValueError(f"DataFrame must contain columns: {required_columns}")

        min_size = task_config.get("min_size")
        max_aspect_ratio = task_config.get("max_aspect_ratio")
        min_aspect_ratio = task_config.get("min_aspect_ratio")
        filter_flag = task_config.get("filter", True)

        if not isinstance(min_size, (int, float)) or min_size < 0:
            raise ValueError("min_size must be a non-negative number")
        if not isinstance(max_aspect_ratio, (int, float)) or max_aspect_ratio <= 0:
            raise ValueError("max_aspect_ratio must be a positive number")
        if not isinstance(min_aspect_ratio, (int, float)) or min_aspect_ratio <= 0:
            raise ValueError("min_aspect_ratio must be a positive number")
        if min_aspect_ratio > max_aspect_ratio:
            raise ValueError("min_aspect_ratio cannot be greater than max_aspect_ratio")

        image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
        if not image_mask.any():
            return df_ledger.copy()

        image_metadata = df_ledger.loc[image_mask, "metadata"]
        avg_size = image_metadata.apply(_calculate_average_image_size)
        aspect_ratio = image_metadata.apply(_calculate_aspect_ratio)

        valid_mask = (avg_size > min_size) & (aspect_ratio > min_aspect_ratio) & (aspect_ratio < max_aspect_ratio)
        image_filter_mask = ~valid_mask

        # Every image passed: nothing to drop or flag.
        if not image_filter_mask.any():
            return df_ledger

        rejected_metadata = image_metadata.loc[image_filter_mask]
        # NOTE(review): rows are addressed by index label below, which presumes the ledger's index
        # labels are unique — confirm against callers.
        rejected_index = rejected_metadata.index

        if filter_flag:
            df_ledger.drop(labels=rejected_index, inplace=True)
            return df_ledger

        info_msg = {
            "task": TaskTypeEnum.FILTER.value,
            "status": StatusEnum.SUCCESS.value,
            "message": "Filtered due to image size or aspect ratio.",
            "filter": True,
        }
        # Validation is kept as a guard on the message shape; the plain dict is what gets stored.
        validate_schema(info_msg, InfoMessageMetadataSchema)

        # _add_info_message mutates each metadata mapping in place and returns it.
        flagged_metadata = rejected_metadata.apply(_add_info_message, args=(info_msg,))
        df_ledger.loc[rejected_index, "metadata"] = flagged_metadata
        df_ledger.loc[rejected_index, "document_type"] = ContentTypeEnum.INFO_MSG

        return df_ledger

    except Exception as e:
        err_msg = f"filter_images_internal: Error applying image filter. Original error: {e}"
        logger.error(err_msg, exc_info=True)
        raise type(e)(err_msg) from e
|
|
File without changes
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
from .nim_client import NimClient
|
|
6
|
-
from .nim_model_interface import ModelInterface
|
|
7
|
-
|
|
8
|
-
__all__ = ["NimClient", "ModelInterface"]
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
# Module-level default values, all prefixed YOLOX_.
# NOTE(review): the meaning of each value is inferred from its name only (batch size, image
# dimensions, class count, thresholds/scores) — confirm against the code that consumes them.
YOLOX_MAX_BATCH_SIZE = 8
YOLOX_MAX_WIDTH = 1536
YOLOX_MAX_HEIGHT = 1536
YOLOX_NUM_CLASSES = 3
YOLOX_CONF_THRESHOLD = 0.01
YOLOX_IOU_THRESHOLD = 0.5
YOLOX_MIN_SCORE = 0.1
YOLOX_FINAL_SCORE = 0.48
|