nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic; review the release details below before upgrading.

Files changed (153):
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -86
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
@@ -1,187 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import base64
6
- import functools
7
- import io
8
- import logging
9
- from typing import Any, Optional, Dict, Union, Tuple
10
-
11
- import pandas as pd
12
- from pydantic import BaseModel
13
-
14
- from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import python_pptx
15
- from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
def _prepare_task_properties(
    base64_row: pd.Series, task_props: Union[Dict[str, Any], BaseModel]
) -> Tuple[Dict[str, Any], Optional[str]]:
    """
    Build the task-properties dictionary and source identifier for one ledger row.

    Task properties supplied as a Pydantic model are converted to a plain dict.
    The row's fields (minus the potentially large "content" payload) are stashed
    under ``params["row_data"]`` so downstream extraction code can reach the row
    metadata, and the row's "source_id" (if any) is returned alongside.

    Parameters
    ----------
    base64_row : pd.Series
        Row holding base64-encoded content under "content" and, optionally,
        a "source_id" entry.
    task_props : Union[Dict[str, Any], BaseModel]
        Extraction instructions/parameters as a dict or Pydantic model.

    Returns
    -------
    Tuple[Dict[str, Any], Optional[str]]
        The prepared properties dict (with "row_data" injected under "params")
        and the row's source_id, or None when the row has none.
    """
    # Normalize to a plain, mutable dictionary regardless of the input form.
    if isinstance(task_props, BaseModel):
        properties: Dict[str, Any] = task_props.model_dump()
    else:
        properties = dict(task_props)

    # Everything except the payload itself travels along as row data.
    row_without_content = base64_row.drop(labels=["content"], errors="ignore")
    properties.setdefault("params", {})["row_data"] = row_without_content

    return properties, base64_row.get("source_id", None)
60
-
61
-
62
@unified_exception_handler
def _decode_and_extract_from_pptx(
    base64_row: pd.Series,
    task_props: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    trace_info: Dict[str, Any],
) -> Any:
    """
    Decode base64-encoded PPTX content from a DataFrame row and extract data.

    The function prepares task properties (via `_prepare_task_properties`),
    decodes the base64 content into a byte stream, collects extraction flags,
    and calls the extraction function (`python_pptx`). Errors are converted to
    an exception tag by the `unified_exception_handler` decorator.

    Parameters
    ----------
    base64_row : pd.Series
        Series containing base64-encoded PPTX content under "content" and
        optionally a "source_id".
    task_props : Union[Dict[str, Any], BaseModel]
        Extraction instructions (may include a "params" mapping with the
        boolean extract_* flags).
    extraction_config : Any
        Configuration object that may carry a `pptx_extraction_config`
        attribute with PPTX-specific settings.
    trace_info : Dict[str, Any]
        Trace information for logging or debugging; forwarded via params.

    Returns
    -------
    Any
        The extracted data from the PPTX file, or an exception tag on failure.
    """
    # Prepare task properties; source_id is produced but not consumed here.
    prepared_task_props, source_id = _prepare_task_properties(base64_row, task_props)
    _ = source_id

    # Decode base64 content into bytes and wrap it in a seekable stream.
    base64_content: str = base64_row["content"]
    pptx_bytes: bytes = base64.b64decode(base64_content)
    pptx_stream: io.BytesIO = io.BytesIO(pptx_bytes)

    # Retrieve extraction parameters; boolean flags are consumed via pop().
    extract_params: Dict[str, Any] = prepared_task_props.get("params", {})
    # FIX: the previous try/except KeyError wrapper around these pops was dead
    # code -- dict.pop(key, default) never raises KeyError when a default is
    # supplied, so the "missing required extraction flag" branch could not fire.
    extract_text: bool = extract_params.pop("extract_text", False)
    extract_images: bool = extract_params.pop("extract_images", False)
    extract_tables: bool = extract_params.pop("extract_tables", False)
    extract_charts: bool = extract_params.pop("extract_charts", False)
    extract_infographics: bool = extract_params.pop("extract_infographics", False)

    # Inject additional configuration and trace information.
    if getattr(extraction_config, "pptx_extraction_config", None) is not None:
        extract_params["pptx_extraction_config"] = extraction_config.pptx_extraction_config
    if trace_info is not None:
        extract_params["trace_info"] = trace_info

    # Delegate to the PPTX extraction engine.
    extracted_data = python_pptx(
        pptx_stream=pptx_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extraction_config=extract_params,
        execution_trace_log=None,
    )

    return extracted_data
130
-
131
-
132
@unified_exception_handler
def extract_primitives_from_pptx_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,  # Assuming PPTXExtractorSchema or similar type
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Extract primitives from a ledger of base64-encoded PPTX files.

    Each row of the input DataFrame is decoded and run through the PPTX
    extraction routine; per-row list results are exploded into individual
    rows, missing values are dropped, and the remainder is assembled into a
    DataFrame with one row per extracted primitive.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        Ledger of base64-encoded PPTX files; expected to include
        'source_id' and 'content' columns.
    task_config : Union[Dict[str, Any], BaseModel]
        PPTX extraction task configuration as a dict or Pydantic model.
    extraction_config : Any
        Extraction configuration object (e.g. PPTXExtractorSchema).
    execution_trace_log : Optional[Dict[str, Any]], optional
        Trace information for debugging; forwarded to per-row extraction.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns "document_type", "metadata", and "uuid".
        Empty (but correctly shaped) when nothing was extracted.
    """
    # Decode + extract every row; each result may be a list of primitives.
    per_row_results = df_extraction_ledger.apply(
        lambda row: _decode_and_extract_from_pptx(
            row,
            task_props=task_config,
            extraction_config=extraction_config,
            trace_info=execution_trace_log,
        ),
        axis=1,
    )

    # One output row per extracted primitive; discard empty/missing results.
    flattened = per_row_results.explode().dropna()

    output_columns = ["document_type", "metadata", "uuid"]
    if flattened.empty:
        return pd.DataFrame({name: [] for name in output_columns})
    return pd.DataFrame(flattened.to_list(), columns=output_columns)
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
@@ -1,110 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- import hashlib
7
- from typing import Any, Dict, Optional, List
8
-
9
- import pandas as pd
10
-
11
- from nv_ingest_api.internal.enums.common import ContentTypeEnum
12
- from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
13
-
14
- logger = logging.getLogger(__name__)
15
-
16
-
17
- def _hash_content(x: Any, algorithm: str = "md5") -> bytes:
18
- """
19
- Compute a hash of the content using the specified algorithm.
20
-
21
- Parameters
22
- ----------
23
- x : dict
24
- A dictionary containing the content under the key "content".
25
- algorithm : str, optional
26
- Hashing algorithm to use (default "md5").
27
-
28
- Returns
29
- -------
30
- bytes
31
- The computed hash.
32
- """
33
- try:
34
- return hashlib.new(algorithm, x["content"].encode()).digest()
35
- except Exception as e:
36
- msg = f"hash_content: Error computing hash: {e}"
37
- logger.error(msg, exc_info=True)
38
- raise type(e)(msg) from e
39
-
40
-
41
def deduplicate_images_internal(
    df_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    mutate_config: ImageDedupSchema = ImageDedupSchema(),
    execution_trace_log: Optional[List[Any]] = None,
) -> pd.DataFrame:
    """
    Deduplicate images in a DataFrame based on content hashes.

    Rows whose 'document_type' is IMAGE get a content hash computed from their
    'metadata'; only the first occurrence of each hash is kept. Non-image rows
    pass through untouched.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        DataFrame containing at least 'document_type' and 'metadata' columns.
    task_config : dict
        Configuration parameters, including:
        - "hash_algorithm": str, algorithm used for hashing (default "md5").
        NOTE(review): the documented "filter" flag is not consulted here --
        duplicates are always removed; confirm against callers.
    mutate_config : ImageDedupSchema, optional
        Currently unused.
    execution_trace_log : Optional[List[Any]], optional
        Currently unused (trace logging not yet implemented).

    Returns
    -------
    pd.DataFrame
        The DataFrame with duplicate images removed, columns in the original
        ledger order.

    Raises
    ------
    ValueError
        If the required columns are missing.
    Exception
        For any other errors encountered during deduplication.
    """
    _ = mutate_config  # Unused variable
    _ = execution_trace_log  # TODO(Devin): Implement trace logging

    try:
        # Verify required columns exist.
        for col in ("document_type", "metadata"):
            if col not in df_ledger.columns:
                raise ValueError(f"Missing required column '{col}'.")

        # Select image rows; nothing to do when there are none.
        image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
        if not image_mask.any():
            return df_ledger[~image_mask]

        df_images = df_ledger.loc[image_mask].copy()
        hash_algorithm = task_config.get("hash_algorithm", "md5")

        # Hash each image's content and keep only the first occurrence.
        df_images["_image_content_hash"] = df_images["metadata"].apply(_hash_content, args=(hash_algorithm,))
        deduped_indices = df_images.drop_duplicates(subset="_image_content_hash").index

        # FIX: select the original ledger columns directly. The previous
        # df_ledger.columns.difference(["_image_content_hash"]) returned a
        # *sorted* Index, silently alphabetizing the output columns.
        deduped_images = df_images.loc[deduped_indices, df_ledger.columns]
        non_image_rows = df_ledger.loc[~image_mask]

        return pd.concat([deduped_images, non_image_rows], axis=0)
    except Exception as e:
        msg = f"deduplicate_images_internal: Error applying deduplication filter: {e}"
        logger.error(msg, exc_info=True)
        raise type(e)(msg) from e
@@ -1,133 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- from typing import Dict, Optional, List, Any
7
-
8
- import pandas as pd
9
-
10
- from nv_ingest_api.internal.enums.common import TaskTypeEnum
11
- from nv_ingest_api.internal.schemas.meta.metadata_schema import (
12
- ContentTypeEnum,
13
- InfoMessageMetadataSchema,
14
- StatusEnum,
15
- )
16
- from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
17
- from nv_ingest_api.util.schema.schema_validator import validate_schema
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- def _add_info_message(x, info_msg):
23
- x["info_message_metadata"] = info_msg
24
-
25
- return x
26
-
27
-
28
- def _calculate_average_image_size(x):
29
- return (x["image_metadata"]["width"] + x["image_metadata"]["height"]) / 2
30
-
31
-
32
- def _calculate_aspect_ratio(x):
33
- return x["image_metadata"]["width"] / max(x["image_metadata"]["height"], 1e-9)
34
-
35
-
36
- def filter_images_internal(
37
- df_ledger: pd.DataFrame,
38
- task_config: Dict[str, Any],
39
- mutate_config: ImageFilterSchema = ImageFilterSchema(),
40
- execution_trace_log: Optional[List[Any]] = None,
41
- ) -> pd.DataFrame:
42
- """
43
- Apply an image filtering operation to a DataFrame based on average image size and aspect ratio.
44
-
45
- Parameters
46
- ----------
47
- df_ledger : pd.DataFrame
48
- DataFrame to be filtered. Must contain 'document_type' and 'metadata' columns.
49
- task_config : dict
50
- Dictionary with the following keys:
51
- - "min_size": Minimum average image size threshold.
52
- - "max_aspect_ratio": Maximum allowed aspect ratio.
53
- - "min_aspect_ratio": Minimum allowed aspect ratio.
54
- - "filter": If True, rows failing the criteria are dropped; if False, they are flagged.
55
- mutate_config : ImageFilterSchema
56
- execution_trace_log : Optional[List[Any]], optional
57
-
58
- Returns
59
- -------
60
- pd.DataFrame
61
- The updated DataFrame after applying the image filter.
62
-
63
- Raises
64
- ------
65
- ValueError
66
- If required columns are missing or if parameters are invalid.
67
- Exception
68
- For other errors encountered during filtering.
69
- """
70
-
71
- _ = mutate_config # Unused variable
72
- _ = execution_trace_log # TODO(Devin)
73
-
74
- try:
75
- required_columns = {"document_type", "metadata"}
76
- if not required_columns.issubset(df_ledger.columns):
77
- raise ValueError(f"DataFrame must contain columns: {required_columns}")
78
-
79
- min_size = task_config.get("min_size")
80
- max_aspect_ratio = task_config.get("max_aspect_ratio")
81
- min_aspect_ratio = task_config.get("min_aspect_ratio")
82
- filter_flag = task_config.get("filter", True)
83
-
84
- if not isinstance(min_size, (int, float)) or min_size < 0:
85
- raise ValueError("min_size must be a non-negative number")
86
- if not isinstance(max_aspect_ratio, (int, float)) or max_aspect_ratio <= 0:
87
- raise ValueError("max_aspect_ratio must be a positive number")
88
- if not isinstance(min_aspect_ratio, (int, float)) or min_aspect_ratio <= 0:
89
- raise ValueError("min_aspect_ratio must be a positive number")
90
- if min_aspect_ratio > max_aspect_ratio:
91
- raise ValueError("min_aspect_ratio cannot be greater than max_aspect_ratio")
92
-
93
- image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
94
- if not image_mask.any():
95
- return df_ledger.copy()
96
-
97
- df_image = df_ledger.loc[image_mask].copy()
98
- avg_size = df_image["metadata"].apply(_calculate_average_image_size)
99
- avg_size_mask = avg_size > min_size
100
-
101
- aspect_ratio = df_image["metadata"].apply(_calculate_aspect_ratio)
102
- min_aspect_ratio_mask = aspect_ratio > min_aspect_ratio
103
- max_aspect_ratio_mask = aspect_ratio < max_aspect_ratio
104
-
105
- valid_mask = avg_size_mask & min_aspect_ratio_mask & max_aspect_ratio_mask
106
- image_filter_mask = ~valid_mask
107
-
108
- if image_filter_mask.any():
109
- filtered_df = df_image.loc[image_filter_mask].copy()
110
- if filter_flag:
111
- df_ledger.drop(labels=filtered_df.index, inplace=True)
112
- return df_ledger
113
-
114
- info_msg = {
115
- "task": TaskTypeEnum.FILTER.value,
116
- "status": StatusEnum.SUCCESS.value,
117
- "message": "Filtered due to image size or aspect ratio.",
118
- "filter": True,
119
- }
120
- validated_info_msg = validate_schema(info_msg, InfoMessageMetadataSchema).model_dump()
121
- filtered_df["info_message_metadata"] = [validated_info_msg] * filtered_df.shape[0]
122
- filtered_df["metadata"] = filtered_df["metadata"].apply(_add_info_message, args=(info_msg,))
123
- df_ledger.loc[filtered_df.index, "metadata"] = filtered_df["metadata"]
124
- df_ledger.loc[filtered_df.index, "document_type"] = ContentTypeEnum.INFO_MSG
125
-
126
- result, execution_trace_log = df_ledger, {}
127
-
128
- return result
129
-
130
- except Exception as e:
131
- err_msg = f"filter_images_internal: Error applying image filter. Original error: {e}"
132
- logger.error(err_msg, exc_info=True)
133
- raise type(e)(err_msg) from e
File without changes
@@ -1,8 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- from .nim_client import NimClient
6
- from .nim_model_interface import ModelInterface
7
-
8
- __all__ = ["NimClient", "ModelInterface"]
@@ -1,15 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- # Copyright (c) 2024, NVIDIA CORPORATION.
6
-
7
-
8
# Default tuning constants for the YOLOX model interface.
# NOTE(review): semantics below are inferred from the constant names --
# confirm against the yolox model interface before relying on them.
YOLOX_MAX_BATCH_SIZE = 8  # presumably max images per inference batch
YOLOX_MAX_WIDTH = 1536  # presumably max input image width, in pixels
YOLOX_MAX_HEIGHT = 1536  # presumably max input image height, in pixels
YOLOX_NUM_CLASSES = 3  # number of detection classes the model is configured for
YOLOX_CONF_THRESHOLD = 0.01  # presumably minimum raw confidence retained
YOLOX_IOU_THRESHOLD = 0.5  # presumably IoU threshold for suppression of overlaps
YOLOX_MIN_SCORE = 0.1  # presumably lower score cutoff for candidate detections
YOLOX_FINAL_SCORE = 0.48  # presumably final acceptance score threshold
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0