nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,154 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Union, Dict
7
+
8
+ import pandas as pd
9
+
10
+ from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal
11
+ from nv_ingest_api.internal.mutate.filter import filter_images_internal
12
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
13
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
14
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@unified_exception_handler
def filter_images(
    *,
    df_ledger: pd.DataFrame,
    min_size: int = 128,
    max_aspect_ratio: Union[float, int] = 5.0,
    min_aspect_ratio: Union[float, int] = 2.0,
) -> pd.DataFrame:
    """
    Apply an image filter to the ledger DataFrame based on size and aspect ratio criteria.

    This function builds a set of task parameters and delegates the filtering work to
    `filter_images_internal`, passing a default-constructed `ImageFilterSchema` as the
    stage configuration. No execution trace log is collected (``None`` is forwarded).
    Exceptions raised during filtering pass through the `unified_exception_handler`
    decorator applied to this function.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        DataFrame containing image metadata. It is expected to include the columns
        'document_type' and 'metadata'.
    min_size : int, optional
        Minimum average image size threshold. Images with an average size less than or equal to this
        value are considered for filtering. Default is 128.
    max_aspect_ratio : float or int, optional
        Maximum allowed image aspect ratio. Images with an aspect ratio greater than or equal to this value
        are considered for filtering. Default is 5.0.
    min_aspect_ratio : float or int, optional
        Minimum allowed image aspect ratio. Images with an aspect ratio less than or equal to this value
        are considered for filtering. Default is 2.0.

    Returns
    -------
    pd.DataFrame
        The value returned by `filter_images_internal`, i.e. the DataFrame after applying the image filter.

    Raises
    ------
    Exception
        If an error occurs during the filtering process.
    """
    # NOTE(review): with the documented "<= min_aspect_ratio" / ">= max_aspect_ratio" semantics,
    # a default min_aspect_ratio of 2.0 would filter square images (ratio 1.0). Confirm that 2.0
    # is intended against filter_images_internal before relying on the defaults.
    task_params: Dict[str, Union[int, float, bool]] = {
        "min_size": min_size,
        "max_aspect_ratio": max_aspect_ratio,
        "min_aspect_ratio": min_aspect_ratio,
        # Always request actual filtering from this entry point.
        "filter": True,
    }
    mutate_config = ImageFilterSchema()

    result = filter_images_internal(df_ledger, task_params, mutate_config=mutate_config, execution_trace_log=None)

    return result
71
+
72
+
73
@unified_exception_handler
def deduplicate_images(
    *,
    df_ledger: pd.DataFrame,
    hash_algorithm: str = "md5",
) -> pd.DataFrame:
    """
    Remove duplicate images from the ledger by delegating to ``deduplicate_images_internal``.

    The requested hashing algorithm is wrapped in a one-entry task configuration
    (``{"hash_algorithm": ...}``) and handed to the internal routine together with a
    default-constructed ``ImageDedupSchema``. No execution trace log is collected.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        Ledger of image rows to deduplicate. It is expected to carry at least:
            - ``document_type``: the document type as a string (e.g., "png").
            - ``metadata``: a dictionary of image-related metadata, such as ``content``
              (base64-encoded image data), ``source_metadata`` and ``content_metadata``.
    hash_algorithm : str, optional
        Name of the hashing algorithm forwarded to the internal routine (e.g., "md5",
        "sha1", "sha256"). Presumably any name accepted by ``hashlib.new()`` is valid;
        confirm against ``deduplicate_images_internal``. Default is "md5".

    Returns
    -------
    pd.DataFrame
        Whatever ``deduplicate_images_internal`` returns: the ledger with duplicate
        image rows removed.

    Raises
    ------
    Exception
        Any exception raised during deduplication, after passing through the
        ``unified_exception_handler`` decorator.

    Examples
    --------
    >>> import pandas as pd
    >>> rows = [
    ...     {
    ...         "content": "<base64-encoded-image-1>",
    ...         "source_metadata": {"source_id": "image1.png", "source_name": "image1.png", "source_type": "png"},
    ...         "content_metadata": {"type": "image"},
    ...         "image_metadata": {},
    ...     },
    ...     {
    ...         "content": "<base64-encoded-image-2>",
    ...         "source_metadata": {"source_id": "image2.png", "source_name": "image2.png", "source_type": "png"},
    ...         "content_metadata": {"type": "image"},
    ...         "image_metadata": {},
    ...     },
    ... ]
    >>> df = pd.DataFrame({"document_type": ["png", "png"], "metadata": rows})
    >>> dedup_df = deduplicate_images(df_ledger=df, hash_algorithm="md5")
    """
    dedup_schema: ImageDedupSchema = ImageDedupSchema()
    dedup_task: Dict[str, Union[int, float, bool, str]] = {"hash_algorithm": hash_algorithm}

    return deduplicate_images_internal(
        df_ledger=df_ledger,
        task_config=dedup_task,
        mutate_config=dedup_schema,
        execution_trace_log=None,
    )
@@ -0,0 +1,218 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from typing import Dict, Any, Optional
6
+
7
+ import pandas as pd
8
+
9
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
10
+ from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
11
+ from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings_internal
12
+ from nv_ingest_api.internal.store.image_upload import store_images_to_minio_internal
13
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
14
+
15
+
16
@unified_exception_handler
def store_embeddings(
    *,
    df_ledger: pd.DataFrame,
    milvus_address: Optional[str] = None,
    milvus_uri: Optional[str] = None,
    milvus_host: Optional[str] = None,
    milvus_port: Optional[int] = None,
    milvus_collection_name: Optional[str] = None,
    minio_access_key: Optional[str] = None,
    minio_secret_key: Optional[str] = None,
    minio_session_token: Optional[str] = None,
    minio_endpoint: Optional[str] = None,
    minio_bucket_name: Optional[str] = None,
    minio_bucket_path: Optional[str] = None,
    minio_secure: Optional[bool] = None,
    minio_region: Optional[str] = None,
) -> pd.DataFrame:
    """
    Store embeddings by assembling task parameters and calling ``store_text_embeddings_internal``.

    Every Milvus/MinIO argument that is left as ``None`` is dropped before the task
    configuration is built, so only explicitly supplied values are forwarded (under the
    ``"params"`` key). The intent is that omitted settings fall back to the defaults of
    the storage layer. A default-constructed ``EmbeddingStorageSchema`` is used as the
    store configuration and no execution trace log is collected.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        DataFrame containing the data whose embeddings need to be stored.
    milvus_address : Optional[str], default=None
        The address of the Milvus service.
    milvus_uri : Optional[str], default=None
        The URI for the Milvus service.
    milvus_host : Optional[str], default=None
        The host for the Milvus service.
    milvus_port : Optional[int], default=None
        The port for the Milvus service.
    milvus_collection_name : Optional[str], default=None
        The name of the Milvus collection.
    minio_access_key : Optional[str], default=None
        The access key for MinIO.
    minio_secret_key : Optional[str], default=None
        The secret key for MinIO.
    minio_session_token : Optional[str], default=None
        The session token for MinIO.
    minio_endpoint : Optional[str], default=None
        The endpoint URL for MinIO.
    minio_bucket_name : Optional[str], default=None
        The name of the MinIO bucket.
    minio_bucket_path : Optional[str], default=None
        The bucket path where embeddings will be stored.
    minio_secure : Optional[bool], default=None
        Whether to use a secure connection to MinIO.
    minio_region : Optional[str], default=None
        The region of the MinIO service.

    Returns
    -------
    pd.DataFrame
        The first element of the pair returned by ``store_text_embeddings_internal``,
        i.e. the updated DataFrame after embeddings have been stored.

    Raises
    ------
    Exception
        Any exception raised during the storage process, after passing through the
        ``unified_exception_handler`` decorator.
    """
    candidate_params: Dict[str, Any] = dict(
        milvus_address=milvus_address,
        milvus_collection_name=milvus_collection_name,
        milvus_host=milvus_host,
        milvus_port=milvus_port,
        milvus_uri=milvus_uri,
        minio_access_key=minio_access_key,
        minio_bucket_name=minio_bucket_name,
        minio_bucket_path=minio_bucket_path,
        minio_endpoint=minio_endpoint,
        minio_region=minio_region,
        minio_secret_key=minio_secret_key,
        minio_secure=minio_secure,
        minio_session_token=minio_session_token,
    )

    # Keep only what the caller actually supplied; falsy-but-set values (False, 0, "") survive.
    supplied_params: Dict[str, Any] = {}
    for name, value in candidate_params.items():
        if value is None:
            continue
        supplied_params[name] = value

    # The internal routine returns a pair; only its first element is exposed to callers.
    stored_df, _ = store_text_embeddings_internal(
        df_ledger,
        task_config={"params": supplied_params},
        store_config=EmbeddingStorageSchema(),
        execution_trace_log=None,
    )
    return stored_df
110
+
111
+
112
@unified_exception_handler
def store_images_to_minio(
    *,
    df_ledger: pd.DataFrame,
    store_structured: bool = True,
    store_unstructured: bool = False,
    minio_access_key: Optional[str] = None,
    minio_bucket_name: Optional[str] = None,
    minio_endpoint: Optional[str] = None,
    minio_region: Optional[str] = None,
    minio_secret_key: Optional[str] = None,
    minio_secure: bool = False,
    minio_session_token: Optional[str] = None,
) -> pd.DataFrame:
    """
    Store images to a Minio storage backend via ``store_images_to_minio_internal``.

    A flat task configuration is assembled from the Minio connection arguments plus a
    ``content_types`` mapping that switches storage on or off per content type
    (``ContentTypeEnum.STRUCTURED`` follows ``store_structured``, ``ContentTypeEnum.IMAGE``
    follows ``store_unstructured``). The storage configuration passed alongside it is an
    empty dictionary and no execution trace log is collected.

    Parameters
    ----------
    df_ledger : pd.DataFrame
        DataFrame containing ledger information with document metadata.
    store_structured : bool, optional
        Flag indicating whether to store structured content. Defaults to True.
    store_unstructured : bool, optional
        Flag indicating whether to store unstructured image content. Defaults to False.
    minio_access_key : Optional[str], optional
        Access key for authenticating with Minio. Defaults to None.
    minio_bucket_name : Optional[str], optional
        Name of the Minio bucket where images will be stored. Defaults to None.
    minio_endpoint : Optional[str], optional
        Endpoint URL for the Minio service. Defaults to None.
    minio_region : Optional[str], optional
        Region identifier for the Minio service. Defaults to None.
    minio_secret_key : Optional[str], optional
        Secret key for authenticating with Minio. Defaults to None.
    minio_secure : bool, optional
        Whether to use a secure connection (HTTPS) with Minio. Defaults to False.
    minio_session_token : Optional[str], optional
        Session token for temporary credentials with Minio. Defaults to None.

    Returns
    -------
    pd.DataFrame
        The first element of the pair returned by ``store_images_to_minio_internal``,
        i.e. the ledger after the upload step.

    Raises
    ------
    Exception
        Any exception raised during the image storage process, after passing through the
        ``unified_exception_handler`` decorator.

    See Also
    --------
    store_images_to_minio_internal : Internal function that performs the actual image storage.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'document_type': ['IMAGE'],
    ...     'metadata': [{
    ...         'source_metadata': {'source_id': '123'},
    ...         'image_metadata': {'image_type': 'png'},
    ...         'content': 'base64_encoded_content'
    ...     }]
    ... })
    >>> result = store_images_to_minio(
    ...     df_ledger=df,
    ...     minio_access_key='ACCESS_KEY',
    ...     minio_secret_key='SECRET_KEY',
    ...     minio_bucket_name='mybucket'
    ... )
    """
    # Per-content-type switches consumed by the internal routine.
    enabled_by_type = {
        ContentTypeEnum.STRUCTURED: store_structured,
        ContentTypeEnum.IMAGE: store_unstructured,
    }

    # NOTE(review): unlike store_embeddings, None-valued settings are forwarded as-is here;
    # confirm the internal routine tolerates explicit None entries.
    minio_task: Dict[str, Any] = {
        "access_key": minio_access_key,
        "bucket_name": minio_bucket_name,
        "content_types": enabled_by_type,
        "endpoint": minio_endpoint,
        "region": minio_region,
        "secret_key": minio_secret_key,
        "secure": minio_secure,
        "session_token": minio_session_token,
    }

    # The internal routine returns a pair; only its first element is exposed to callers.
    uploaded_df, _ = store_images_to_minio_internal(
        df_storage_ledger=df_ledger,
        task_config=minio_task,
        storage_config={},
        execution_trace_log=None,
    )
    return uploaded_df