nv-ingest 2025.12.2.dev20251202__py3-none-any.whl → 2025.12.4.dev20251204__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,9 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ import os
6
7
  from typing import Dict, Any, Optional
8
+ from urllib.parse import urlparse
7
9
 
8
10
  import pandas as pd
9
11
  import ray
@@ -26,7 +28,8 @@ logger = logging.getLogger(__name__)
26
28
  @ray.remote
27
29
  class ImageStorageStage(RayActorStage):
28
30
  """
29
- A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
31
+ A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
32
+ metadata with storage URLs.
30
33
 
31
34
  This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
32
35
  payload and updates the control message accordingly.
@@ -69,8 +72,16 @@ class ImageStorageStage(RayActorStage):
69
72
  task_config = remove_task_by_type(control_message, "store")
70
73
  # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
71
74
 
72
- store_structured: bool = task_config.get("structured", True)
73
- store_unstructured: bool = task_config.get("images", False)
75
+ stage_defaults = {
76
+ "structured": self.validated_config.structured,
77
+ "images": self.validated_config.images,
78
+ "storage_uri": self.validated_config.storage_uri,
79
+ "storage_options": self.validated_config.storage_options,
80
+ "public_base_url": self.validated_config.public_base_url,
81
+ }
82
+
83
+ store_structured: bool = task_config.get("structured", stage_defaults["structured"])
84
+ store_unstructured: bool = task_config.get("images", stage_defaults["images"])
74
85
 
75
86
  content_types: Dict[Any, Any] = {}
76
87
  if store_structured:
@@ -80,14 +91,34 @@ class ImageStorageStage(RayActorStage):
80
91
  content_types[ContentTypeEnum.IMAGE] = store_unstructured
81
92
 
82
93
  params: Dict[str, Any] = task_config.get("params", {})
83
- params["content_types"] = content_types
84
94
 
85
- logger.debug(f"Processing storage task with parameters: {params}")
95
+ storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
96
+ storage_options = {
97
+ **(stage_defaults["storage_options"] or {}),
98
+ **(task_config.get("storage_options") or {}),
99
+ **params.get("storage_options", {}),
100
+ }
101
+ if "public_base_url" in task_config:
102
+ public_base_url = task_config["public_base_url"]
103
+ else:
104
+ public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
105
+
106
+ storage_options = self._inject_storage_defaults(storage_uri, storage_options)
107
+
108
+ storage_params: Dict[str, Any] = {
109
+ "content_types": content_types,
110
+ "storage_uri": storage_uri,
111
+ "storage_options": storage_options,
112
+ }
113
+ if public_base_url:
114
+ storage_params["public_base_url"] = public_base_url
115
+
116
+ logger.debug("Processing storage task with parameters: %s", storage_params)
86
117
 
87
118
  # Store images or structured content.
88
119
  df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
89
120
  df_storage_ledger=df_payload,
90
- task_config=params,
121
+ task_config=storage_params,
91
122
  storage_config={},
92
123
  execution_trace_log=None,
93
124
  )
@@ -98,3 +129,38 @@ class ImageStorageStage(RayActorStage):
98
129
  control_message.payload(df_storage_ledger)
99
130
 
100
131
  return control_message
132
+
133
@staticmethod
def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
    """
    Populate storage options for S3-style backends (e.g., MinIO) using environment defaults.

    Explicitly supplied options always win; environment variables only fill in
    values that are absent. Options whose value is ``None`` are treated as
    "not provided" and are removed before defaults are applied.

    Parameters
    ----------
    storage_uri : str
        Destination URI. Defaults are only injected when its scheme is one of
        ``s3``, ``s3a`` or ``s3n``; for any other (or missing) scheme the
        filtered options are returned unchanged.
    storage_options : Dict[str, Any]
        Caller-provided backend options. This mapping is never mutated.

    Returns
    -------
    Dict[str, Any]
        A new dictionary of options. For S3-style URIs it may additionally contain:

        - ``key`` / ``secret`` / ``token`` from ``MINIO_ACCESS_KEY`` /
          ``MINIO_SECRET_KEY`` / ``MINIO_SESSION_TOKEN`` (non-empty values only).
        - ``client_kwargs["endpoint_url"]`` from ``MINIO_INTERNAL_ADDRESS``
          (``http://`` is prepended when no scheme is present), falling back to
          ``http://minio:9000``.
        - ``client_kwargs["region_name"]`` from ``MINIO_REGION`` when set.
    """
    # Drop explicit ``None`` values so they can be defaulted from the environment below.
    merged_options: Dict[str, Any] = {k: v for k, v in (storage_options or {}).items() if v is not None}

    parsed_scheme = urlparse(storage_uri).scheme.lower() if storage_uri else ""
    if parsed_scheme not in {"s3", "s3a", "s3n"}:
        return merged_options

    # Only non-empty environment values are used: an empty variable is treated the
    # same as an unset one so a blank credential is never injected. This matches
    # how the session token, endpoint and region are handled.
    credential_sources = (
        ("key", "MINIO_ACCESS_KEY"),
        ("secret", "MINIO_SECRET_KEY"),
        ("token", "MINIO_SESSION_TOKEN"),
    )
    for option_key, env_var in credential_sources:
        env_value = os.environ.get(env_var)
        if env_value and option_key not in merged_options:
            merged_options[option_key] = env_value

    # Copy so the caller's nested ``client_kwargs`` mapping is not modified.
    client_kwargs = dict(merged_options.get("client_kwargs", {}))

    endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS") or "http://minio:9000"
    if not endpoint.startswith(("http://", "https://")):
        # A bare ``host:port`` address is given an explicit ``http://`` scheme.
        endpoint = f"http://{endpoint}"
    client_kwargs.setdefault("endpoint_url", endpoint)

    region = os.environ.get("MINIO_REGION")
    if region:
        client_kwargs.setdefault("region_name", region)

    # ``client_kwargs`` always holds at least ``endpoint_url`` at this point.
    merged_options["client_kwargs"] = client_kwargs

    return merged_options
@@ -372,6 +372,9 @@ stages:
372
372
  type: "stage"
373
373
  phase: 5 # RESPONSE
374
374
  actor: "nv_ingest.framework.orchestration.ray.stages.storage.image_storage:ImageStorageStage"
375
+ config:
376
+ storage_uri: $IMAGE_STORAGE_URI|"s3://nv-ingest/artifacts/store/images"
377
+ public_base_url: $IMAGE_STORAGE_PUBLIC_BASE_URL|""
375
378
  replicas:
376
379
  min_replicas: 0
377
380
  max_replicas:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.12.2.dev20251202
3
+ Version: 2025.12.4.dev20251204
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -219,6 +219,8 @@ Requires-Dist: diskcache>=5.6.3
219
219
  Requires-Dist: fastapi>=0.115.6
220
220
  Requires-Dist: fastparquet>=2024.11.0
221
221
  Requires-Dist: fsspec>=2024.10.0
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: s3fs>=2024.10.0
222
224
  Requires-Dist: gunicorn
223
225
  Requires-Dist: h11>=0.16.0
224
226
  Requires-Dist: httpx>=0.28.1
@@ -64,7 +64,7 @@ nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py,s
64
64
  nv_ingest/framework/orchestration/ray/stages/sources/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
65
65
  nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=LrqaWpWyuiAHlpXWKYSyHZJBFegGXfNlpCXrucbK5NM,24067
66
66
  nv_ingest/framework/orchestration/ray/stages/storage/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
67
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=WZN_-3Li-izDaPtk8IMrtn2os1ckT3U8Rb2PsfOWrcI,4009
67
+ nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=f1iA7rjYFA1G1EXqFM6URUi_QRql1Y1OrnMPKONsSqo,6907
68
68
  nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py,sha256=EUtwhSDf-qGLVEhWEInr1VaLsvpcHUSyzCmHQVai-Ps,3547
69
69
  nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
70
70
  nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py,sha256=jEtEUibqs6IS6QakrzWY9zmxSUzuBpg_hzXy2R-I10Y,2870
@@ -112,14 +112,14 @@ nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusX
112
112
  nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
113
113
  nv_ingest/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
114
114
  nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=yNJtjfHQyxtasGa1hQrvgX7UrPa7BAd0oog8EIN8Y_w,15592
115
- nv_ingest/pipeline/default_pipeline_impl.py,sha256=DhClC17lWUvtBIi2mCC4WkLWT0lxY-CFY0n6nriAxas,16017
115
+ nv_ingest/pipeline/default_pipeline_impl.py,sha256=ID4XGTfdppLnlWk2aFdL-z4xOa-x2ZTuFuvWAjCN6LU,16164
116
116
  nv_ingest/pipeline/ingest_pipeline.py,sha256=wHAJhqAM2s8nbY-8itVogmSU-yVN4PZONGWcKnhzgfg,17794
117
117
  nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3ZEMkkoBcg,17940
118
118
  nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
119
119
  nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
120
120
  nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
121
- nv_ingest-2025.12.2.dev20251202.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
122
- nv_ingest-2025.12.2.dev20251202.dist-info/METADATA,sha256=1NlilI2PWLdUJh3yhNQ3SNPiEovED1G1KNcJIut7mac,15091
123
- nv_ingest-2025.12.2.dev20251202.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
124
- nv_ingest-2025.12.2.dev20251202.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
125
- nv_ingest-2025.12.2.dev20251202.dist-info/RECORD,,
121
+ nv_ingest-2025.12.4.dev20251204.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
122
+ nv_ingest-2025.12.4.dev20251204.dist-info/METADATA,sha256=JOBABH9qylbBggE403iK7KUZCvv4k2ItS7zJeJKB6e4,15162
123
+ nv_ingest-2025.12.4.dev20251204.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
124
+ nv_ingest-2025.12.4.dev20251204.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
125
+ nv_ingest-2025.12.4.dev20251204.dist-info/RECORD,,