nv-ingest 2025.6.2.dev20250602__tar.gz → 2025.7.7.dev20250707__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/PKG-INFO +4 -4
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +4 -4
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +2 -2
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +33 -5
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +1 -1
- nv_ingest-2025.7.7.dev20250707/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +161 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -6
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +1 -10
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +1 -1
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +47 -2
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +3 -3
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +7 -3
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +7 -9
- nv_ingest-2025.7.7.dev20250707/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +203 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/version.py +0 -8
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest.egg-info/PKG-INFO +4 -4
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest.egg-info/SOURCES.txt +1 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest.egg-info/requires.txt +3 -3
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/pyproject.toml +3 -3
- nv_ingest-2025.6.2.dev20250602/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -97
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/LICENSE +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/MANIFEST.in +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/api/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/api/main.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/api/v1/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/api/v1/metrics.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/service/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.7.7.dev20250707}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest
|
|
3
|
-
Version: 2025.6.2.dev20250602
|
|
3
|
+
Version: 2025.7.7.dev20250707
|
|
4
4
|
Summary: Python module for multimodal document ingestion
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -240,13 +240,13 @@ Requires-Dist: python-docx>=1.1.2
|
|
|
240
240
|
Requires-Dist: python-dotenv>=1.0.1
|
|
241
241
|
Requires-Dist: python-pptx>=1.0.2
|
|
242
242
|
Requires-Dist: prometheus-client
|
|
243
|
-
Requires-Dist: torch
|
|
243
|
+
Requires-Dist: torch>=2.4.1
|
|
244
244
|
Requires-Dist: ray[all]>=2.37.0
|
|
245
245
|
Requires-Dist: redis>=5.2.1
|
|
246
246
|
Requires-Dist: requests>=2.28.2
|
|
247
247
|
Requires-Dist: scikit-learn>=1.6.0
|
|
248
248
|
Requires-Dist: scipy>=1.15.1
|
|
249
|
-
Requires-Dist: setuptools>=
|
|
249
|
+
Requires-Dist: setuptools>=78.1.1
|
|
250
250
|
Requires-Dist: tabulate>=0.9.0
|
|
251
251
|
Requires-Dist: torchvision
|
|
252
252
|
Requires-Dist: torchaudio
|
|
@@ -259,7 +259,7 @@ Requires-Dist: opencv-python
|
|
|
259
259
|
Requires-Dist: pymilvus>=2.5.10
|
|
260
260
|
Requires-Dist: pymilvus[bulk_writer,model]
|
|
261
261
|
Requires-Dist: tritonclient
|
|
262
|
-
Requires-Dist: nvidia-riva-client
|
|
262
|
+
Requires-Dist: nvidia-riva-client==2.20.0
|
|
263
263
|
Requires-Dist: unstructured-client
|
|
264
264
|
Requires-Dist: markitdown
|
|
265
265
|
Dynamic: license-file
|
|
@@ -63,7 +63,7 @@ def get_nim_service(env_var_prefix):
|
|
|
63
63
|
"",
|
|
64
64
|
)
|
|
65
65
|
auth_token = os.environ.get(
|
|
66
|
-
"
|
|
66
|
+
"NVIDIA_API_KEY",
|
|
67
67
|
"",
|
|
68
68
|
) or os.environ.get(
|
|
69
69
|
"NGC_API_KEY",
|
|
@@ -151,11 +151,11 @@ if __name__ == "__main__":
|
|
|
151
151
|
os.environ["PADDLE_INFER_PROTOCOL"] = "grpc"
|
|
152
152
|
os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
153
153
|
os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
154
|
-
os.environ["VLM_CAPTION_MODEL_NAME"] = "
|
|
154
|
+
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
|
|
155
155
|
logger.info("Environment variables set.")
|
|
156
156
|
|
|
157
157
|
image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
158
|
-
|
|
158
|
+
model_name = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
|
|
159
159
|
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
160
160
|
(
|
|
161
161
|
yolox_table_structure_grpc,
|
|
@@ -228,7 +228,7 @@ if __name__ == "__main__":
|
|
|
228
228
|
image_caption_config = {
|
|
229
229
|
"api_key": yolox_auth,
|
|
230
230
|
"endpoint_url": image_caption_endpoint_url,
|
|
231
|
-
"
|
|
231
|
+
"model_name": model_name,
|
|
232
232
|
"prompt": "Caption the content of this image:",
|
|
233
233
|
}
|
|
234
234
|
logger.info("Service configuration retrieved from get_nim_service and environment variables.")
|
|
@@ -555,7 +555,7 @@ class PipelineTopology:
|
|
|
555
555
|
return None
|
|
556
556
|
|
|
557
557
|
def get_connections(self) -> Dict[str, List[Tuple[str, int]]]:
|
|
558
|
-
"""Returns a shallow copy of the
|
|
558
|
+
"""Returns a shallow copy of the connection dictionary."""
|
|
559
559
|
with self._lock:
|
|
560
560
|
# Shallow copy is usually sufficient here as tuples are immutable
|
|
561
561
|
return self._connections.copy()
|
|
@@ -571,7 +571,7 @@ class PipelineTopology:
|
|
|
571
571
|
return len(self._stage_actors.get(stage_name, []))
|
|
572
572
|
|
|
573
573
|
def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
|
|
574
|
-
"""Returns a shallow copy of the edge queues dictionary."""
|
|
574
|
+
"""Returns a shallow copy of the edge queues' dictionary."""
|
|
575
575
|
with self._lock:
|
|
576
576
|
return self._edge_queues.copy()
|
|
577
577
|
|
|
@@ -9,6 +9,7 @@ import threading
|
|
|
9
9
|
from abc import ABC, abstractmethod
|
|
10
10
|
from collections import defaultdict
|
|
11
11
|
from dataclasses import dataclass
|
|
12
|
+
from types import FunctionType
|
|
12
13
|
|
|
13
14
|
import psutil
|
|
14
15
|
import uuid
|
|
@@ -24,6 +25,9 @@ import time
|
|
|
24
25
|
from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
|
|
25
26
|
from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
|
|
26
27
|
from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
|
|
28
|
+
from nv_ingest.framework.orchestration.ray.util.pipeline.tools import wrap_callable_as_stage
|
|
29
|
+
from nv_ingest_api.util.imports.callable_signatures import ingest_stage_callable_signature
|
|
30
|
+
from nv_ingest_api.util.imports.dynamic_resolvers import resolve_callable_from_path
|
|
27
31
|
|
|
28
32
|
logger = logging.getLogger(__name__)
|
|
29
33
|
|
|
@@ -43,7 +47,7 @@ class PipelineInterface(ABC):
|
|
|
43
47
|
Parameters
|
|
44
48
|
----------
|
|
45
49
|
monitor_poll_interval : float
|
|
46
|
-
Interval in seconds for monitoring poll (default: 5.0).
|
|
50
|
+
Interval in seconds for the monitoring poll (default: 5.0).
|
|
47
51
|
scaling_poll_interval : float
|
|
48
52
|
Interval in seconds for scaling decisions (default: 30.0).
|
|
49
53
|
"""
|
|
@@ -270,7 +274,7 @@ class RayPipeline(PipelineInterface):
|
|
|
270
274
|
|
|
271
275
|
logger.info("RayStatsCollector initialized using StatsConfig.")
|
|
272
276
|
|
|
273
|
-
# --- Accessor Methods for
|
|
277
|
+
# --- Accessor Methods for Stat Collector (and internal use) ---
|
|
274
278
|
|
|
275
279
|
def __del__(self):
|
|
276
280
|
try:
|
|
@@ -428,15 +432,39 @@ class RayPipeline(PipelineInterface):
|
|
|
428
432
|
return self
|
|
429
433
|
|
|
430
434
|
def add_stage(
|
|
431
|
-
self,
|
|
435
|
+
self,
|
|
436
|
+
*,
|
|
437
|
+
name: str,
|
|
438
|
+
stage_actor: Any,
|
|
439
|
+
config: BaseModel,
|
|
440
|
+
min_replicas: int = 0,
|
|
441
|
+
max_replicas: int = 1,
|
|
432
442
|
) -> "RayPipeline":
|
|
433
443
|
if min_replicas < 0:
|
|
434
444
|
logger.warning(f"Stage '{name}': min_replicas cannot be negative. Overriding to 0.")
|
|
435
445
|
min_replicas = 0
|
|
446
|
+
|
|
447
|
+
resolved_actor = stage_actor
|
|
448
|
+
|
|
449
|
+
# Support module path (e.g., "mypkg.mymodule:my_lambda")
|
|
450
|
+
if isinstance(stage_actor, str):
|
|
451
|
+
resolved_actor = resolve_callable_from_path(
|
|
452
|
+
callable_path=stage_actor, signature_schema=ingest_stage_callable_signature
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
# Wrap callables
|
|
456
|
+
if isinstance(resolved_actor, FunctionType):
|
|
457
|
+
schema_type = type(config)
|
|
458
|
+
resolved_actor = wrap_callable_as_stage(resolved_actor, schema_type)
|
|
459
|
+
|
|
436
460
|
stage_info = StageInfo(
|
|
437
|
-
name=name,
|
|
461
|
+
name=name,
|
|
462
|
+
callable=resolved_actor,
|
|
463
|
+
config=config,
|
|
464
|
+
min_replicas=min_replicas,
|
|
465
|
+
max_replicas=max_replicas,
|
|
438
466
|
)
|
|
439
|
-
self.topology.add_stage(stage_info)
|
|
467
|
+
self.topology.add_stage(stage_info)
|
|
440
468
|
|
|
441
469
|
return self
|
|
442
470
|
|
|
@@ -40,7 +40,7 @@ class RayStatsCollector:
|
|
|
40
40
|
- `get_edge_queues() -> Dict[str, Tuple[Any, int]]`
|
|
41
41
|
These methods should return snapshots suitable for iteration.
|
|
42
42
|
interval : float, optional
|
|
43
|
-
The interval in seconds between
|
|
43
|
+
The interval in seconds between stat collection attempts, by default 5.0.
|
|
44
44
|
actor_timeout : float, optional
|
|
45
45
|
Timeout in seconds for waiting for stats from a single actor, by default 5.0.
|
|
46
46
|
queue_timeout : float, optional
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
import logging
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from typing import Any
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
import ray
|
|
11
|
+
|
|
12
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
13
|
+
from nv_ingest_api.internal.enums.common import (
|
|
14
|
+
DocumentTypeEnum,
|
|
15
|
+
ContentTypeEnum,
|
|
16
|
+
AccessLevelEnum,
|
|
17
|
+
TextTypeEnum,
|
|
18
|
+
LanguageEnum,
|
|
19
|
+
)
|
|
20
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
21
|
+
from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
|
|
22
|
+
from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
|
|
23
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
24
|
+
nv_ingest_node_failure_try_except,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# logging.basicConfig(level=logging.DEBUG)
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@ray.remote
|
|
32
|
+
class MetadataInjectionStage(RayActorStage):
|
|
33
|
+
"""
|
|
34
|
+
A Ray actor stage that performs metadata injection on IngestControlMessages.
|
|
35
|
+
|
|
36
|
+
This stage iterates over the rows of the DataFrame payload, checks if metadata
|
|
37
|
+
injection is required, and if so, injects the appropriate metadata.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
def __init__(self, config: BaseModel) -> None:
|
|
41
|
+
# Call the base initializer to set attributes like self._running.
|
|
42
|
+
super().__init__(config)
|
|
43
|
+
# Additional initialization can be added here if necessary.
|
|
44
|
+
logger.info("MetadataInjectionStage initialized with config: %s", config)
|
|
45
|
+
|
|
46
|
+
@traceable("metadata_injector")
|
|
47
|
+
@nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
|
|
48
|
+
def on_data(self, message: Any) -> Any:
|
|
49
|
+
"""
|
|
50
|
+
Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
|
|
51
|
+
|
|
52
|
+
Parameters
|
|
53
|
+
----------
|
|
54
|
+
message : IngestControlMessage
|
|
55
|
+
The incoming message containing the payload DataFrame.
|
|
56
|
+
|
|
57
|
+
Returns
|
|
58
|
+
-------
|
|
59
|
+
IngestControlMessage
|
|
60
|
+
The message with updated metadata if injection was required.
|
|
61
|
+
"""
|
|
62
|
+
df = message.payload()
|
|
63
|
+
update_required = False
|
|
64
|
+
rows = []
|
|
65
|
+
logger.info("Starting metadata injection on DataFrame with %d rows", len(df))
|
|
66
|
+
|
|
67
|
+
for _, row in df.iterrows():
|
|
68
|
+
try:
|
|
69
|
+
# Convert document type to content type using enums.
|
|
70
|
+
content_type = doc_type_to_content_type(DocumentTypeEnum(row["document_type"]))
|
|
71
|
+
# Check if metadata is missing or doesn't contain 'content'
|
|
72
|
+
if (
|
|
73
|
+
"metadata" not in row
|
|
74
|
+
or not isinstance(row["metadata"], dict)
|
|
75
|
+
or "content" not in row["metadata"].keys()
|
|
76
|
+
):
|
|
77
|
+
update_required = True
|
|
78
|
+
|
|
79
|
+
# Initialize default structures based on MetaDataSchema
|
|
80
|
+
default_source_metadata = {
|
|
81
|
+
"source_id": row.get("source_id"),
|
|
82
|
+
"source_name": row.get("source_name"),
|
|
83
|
+
"source_type": row["document_type"],
|
|
84
|
+
"source_location": "",
|
|
85
|
+
"collection_id": "",
|
|
86
|
+
"date_created": datetime.now().isoformat(),
|
|
87
|
+
"last_modified": datetime.now().isoformat(),
|
|
88
|
+
"summary": "",
|
|
89
|
+
"partition_id": -1,
|
|
90
|
+
"access_level": AccessLevelEnum.UNKNOWN.value,
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
default_content_metadata = {
|
|
94
|
+
"type": content_type.name.lower(),
|
|
95
|
+
"page_number": -1,
|
|
96
|
+
"description": "",
|
|
97
|
+
"hierarchy": ContentHierarchySchema().model_dump(),
|
|
98
|
+
"subtype": "",
|
|
99
|
+
"start_time": -1,
|
|
100
|
+
"end_time": -1,
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
default_audio_metadata = None
|
|
104
|
+
if content_type == ContentTypeEnum.AUDIO:
|
|
105
|
+
default_audio_metadata = {
|
|
106
|
+
"audio_type": row["document_type"],
|
|
107
|
+
"audio_transcript": "",
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
default_image_metadata = None
|
|
111
|
+
if content_type == ContentTypeEnum.IMAGE:
|
|
112
|
+
default_image_metadata = {
|
|
113
|
+
"image_type": row["document_type"],
|
|
114
|
+
"structured_image_type": ContentTypeEnum.NONE.value,
|
|
115
|
+
"caption": "",
|
|
116
|
+
"text": "",
|
|
117
|
+
"image_location": (0, 0, 0, 0),
|
|
118
|
+
"image_location_max_dimensions": (0, 0),
|
|
119
|
+
"uploaded_image_url": "",
|
|
120
|
+
"width": 0,
|
|
121
|
+
"height": 0,
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
default_text_metadata = None
|
|
125
|
+
if content_type == ContentTypeEnum.TEXT:
|
|
126
|
+
default_text_metadata = {
|
|
127
|
+
"text_type": TextTypeEnum.DOCUMENT.value,
|
|
128
|
+
"summary": "",
|
|
129
|
+
"keywords": "",
|
|
130
|
+
"language": LanguageEnum.UNKNOWN.value,
|
|
131
|
+
"text_location": (0, 0, 0, 0),
|
|
132
|
+
"text_location_max_dimensions": (0, 0, 0, 0),
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
row["metadata"] = {
|
|
136
|
+
"content": row["content"],
|
|
137
|
+
"content_metadata": default_content_metadata,
|
|
138
|
+
"error_metadata": None,
|
|
139
|
+
"audio_metadata": default_audio_metadata,
|
|
140
|
+
"image_metadata": default_image_metadata,
|
|
141
|
+
"source_metadata": default_source_metadata,
|
|
142
|
+
"text_metadata": default_text_metadata,
|
|
143
|
+
}
|
|
144
|
+
logger.info(
|
|
145
|
+
f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
|
|
146
|
+
f"Metadata keys: {list(row['metadata'].keys())}."
|
|
147
|
+
f"'content' present: {'content' in row['metadata']}"
|
|
148
|
+
)
|
|
149
|
+
except Exception as inner_e:
|
|
150
|
+
logger.exception("Failed to process row during metadata injection")
|
|
151
|
+
raise inner_e
|
|
152
|
+
rows.append(row)
|
|
153
|
+
|
|
154
|
+
if update_required:
|
|
155
|
+
docs = pd.DataFrame(rows)
|
|
156
|
+
message.payload(docs)
|
|
157
|
+
logger.info("Metadata injection updated payload with %d rows", len(docs))
|
|
158
|
+
else:
|
|
159
|
+
logger.info("No metadata update was necessary during metadata injection")
|
|
160
|
+
|
|
161
|
+
return message
|
|
@@ -23,16 +23,13 @@ class RayActorSourceStage(RayActorStage, ABC):
|
|
|
23
23
|
super().__init__(config, log_to_stdout=log_to_stdout)
|
|
24
24
|
self.paused = False
|
|
25
25
|
|
|
26
|
+
def on_data(self, IngestControlMessage):
|
|
27
|
+
return NotImplemented("Source stages do not implement on_data().")
|
|
28
|
+
|
|
26
29
|
@ray.method(num_returns=1)
|
|
27
30
|
def set_input_queue(self, queue_handle: Any) -> bool:
|
|
28
31
|
raise NotImplementedError("Source stages do not support an input queue.")
|
|
29
32
|
|
|
30
|
-
def get_input(self) -> Any:
|
|
31
|
-
"""
|
|
32
|
-
Source stages must implement get_input() to fetch control messages from an external source.
|
|
33
|
-
"""
|
|
34
|
-
pass
|
|
35
|
-
|
|
36
33
|
@abstractmethod
|
|
37
34
|
def _read_input(self) -> Any:
|
|
38
35
|
"""
|
|
@@ -304,14 +304,6 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
304
304
|
|
|
305
305
|
return control_message
|
|
306
306
|
|
|
307
|
-
def on_data(self, control_message: any) -> any:
|
|
308
|
-
"""
|
|
309
|
-
Process the control message.
|
|
310
|
-
For this source stage, no additional processing is done, so simply return it.
|
|
311
|
-
"""
|
|
312
|
-
self._logger.debug("on_data: Received control message for processing")
|
|
313
|
-
return control_message
|
|
314
|
-
|
|
315
307
|
# In the processing loop, instead of checking a boolean, we wait on the event.
|
|
316
308
|
def _processing_loop(self) -> None:
|
|
317
309
|
"""
|
|
@@ -336,7 +328,6 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
336
328
|
self._active_processing = True
|
|
337
329
|
|
|
338
330
|
self._logger.debug("Control message received; processing data")
|
|
339
|
-
updated_cm = self.on_data(control_message)
|
|
340
331
|
|
|
341
332
|
# Block until not paused using the pause event.
|
|
342
333
|
if self.output_queue is not None:
|
|
@@ -349,7 +340,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
349
340
|
|
|
350
341
|
while True:
|
|
351
342
|
try:
|
|
352
|
-
self.output_queue.put(updated_cm)
|
|
343
|
+
self.output_queue.put(control_message)
|
|
353
344
|
self.stats["successful_queue_writes"] += 1
|
|
354
345
|
break
|
|
355
346
|
except Exception:
|
|
@@ -32,7 +32,7 @@ class TextEmbeddingTransformStage(RayActorStage):
|
|
|
32
32
|
"""
|
|
33
33
|
|
|
34
34
|
def __init__(self, config: TextEmbeddingSchema) -> None:
|
|
35
|
-
super().__init__(config)
|
|
35
|
+
super().__init__(config, log_to_stdout=False)
|
|
36
36
|
try:
|
|
37
37
|
self.validated_config = config
|
|
38
38
|
logger.info("TextEmbeddingTransformStage configuration validated successfully.")
|
|
@@ -1,11 +1,14 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
1
5
|
import logging
|
|
2
6
|
from typing import Any
|
|
3
7
|
import ray
|
|
4
8
|
|
|
5
|
-
# Assume these imports come from your project:
|
|
6
9
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
7
10
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
8
|
-
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
11
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
|
|
9
12
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
10
13
|
from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
|
|
11
14
|
from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal
|
|
@@ -72,3 +75,45 @@ class TextSplitterStage(RayActorStage):
|
|
|
72
75
|
logger.info("TextSplitterStage.on_data: Finished processing, returning updated message.")
|
|
73
76
|
|
|
74
77
|
return message
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def text_splitter_fn(control_message: IngestControlMessage, stage_config: TextSplitterSchema) -> IngestControlMessage:
|
|
81
|
+
"""
|
|
82
|
+
Process an incoming IngestControlMessage by splitting and tokenizing its text.
|
|
83
|
+
|
|
84
|
+
Parameters
|
|
85
|
+
----------
|
|
86
|
+
control_message : IngestControlMessage
|
|
87
|
+
The incoming message containing the payload DataFrame.
|
|
88
|
+
|
|
89
|
+
stage_config : BaseModel
|
|
90
|
+
The stage level configuration object
|
|
91
|
+
|
|
92
|
+
Returns
|
|
93
|
+
-------
|
|
94
|
+
IngestControlMessage
|
|
95
|
+
The updated message with its payload transformed.
|
|
96
|
+
"""
|
|
97
|
+
|
|
98
|
+
# Extract the DataFrame payload.
|
|
99
|
+
df_payload = control_message.payload()
|
|
100
|
+
logger.debug("Extracted payload with %d rows.", len(df_payload))
|
|
101
|
+
|
|
102
|
+
# Remove the "split" task to obtain task-specific configuration.
|
|
103
|
+
task_config = remove_task_by_type(control_message, "split")
|
|
104
|
+
logger.debug("Extracted task config: %s", task_config)
|
|
105
|
+
|
|
106
|
+
# Transform the DataFrame (split text and tokenize).
|
|
107
|
+
df_updated = transform_text_split_and_tokenize_internal(
|
|
108
|
+
df_transform_ledger=df_payload,
|
|
109
|
+
task_config=task_config,
|
|
110
|
+
transform_config=stage_config,
|
|
111
|
+
execution_trace_log=None,
|
|
112
|
+
)
|
|
113
|
+
logger.info("TextSplitterStage.on_data: Transformation complete. Updated payload has %d rows.", len(df_updated))
|
|
114
|
+
|
|
115
|
+
# Update the message payload.
|
|
116
|
+
control_message.payload(df_updated)
|
|
117
|
+
logger.info("TextSplitterStage.on_data: Finished processing, returning updated message.")
|
|
118
|
+
|
|
119
|
+
return control_message
|
|
@@ -174,9 +174,9 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
|
|
|
174
174
|
pipeline.make_edge(image_dedup_stage_id, text_splitter_stage_id, queue_size=ingest_edge_buffer_size)
|
|
175
175
|
|
|
176
176
|
###### Primitive Transforms ########
|
|
177
|
-
pipeline.make_edge(text_splitter_stage_id,
|
|
178
|
-
pipeline.make_edge(
|
|
179
|
-
pipeline.make_edge(
|
|
177
|
+
pipeline.make_edge(text_splitter_stage_id, image_caption_stage_id, queue_size=ingest_edge_buffer_size)
|
|
178
|
+
pipeline.make_edge(image_caption_stage_id, embed_extractions_stage_id, queue_size=ingest_edge_buffer_size)
|
|
179
|
+
pipeline.make_edge(embed_extractions_stage_id, image_storage_stage_id, queue_size=ingest_edge_buffer_size)
|
|
180
180
|
|
|
181
181
|
###### Primitive Storage ########
|
|
182
182
|
pipeline.make_edge(image_storage_stage_id, embedding_storage_stage_id, queue_size=ingest_edge_buffer_size)
|
|
@@ -72,7 +72,7 @@ class PipelineCreationSchema(BaseModel):
|
|
|
72
72
|
|
|
73
73
|
# API keys
|
|
74
74
|
ngc_api_key: str = os.getenv("NGC_API_KEY", "")
|
|
75
|
-
|
|
75
|
+
nvidia_api_key: str = os.getenv("NVIDIA_API_KEY", "")
|
|
76
76
|
|
|
77
77
|
# Observability settings
|
|
78
78
|
otel_exporter_otlp_endpoint: str = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")
|
|
@@ -86,9 +86,9 @@ class PipelineCreationSchema(BaseModel):
|
|
|
86
86
|
|
|
87
87
|
# Vision language model settings
|
|
88
88
|
vlm_caption_endpoint: str = os.getenv(
|
|
89
|
-
"VLM_CAPTION_ENDPOINT", "https://ai.api.nvidia.com/v1/gr/
|
|
89
|
+
"VLM_CAPTION_ENDPOINT", "https://ai.api.nvidia.com/v1/gr/nvidia/llama-3.1-nemotron-nano-vl-8b-v1/chat/completions"
|
|
90
90
|
)
|
|
91
|
-
vlm_caption_model_name: str = os.getenv("VLM_CAPTION_MODEL_NAME", "
|
|
91
|
+
vlm_caption_model_name: str = os.getenv("VLM_CAPTION_MODEL_NAME", "nvidia/llama-3.1-nemotron-nano-vl-8b-v1")
|
|
92
92
|
|
|
93
93
|
# YOLOX image processing settings
|
|
94
94
|
yolox_graphic_elements_http_endpoint: str = os.getenv(
|
|
@@ -331,6 +331,10 @@ def run_pipeline(
|
|
|
331
331
|
"""
|
|
332
332
|
if run_in_subprocess:
|
|
333
333
|
logger.info("Launching pipeline in Python subprocess using multiprocessing.")
|
|
334
|
+
if (ingest_config.ngc_api_key is None or ingest_config.ngc_api_key == "") and (
|
|
335
|
+
ingest_config.nvidia_api_key is None or ingest_config.nvidia_api_key == ""
|
|
336
|
+
):
|
|
337
|
+
logger.warning("NGC_API_KEY or NVIDIA_API_KEY are not set. NIM Related functions will not work.")
|
|
334
338
|
|
|
335
339
|
ctx = multiprocessing.get_context("fork")
|
|
336
340
|
process = ctx.Process(
|
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
# TODO(Devin)
|
|
6
|
-
# flake8: noqa
|
|
7
5
|
import os
|
|
8
6
|
|
|
9
7
|
import click
|
|
@@ -11,6 +9,7 @@ import logging
|
|
|
11
9
|
|
|
12
10
|
from nv_ingest.framework.orchestration.ray.stages.sinks.default_drain import DefaultDrainSink
|
|
13
11
|
from nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer import OpenTelemetryTracerStage
|
|
12
|
+
from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
|
|
14
13
|
from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
|
|
15
14
|
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
16
15
|
|
|
@@ -41,7 +40,6 @@ from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import I
|
|
|
41
40
|
from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
|
|
42
41
|
from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
|
|
43
42
|
from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
|
|
44
|
-
from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
|
|
45
43
|
from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
|
|
46
44
|
from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
|
|
47
45
|
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
@@ -107,7 +105,7 @@ def get_nim_service(env_var_prefix):
|
|
|
107
105
|
"",
|
|
108
106
|
)
|
|
109
107
|
auth_token = os.environ.get(
|
|
110
|
-
"
|
|
108
|
+
"NVIDIA_API_KEY",
|
|
111
109
|
"",
|
|
112
110
|
) or os.environ.get(
|
|
113
111
|
"NGC_API_KEY",
|
|
@@ -137,7 +135,7 @@ def get_audio_retrieval_service(env_var_prefix):
|
|
|
137
135
|
"",
|
|
138
136
|
)
|
|
139
137
|
auth_token = os.environ.get(
|
|
140
|
-
"
|
|
138
|
+
"NVIDIA_API_KEY",
|
|
141
139
|
"",
|
|
142
140
|
) or os.environ.get(
|
|
143
141
|
"NGC_API_KEY",
|
|
@@ -465,7 +463,7 @@ def add_text_splitter_stage(pipeline, default_cpu_count, stage_name="text_splitt
|
|
|
465
463
|
|
|
466
464
|
def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_caption"):
|
|
467
465
|
auth_token = os.environ.get(
|
|
468
|
-
"
|
|
466
|
+
"NVIDIA_API_KEY",
|
|
469
467
|
"",
|
|
470
468
|
) or os.environ.get(
|
|
471
469
|
"NGC_API_KEY",
|
|
@@ -473,13 +471,13 @@ def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_capti
|
|
|
473
471
|
)
|
|
474
472
|
|
|
475
473
|
endpoint_url = os.environ.get("VLM_CAPTION_ENDPOINT", "localhost:5000")
|
|
476
|
-
model_name = os.environ.get("VLM_CAPTION_MODEL_NAME", "
|
|
474
|
+
model_name = os.environ.get("VLM_CAPTION_MODEL_NAME", "nvidia/llama-3.1-nemotron-nano-vl-8b-v1")
|
|
477
475
|
|
|
478
476
|
config = ImageCaptionExtractionSchema(
|
|
479
477
|
**{
|
|
480
478
|
"api_key": auth_token,
|
|
481
479
|
"endpoint_url": endpoint_url,
|
|
482
|
-
"
|
|
480
|
+
"model_name": model_name,
|
|
483
481
|
"prompt": "Caption the content of this image:",
|
|
484
482
|
}
|
|
485
483
|
)
|
|
@@ -497,7 +495,7 @@ def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_capti
|
|
|
497
495
|
|
|
498
496
|
def add_text_embedding_stage(pipeline, default_cpu_count, stage_name="text_embedding"):
|
|
499
497
|
api_key = os.environ.get(
|
|
500
|
-
"
|
|
498
|
+
"NVIDIA_API_KEY",
|
|
501
499
|
"",
|
|
502
500
|
) or os.environ.get(
|
|
503
501
|
"NGC_API_KEY",
|