nv-ingest 2025.11.26.dev20251126__tar.gz → 2026.1.6.dev20260106__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/PKG-INFO +4 -2
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/main.py +1 -1
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/ingest.py +12 -7
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/execution.py +6 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +9 -9
- nv_ingest-2026.1.6.dev20260106/nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py +64 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +72 -6
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +40 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/config/replica_resolver.py +12 -2
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +32 -17
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/default_pipeline_impl.py +26 -8
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/PKG-INFO +4 -2
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/SOURCES.txt +1 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/requires.txt +3 -1
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/pyproject.toml +3 -1
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/LICENSE +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/MANIFEST.in +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/tracing.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/metrics.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/README.md +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/execution/options.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/termination.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/config/__init__.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/config/loaders.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/pipeline_schema.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/version.py +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2026.1.6.dev20260106
|
|
4
4
|
Summary: Python module for multimodal document ingestion
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -219,13 +219,15 @@ Requires-Dist: diskcache>=5.6.3
|
|
|
219
219
|
Requires-Dist: fastapi>=0.115.6
|
|
220
220
|
Requires-Dist: fastparquet>=2024.11.0
|
|
221
221
|
Requires-Dist: fsspec>=2024.10.0
|
|
222
|
+
Requires-Dist: universal_pathlib>=0.2.6
|
|
223
|
+
Requires-Dist: s3fs>=2024.10.0
|
|
222
224
|
Requires-Dist: gunicorn
|
|
223
225
|
Requires-Dist: h11>=0.16.0
|
|
224
226
|
Requires-Dist: httpx>=0.28.1
|
|
225
227
|
Requires-Dist: isodate>=0.7.2
|
|
226
228
|
Requires-Dist: langdetect>=1.0.9
|
|
227
229
|
Requires-Dist: minio>=7.2.12
|
|
228
|
-
Requires-Dist: librosa
|
|
230
|
+
Requires-Dist: librosa==0.10.2
|
|
229
231
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
230
232
|
Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
|
|
231
233
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
|
|
|
23
23
|
app = FastAPI(
|
|
24
24
|
title="NV-Ingest Microservice",
|
|
25
25
|
description="Service for ingesting heterogenous datatypes",
|
|
26
|
-
version="
|
|
26
|
+
version="26.1.0",
|
|
27
27
|
contact={
|
|
28
28
|
"name": "NVIDIA Corporation",
|
|
29
29
|
"url": "https://nvidia.com",
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/ingest.py
RENAMED
|
@@ -122,11 +122,16 @@ def get_pdf_split_page_count(client_override: Optional[int] = None) -> int:
|
|
|
122
122
|
)
|
|
123
123
|
return DEFAULT_PDF_SPLIT_PAGE_COUNT
|
|
124
124
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
125
|
+
clamped = max(MIN_PAGES, min(parsed, MAX_PAGES))
|
|
126
|
+
if clamped != parsed:
|
|
127
|
+
logger.warning(
|
|
128
|
+
"Env PDF_SPLIT_PAGE_COUNT=%s clamped to %s (min=%s, max=%s)",
|
|
129
|
+
parsed,
|
|
130
|
+
clamped,
|
|
131
|
+
MIN_PAGES,
|
|
132
|
+
MAX_PAGES,
|
|
133
|
+
)
|
|
134
|
+
return clamped
|
|
130
135
|
|
|
131
136
|
|
|
132
137
|
def split_pdf_to_chunks(pdf_content: bytes, pages_per_chunk: int) -> List[Dict[str, Any]]:
|
|
@@ -955,7 +960,7 @@ async def submit_job_v2(
|
|
|
955
960
|
"subjob_order": subjob_ids,
|
|
956
961
|
}
|
|
957
962
|
)
|
|
958
|
-
elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
|
|
963
|
+
elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav", "mkv"]:
|
|
959
964
|
document_type = document_types[0]
|
|
960
965
|
upload_path = f"./{Path(original_source_id).name}"
|
|
961
966
|
# dump the payload to a file, just came from client
|
|
@@ -1003,7 +1008,7 @@ async def submit_job_v2(
|
|
|
1003
1008
|
"page_count": chunk.get("page_count", 0),
|
|
1004
1009
|
}
|
|
1005
1010
|
)
|
|
1006
|
-
logger.
|
|
1011
|
+
logger.debug(f"Removing uploaded file {upload_path}")
|
|
1007
1012
|
os.remove(upload_path)
|
|
1008
1013
|
|
|
1009
1014
|
if submission_items:
|
|
@@ -162,6 +162,11 @@ def build_logging_config_from_env() -> LoggingConfig:
|
|
|
162
162
|
if key not in os.environ:
|
|
163
163
|
os.environ[key] = default_value
|
|
164
164
|
|
|
165
|
+
# For PRODUCTION mode, also suppress nv-ingest module INFO logs
|
|
166
|
+
if preset_level == "PRODUCTION":
|
|
167
|
+
logging.getLogger("nv_ingest").setLevel(logging.WARNING)
|
|
168
|
+
logging.getLogger("nv_ingest_api").setLevel(logging.WARNING)
|
|
169
|
+
|
|
165
170
|
logger.info(f"Applied Ray logging preset: {preset_level}")
|
|
166
171
|
|
|
167
172
|
# Get log level from environment, default to INFO
|
|
@@ -324,6 +329,7 @@ def launch_pipeline(
|
|
|
324
329
|
pipeline_config = resolve_static_replicas(pipeline_config)
|
|
325
330
|
|
|
326
331
|
# Pretty print the final pipeline configuration (after replica resolution)
|
|
332
|
+
# INFO level so it shows in docker/helm deployments; quiet mode suppresses in library mode
|
|
327
333
|
pretty_output = pretty_print_pipeline_config(pipeline_config, config_path=None)
|
|
328
334
|
logger.info("\n" + pretty_output)
|
|
329
335
|
|
|
@@ -150,7 +150,7 @@ if __name__ == "__main__":
|
|
|
150
150
|
os.environ["OCR_GRPC_ENDPOINT"] = "localhost:8010"
|
|
151
151
|
os.environ["OCR_INFER_PROTOCOL"] = "grpc"
|
|
152
152
|
os.environ["OCR_MODEL_NAME"] = "paddle"
|
|
153
|
-
os.environ["
|
|
153
|
+
os.environ["NEMOTRON_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
154
154
|
os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
155
155
|
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
|
|
156
156
|
logger.info("Environment variables set.")
|
|
@@ -170,23 +170,23 @@ if __name__ == "__main__":
|
|
|
170
170
|
yolox_graphic_elements_auth,
|
|
171
171
|
yolox_graphic_elements_protocol,
|
|
172
172
|
) = get_nim_service("yolox_graphic_elements")
|
|
173
|
-
|
|
174
|
-
|
|
173
|
+
nemotron_parse_grpc, nemotron_parse_http, nemotron_parse_auth, nemotron_parse_protocol = get_nim_service(
|
|
174
|
+
"nemotron_parse"
|
|
175
175
|
)
|
|
176
176
|
ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
|
|
177
177
|
|
|
178
|
-
model_name = os.environ.get("
|
|
178
|
+
model_name = os.environ.get("NEMOTRON_PARSE_MODEL_NAME", "nvidia/nemotron-parse")
|
|
179
179
|
pdf_extractor_config = {
|
|
180
180
|
"pdfium_config": {
|
|
181
181
|
"auth_token": yolox_auth, # All auth tokens are the same for the moment
|
|
182
182
|
"yolox_endpoints": (yolox_grpc, yolox_http),
|
|
183
183
|
"yolox_infer_protocol": yolox_protocol,
|
|
184
184
|
},
|
|
185
|
-
"
|
|
186
|
-
"auth_token":
|
|
187
|
-
"
|
|
188
|
-
"
|
|
189
|
-
"
|
|
185
|
+
"nemotron_parse_config": {
|
|
186
|
+
"auth_token": nemotron_parse_auth,
|
|
187
|
+
"nemotron_parse_endpoints": (nemotron_parse_grpc, nemotron_parse_http),
|
|
188
|
+
"nemotron_parse_infer_protocol": nemotron_parse_protocol,
|
|
189
|
+
"nemotron_parse_model_name": model_name,
|
|
190
190
|
"yolox_endpoints": (yolox_grpc, yolox_http),
|
|
191
191
|
"yolox_infer_protocol": yolox_protocol,
|
|
192
192
|
},
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Added this no-op UDF ray stage to the pipeline to help speed up the LLM api calls
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
UDF Parallel Stage - A high-concurrency no-op stage for parallel UDF execution.
|
|
9
|
+
|
|
10
|
+
This stage does nothing except pass messages through, but with high replica count
|
|
11
|
+
it provides a parallel execution pool for UDFs to achieve N-way concurrency.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
from typing import Any, Optional
|
|
16
|
+
from pydantic import BaseModel
|
|
17
|
+
import ray
|
|
18
|
+
|
|
19
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
20
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
21
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
22
|
+
from nv_ingest_api.util.exception_handlers.decorators import (
|
|
23
|
+
nv_ingest_node_failure_try_except,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@ray.remote
|
|
30
|
+
class UDFParallelStage(RayActorStage):
|
|
31
|
+
"""
|
|
32
|
+
A no-op pass-through stage designed for parallel UDF execution.
|
|
33
|
+
|
|
34
|
+
This stage simply returns the input message unchanged, but when configured
|
|
35
|
+
with multiple replicas, it provides a high-concurrency pool for UDFs to
|
|
36
|
+
achieve parallel execution without blocking.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
|
|
40
|
+
super().__init__(config, stage_name=stage_name)
|
|
41
|
+
logger.info(f"UDFParallelStage initialized: {stage_name}")
|
|
42
|
+
|
|
43
|
+
@nv_ingest_node_failure_try_except()
|
|
44
|
+
@traceable()
|
|
45
|
+
@udf_intercept_hook()
|
|
46
|
+
def on_data(self, message: Any) -> Any:
|
|
47
|
+
"""
|
|
48
|
+
Pass-through processing that simply returns the message unchanged.
|
|
49
|
+
|
|
50
|
+
The @udf_intercept_hook decorator allows UDFs to target this stage,
|
|
51
|
+
and multiple replicas provide parallel execution capacity.
|
|
52
|
+
|
|
53
|
+
Parameters
|
|
54
|
+
----------
|
|
55
|
+
message : Any
|
|
56
|
+
The incoming control message.
|
|
57
|
+
|
|
58
|
+
Returns
|
|
59
|
+
-------
|
|
60
|
+
Any
|
|
61
|
+
The unmodified control message.
|
|
62
|
+
"""
|
|
63
|
+
# No-op: just return the message
|
|
64
|
+
return message
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import os
|
|
6
7
|
from typing import Dict, Any, Optional
|
|
8
|
+
from urllib.parse import urlparse
|
|
7
9
|
|
|
8
10
|
import pandas as pd
|
|
9
11
|
import ray
|
|
@@ -26,7 +28,8 @@ logger = logging.getLogger(__name__)
|
|
|
26
28
|
@ray.remote
|
|
27
29
|
class ImageStorageStage(RayActorStage):
|
|
28
30
|
"""
|
|
29
|
-
A Ray actor stage that stores images or structured content
|
|
31
|
+
A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
|
|
32
|
+
metadata with storage URLs.
|
|
30
33
|
|
|
31
34
|
This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
|
|
32
35
|
payload and updates the control message accordingly.
|
|
@@ -69,8 +72,16 @@ class ImageStorageStage(RayActorStage):
|
|
|
69
72
|
task_config = remove_task_by_type(control_message, "store")
|
|
70
73
|
# logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
|
|
71
74
|
|
|
72
|
-
|
|
73
|
-
|
|
75
|
+
stage_defaults = {
|
|
76
|
+
"structured": self.validated_config.structured,
|
|
77
|
+
"images": self.validated_config.images,
|
|
78
|
+
"storage_uri": self.validated_config.storage_uri,
|
|
79
|
+
"storage_options": self.validated_config.storage_options,
|
|
80
|
+
"public_base_url": self.validated_config.public_base_url,
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
store_structured: bool = task_config.get("structured", stage_defaults["structured"])
|
|
84
|
+
store_unstructured: bool = task_config.get("images", stage_defaults["images"])
|
|
74
85
|
|
|
75
86
|
content_types: Dict[Any, Any] = {}
|
|
76
87
|
if store_structured:
|
|
@@ -80,14 +91,34 @@ class ImageStorageStage(RayActorStage):
|
|
|
80
91
|
content_types[ContentTypeEnum.IMAGE] = store_unstructured
|
|
81
92
|
|
|
82
93
|
params: Dict[str, Any] = task_config.get("params", {})
|
|
83
|
-
params["content_types"] = content_types
|
|
84
94
|
|
|
85
|
-
|
|
95
|
+
storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
|
|
96
|
+
storage_options = {
|
|
97
|
+
**(stage_defaults["storage_options"] or {}),
|
|
98
|
+
**(task_config.get("storage_options") or {}),
|
|
99
|
+
**params.get("storage_options", {}),
|
|
100
|
+
}
|
|
101
|
+
if "public_base_url" in task_config:
|
|
102
|
+
public_base_url = task_config["public_base_url"]
|
|
103
|
+
else:
|
|
104
|
+
public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
|
|
105
|
+
|
|
106
|
+
storage_options = self._inject_storage_defaults(storage_uri, storage_options)
|
|
107
|
+
|
|
108
|
+
storage_params: Dict[str, Any] = {
|
|
109
|
+
"content_types": content_types,
|
|
110
|
+
"storage_uri": storage_uri,
|
|
111
|
+
"storage_options": storage_options,
|
|
112
|
+
}
|
|
113
|
+
if public_base_url:
|
|
114
|
+
storage_params["public_base_url"] = public_base_url
|
|
115
|
+
|
|
116
|
+
logger.debug("Processing storage task with parameters: %s", storage_params)
|
|
86
117
|
|
|
87
118
|
# Store images or structured content.
|
|
88
119
|
df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
|
|
89
120
|
df_storage_ledger=df_payload,
|
|
90
|
-
task_config=
|
|
121
|
+
task_config=storage_params,
|
|
91
122
|
storage_config={},
|
|
92
123
|
execution_trace_log=None,
|
|
93
124
|
)
|
|
@@ -98,3 +129,38 @@ class ImageStorageStage(RayActorStage):
|
|
|
98
129
|
control_message.payload(df_storage_ledger)
|
|
99
130
|
|
|
100
131
|
return control_message
|
|
132
|
+
|
|
133
|
+
@staticmethod
|
|
134
|
+
def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
|
|
135
|
+
"""
|
|
136
|
+
Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
|
|
137
|
+
"""
|
|
138
|
+
parsed_scheme = urlparse(storage_uri).scheme.lower()
|
|
139
|
+
merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
|
|
140
|
+
|
|
141
|
+
if parsed_scheme not in {"s3", "s3a", "s3n"}:
|
|
142
|
+
return merged_options
|
|
143
|
+
|
|
144
|
+
def _set_if_absent(key: str, env_var: str) -> None:
|
|
145
|
+
if key not in merged_options and env_var in os.environ:
|
|
146
|
+
merged_options[key] = os.environ[env_var]
|
|
147
|
+
|
|
148
|
+
_set_if_absent("key", "MINIO_ACCESS_KEY")
|
|
149
|
+
_set_if_absent("secret", "MINIO_SECRET_KEY")
|
|
150
|
+
if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
|
|
151
|
+
merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
|
|
152
|
+
|
|
153
|
+
client_kwargs = dict(merged_options.get("client_kwargs", {}))
|
|
154
|
+
endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
|
|
155
|
+
if not endpoint:
|
|
156
|
+
endpoint = "http://minio:9000"
|
|
157
|
+
if endpoint and not endpoint.startswith(("http://", "https://")):
|
|
158
|
+
endpoint = f"http://{endpoint}"
|
|
159
|
+
client_kwargs.setdefault("endpoint_url", endpoint)
|
|
160
|
+
region = os.environ.get("MINIO_REGION")
|
|
161
|
+
if region:
|
|
162
|
+
client_kwargs.setdefault("region_name", region)
|
|
163
|
+
if client_kwargs:
|
|
164
|
+
merged_options["client_kwargs"] = client_kwargs
|
|
165
|
+
|
|
166
|
+
return merged_options
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import os
|
|
6
7
|
from typing import Union, Optional, TextIO
|
|
7
8
|
|
|
8
9
|
|
|
@@ -23,6 +24,34 @@ from nv_ingest.framework.orchestration.execution.helpers import (
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
25
26
|
|
|
27
|
+
def _configure_quiet_mode():
|
|
28
|
+
"""
|
|
29
|
+
Configure environment for quiet/production logging in library mode.
|
|
30
|
+
|
|
31
|
+
Sets INGEST_RAY_LOG_LEVEL=PRODUCTION if not already set by user, which:
|
|
32
|
+
- Sets Ray logging to ERROR level (suppresses INFO/WARNING)
|
|
33
|
+
- Disables Ray usage stats collection
|
|
34
|
+
- Disables Ray import warnings
|
|
35
|
+
|
|
36
|
+
Also silences other common warnings that are noisy in library mode.
|
|
37
|
+
"""
|
|
38
|
+
# Only set if user hasn't explicitly configured
|
|
39
|
+
if "INGEST_RAY_LOG_LEVEL" not in os.environ:
|
|
40
|
+
os.environ["INGEST_RAY_LOG_LEVEL"] = "PRODUCTION"
|
|
41
|
+
|
|
42
|
+
# Silence Ray accelerator env var warning
|
|
43
|
+
if "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO" not in os.environ:
|
|
44
|
+
os.environ["RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO"] = "0"
|
|
45
|
+
|
|
46
|
+
# Disable OTEL tracing export errors (no collector expected in library mode)
|
|
47
|
+
if "OTEL_SDK_DISABLED" not in os.environ:
|
|
48
|
+
os.environ["OTEL_SDK_DISABLED"] = "true"
|
|
49
|
+
|
|
50
|
+
# Set nv-ingest module loggers to WARNING to suppress INFO level startup messages
|
|
51
|
+
logging.getLogger("nv_ingest").setLevel(logging.WARNING)
|
|
52
|
+
logging.getLogger("nv_ingest_api").setLevel(logging.WARNING)
|
|
53
|
+
|
|
54
|
+
|
|
26
55
|
def run_pipeline(
|
|
27
56
|
pipeline_config: Optional[PipelineConfigSchema] = None,
|
|
28
57
|
block: bool = True,
|
|
@@ -32,6 +61,7 @@ def run_pipeline(
|
|
|
32
61
|
stdout: Optional[TextIO] = None,
|
|
33
62
|
stderr: Optional[TextIO] = None,
|
|
34
63
|
libmode: bool = True,
|
|
64
|
+
quiet: Optional[bool] = None,
|
|
35
65
|
) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
|
|
36
66
|
"""
|
|
37
67
|
Launch and manage a pipeline using configuration.
|
|
@@ -65,6 +95,10 @@ def run_pipeline(
|
|
|
65
95
|
libmode : bool, default=True
|
|
66
96
|
If True and pipeline_config is None, loads the default libmode pipeline configuration.
|
|
67
97
|
If False, requires pipeline_config to be provided.
|
|
98
|
+
quiet : Optional[bool], default=None
|
|
99
|
+
If True, configures logging for minimal output (PRODUCTION preset, suppresses
|
|
100
|
+
INFO-level startup messages). If None, defaults to True when libmode=True.
|
|
101
|
+
Set to False to see verbose startup logs even in library mode.
|
|
68
102
|
|
|
69
103
|
Returns
|
|
70
104
|
-------
|
|
@@ -83,6 +117,12 @@ def run_pipeline(
|
|
|
83
117
|
Exception
|
|
84
118
|
Any other exceptions raised during pipeline launch or configuration.
|
|
85
119
|
"""
|
|
120
|
+
# Configure quiet mode for library mode by default (unless explicitly disabled)
|
|
121
|
+
if quiet is None:
|
|
122
|
+
quiet = libmode
|
|
123
|
+
if quiet:
|
|
124
|
+
_configure_quiet_mode()
|
|
125
|
+
|
|
86
126
|
# Resolve configuration
|
|
87
127
|
config = resolve_pipeline_config(pipeline_config, libmode)
|
|
88
128
|
overrides = create_runtime_overrides(disable_dynamic_scaling, dynamic_memory_threshold)
|
|
@@ -11,6 +11,7 @@ consumption stays within the static_memory_threshold.
|
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
import logging
|
|
14
|
+
import os
|
|
14
15
|
from typing import List
|
|
15
16
|
from copy import deepcopy
|
|
16
17
|
|
|
@@ -102,8 +103,17 @@ def resolve_static_replicas(pipeline_config: PipelineConfigSchema) -> PipelineCo
|
|
|
102
103
|
|
|
103
104
|
logger.info(f"Total baseline memory demand: {total_memory_demand_mb}MB from {len(non_static_stages)} stages")
|
|
104
105
|
|
|
105
|
-
#
|
|
106
|
-
|
|
106
|
+
# Optional bypass of global memory-based scale down via environment variable
|
|
107
|
+
bypass_env = os.getenv("NV_INGEST_BYPASS_STATIC_MEMORY_SCALE_DOWN", "").strip().lower()
|
|
108
|
+
bypass_scale_down = bypass_env in ("1", "true", "yes", "on")
|
|
109
|
+
|
|
110
|
+
# Check if we need to scale down (unless bypassed)
|
|
111
|
+
if bypass_scale_down:
|
|
112
|
+
logger.warning(
|
|
113
|
+
"Bypassing static memory-based replica scale-down due to NV_INGEST_BYPASS_STATIC_MEMORY_SCALE_DOWN"
|
|
114
|
+
)
|
|
115
|
+
scaling_factor = 1.0
|
|
116
|
+
elif total_memory_demand_mb <= available_memory_mb:
|
|
107
117
|
logger.info("Memory demand within threshold, applying baseline replica counts")
|
|
108
118
|
scaling_factor = 1.0
|
|
109
119
|
else:
|
|
@@ -68,20 +68,20 @@ stages:
|
|
|
68
68
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
69
69
|
yolox_endpoints: [
|
|
70
70
|
$YOLOX_GRPC_ENDPOINT|"",
|
|
71
|
-
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-
|
|
71
|
+
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
|
|
72
72
|
]
|
|
73
73
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
|
|
74
|
-
|
|
74
|
+
nemotron_parse_config:
|
|
75
75
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
76
|
-
|
|
77
|
-
$
|
|
78
|
-
$
|
|
76
|
+
nemotron_parse_endpoints: [
|
|
77
|
+
$NEMOTRON_PARSE_GRPC_ENDPOINT|"",
|
|
78
|
+
$NEMOTRON_PARSE_HTTP_ENDPOINT|"https://integrate.api.nvidia.com/v1/chat/completions"
|
|
79
79
|
]
|
|
80
|
-
|
|
81
|
-
|
|
80
|
+
nemotron_parse_infer_protocol: $NEMOTRON_PARSE_INFER_PROTOCOL|http
|
|
81
|
+
nemotron_parse_model_name: $NEMOTRON_PARSE_MODEL_NAME|"nvidia/nemotron-parse"
|
|
82
82
|
yolox_endpoints: [
|
|
83
83
|
$YOLOX_GRPC_ENDPOINT|"",
|
|
84
|
-
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-
|
|
84
|
+
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
|
|
85
85
|
]
|
|
86
86
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
|
|
87
87
|
replicas:
|
|
@@ -124,7 +124,14 @@ stages:
|
|
|
124
124
|
docx_extraction_config:
|
|
125
125
|
yolox_endpoints: [
|
|
126
126
|
$YOLOX_GRPC_ENDPOINT|"",
|
|
127
|
-
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-
|
|
127
|
+
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
|
|
128
|
+
]
|
|
129
|
+
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
|
|
130
|
+
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
131
|
+
pdfium_config:
|
|
132
|
+
yolox_endpoints: [
|
|
133
|
+
$YOLOX_GRPC_ENDPOINT|"",
|
|
134
|
+
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
|
|
128
135
|
]
|
|
129
136
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
|
|
130
137
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
@@ -145,7 +152,14 @@ stages:
|
|
|
145
152
|
pptx_extraction_config:
|
|
146
153
|
yolox_endpoints: [
|
|
147
154
|
$YOLOX_GRPC_ENDPOINT|"",
|
|
148
|
-
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-
|
|
155
|
+
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
|
|
156
|
+
]
|
|
157
|
+
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
|
|
158
|
+
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
159
|
+
pdfium_config:
|
|
160
|
+
yolox_endpoints: [
|
|
161
|
+
$YOLOX_GRPC_ENDPOINT|"",
|
|
162
|
+
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
|
|
149
163
|
]
|
|
150
164
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
|
|
151
165
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
@@ -166,7 +180,7 @@ stages:
|
|
|
166
180
|
image_extraction_config:
|
|
167
181
|
yolox_endpoints: [
|
|
168
182
|
$YOLOX_GRPC_ENDPOINT|"",
|
|
169
|
-
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-
|
|
183
|
+
$YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
|
|
170
184
|
]
|
|
171
185
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
|
|
172
186
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
@@ -201,7 +215,7 @@ stages:
|
|
|
201
215
|
endpoint_config:
|
|
202
216
|
ocr_endpoints: [
|
|
203
217
|
$OCR_GRPC_ENDPOINT|"",
|
|
204
|
-
$OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/
|
|
218
|
+
$OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
|
|
205
219
|
]
|
|
206
220
|
ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
|
|
207
221
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
@@ -227,9 +241,9 @@ stages:
|
|
|
227
241
|
yolox_infer_protocol: $YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL|"http"
|
|
228
242
|
ocr_endpoints: [
|
|
229
243
|
$OCR_GRPC_ENDPOINT|"",
|
|
230
|
-
$OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/
|
|
244
|
+
$OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
|
|
231
245
|
]
|
|
232
|
-
ocr_infer_protocol: $
|
|
246
|
+
ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
|
|
233
247
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
234
248
|
replicas:
|
|
235
249
|
min_replicas: 0
|
|
@@ -254,7 +268,7 @@ stages:
|
|
|
254
268
|
yolox_infer_protocol: $YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL|"http"
|
|
255
269
|
ocr_endpoints: [
|
|
256
270
|
$OCR_GRPC_ENDPOINT|"",
|
|
257
|
-
$OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/
|
|
271
|
+
$OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
|
|
258
272
|
]
|
|
259
273
|
ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
|
|
260
274
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
@@ -318,9 +332,10 @@ stages:
|
|
|
318
332
|
actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
|
|
319
333
|
config:
|
|
320
334
|
api_key: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
321
|
-
endpoint_url: $VLM_CAPTION_ENDPOINT|"
|
|
335
|
+
endpoint_url: $VLM_CAPTION_ENDPOINT|"https://integrate.api.nvidia.com/v1/chat/completions"
|
|
322
336
|
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
|
|
323
|
-
prompt: "Caption the content of this image:"
|
|
337
|
+
prompt: $VLM_CAPTION_PROMPT|"Caption the content of this image:"
|
|
338
|
+
system_prompt: $VLM_CAPTION_SYSTEM_PROMPT|"/no_think"
|
|
324
339
|
replicas:
|
|
325
340
|
min_replicas: 0
|
|
326
341
|
max_replicas:
|
|
@@ -70,14 +70,14 @@ stages:
|
|
|
70
70
|
$YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
|
|
71
71
|
]
|
|
72
72
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
|
|
73
|
-
|
|
73
|
+
nemotron_parse_config:
|
|
74
74
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
75
|
-
|
|
76
|
-
$
|
|
77
|
-
$
|
|
75
|
+
nemotron_parse_endpoints: [
|
|
76
|
+
$NEMOTRON_PARSE_GRPC_ENDPOINT|"",
|
|
77
|
+
$NEMOTRON_PARSE_HTTP_ENDPOINT|"http://nemotron-parse:8000/v1/chat/completions",
|
|
78
78
|
]
|
|
79
|
-
|
|
80
|
-
|
|
79
|
+
nemotron_parse_infer_protocol: $NEMOTRON_PARSE_INFER_PROTOCOL|http
|
|
80
|
+
nemotron_parse_model_name: $NEMOTRON_PARSE_MODEL_NAME|"nvidia/nemotron-parse"
|
|
81
81
|
yolox_endpoints: [
|
|
82
82
|
$YOLOX_GRPC_ENDPOINT|"page-elements:8001",
|
|
83
83
|
$YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
|
|
@@ -123,7 +123,14 @@ stages:
|
|
|
123
123
|
docx_extraction_config:
|
|
124
124
|
yolox_endpoints: [
|
|
125
125
|
$YOLOX_GRPC_ENDPOINT|"page-elements:8001",
|
|
126
|
-
$YOLOX_HTTP_ENDPOINT|"",
|
|
126
|
+
$YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
|
|
127
|
+
]
|
|
128
|
+
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
|
|
129
|
+
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
130
|
+
pdfium_config:
|
|
131
|
+
yolox_endpoints: [
|
|
132
|
+
$YOLOX_GRPC_ENDPOINT|"page-elements:8001",
|
|
133
|
+
$YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
|
|
127
134
|
]
|
|
128
135
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
|
|
129
136
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
@@ -148,6 +155,13 @@ stages:
|
|
|
148
155
|
]
|
|
149
156
|
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
|
|
150
157
|
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
158
|
+
pdfium_config:
|
|
159
|
+
yolox_endpoints: [
|
|
160
|
+
$YOLOX_GRPC_ENDPOINT|"page-elements:8001",
|
|
161
|
+
$YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
|
|
162
|
+
]
|
|
163
|
+
yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
|
|
164
|
+
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
151
165
|
replicas:
|
|
152
166
|
min_replicas: 0
|
|
153
167
|
max_replicas:
|
|
@@ -340,7 +354,8 @@ stages:
|
|
|
340
354
|
api_key: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
341
355
|
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
|
|
342
356
|
endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
|
|
343
|
-
prompt: "Caption the content of this image:"
|
|
357
|
+
prompt: $VLM_CAPTION_PROMPT|"Caption the content of this image:"
|
|
358
|
+
system_prompt: $VLM_CAPTION_SYSTEM_PROMPT|"/no_think"
|
|
344
359
|
replicas:
|
|
345
360
|
min_replicas: 0
|
|
346
361
|
max_replicas:
|
|
@@ -372,6 +387,9 @@ stages:
|
|
|
372
387
|
type: "stage"
|
|
373
388
|
phase: 5 # RESPONSE
|
|
374
389
|
actor: "nv_ingest.framework.orchestration.ray.stages.storage.image_storage:ImageStorageStage"
|
|
390
|
+
config:
|
|
391
|
+
storage_uri: $IMAGE_STORAGE_URI|"s3://nv-ingest/artifacts/store/images"
|
|
392
|
+
public_base_url: $IMAGE_STORAGE_PUBLIC_BASE_URL|""
|
|
375
393
|
replicas:
|
|
376
394
|
min_replicas: 0
|
|
377
395
|
max_replicas:
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest
|
|
3
|
-
Version: 2025.11.26.dev20251126
|
|
3
|
+
Version: 2026.1.6.dev20260106
|
|
4
4
|
Summary: Python module for multimodal document ingestion
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -219,13 +219,15 @@ Requires-Dist: diskcache>=5.6.3
|
|
|
219
219
|
Requires-Dist: fastapi>=0.115.6
|
|
220
220
|
Requires-Dist: fastparquet>=2024.11.0
|
|
221
221
|
Requires-Dist: fsspec>=2024.10.0
|
|
222
|
+
Requires-Dist: universal_pathlib>=0.2.6
|
|
223
|
+
Requires-Dist: s3fs>=2024.10.0
|
|
222
224
|
Requires-Dist: gunicorn
|
|
223
225
|
Requires-Dist: h11>=0.16.0
|
|
224
226
|
Requires-Dist: httpx>=0.28.1
|
|
225
227
|
Requires-Dist: isodate>=0.7.2
|
|
226
228
|
Requires-Dist: langdetect>=1.0.9
|
|
227
229
|
Requires-Dist: minio>=7.2.12
|
|
228
|
-
Requires-Dist: librosa
|
|
230
|
+
Requires-Dist: librosa==0.10.2
|
|
229
231
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
230
232
|
Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
|
|
231
233
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/SOURCES.txt
RENAMED
|
@@ -63,6 +63,7 @@ nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py
|
|
|
63
63
|
nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py
|
|
64
64
|
nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py
|
|
65
65
|
nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py
|
|
66
|
+
nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py
|
|
66
67
|
nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py
|
|
67
68
|
nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py
|
|
68
69
|
nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/requires.txt
RENAMED
|
@@ -4,13 +4,15 @@ diskcache>=5.6.3
|
|
|
4
4
|
fastapi>=0.115.6
|
|
5
5
|
fastparquet>=2024.11.0
|
|
6
6
|
fsspec>=2024.10.0
|
|
7
|
+
universal_pathlib>=0.2.6
|
|
8
|
+
s3fs>=2024.10.0
|
|
7
9
|
gunicorn
|
|
8
10
|
h11>=0.16.0
|
|
9
11
|
httpx>=0.28.1
|
|
10
12
|
isodate>=0.7.2
|
|
11
13
|
langdetect>=1.0.9
|
|
12
14
|
minio>=7.2.12
|
|
13
|
-
librosa
|
|
15
|
+
librosa==0.10.2
|
|
14
16
|
opentelemetry-api>=1.27.0
|
|
15
17
|
opentelemetry-exporter-otlp>=1.27.0
|
|
16
18
|
opentelemetry-sdk>=1.27.0
|
|
@@ -26,13 +26,15 @@ dependencies = [
|
|
|
26
26
|
"fastapi>=0.115.6",
|
|
27
27
|
"fastparquet>=2024.11.0",
|
|
28
28
|
"fsspec>=2024.10.0",
|
|
29
|
+
"universal_pathlib>=0.2.6",
|
|
30
|
+
"s3fs>=2024.10.0",
|
|
29
31
|
"gunicorn",
|
|
30
32
|
"h11>=0.16.0", # Must pin at or above 0.16.0 for CVE mitigation
|
|
31
33
|
"httpx>=0.28.1",
|
|
32
34
|
"isodate>=0.7.2",
|
|
33
35
|
"langdetect>=1.0.9",
|
|
34
36
|
"minio>=7.2.12",
|
|
35
|
-
"librosa
|
|
37
|
+
"librosa==0.10.2",
|
|
36
38
|
"opentelemetry-api>=1.27.0",
|
|
37
39
|
"opentelemetry-exporter-otlp>=1.27.0",
|
|
38
40
|
"opentelemetry-sdk>=1.27.0",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/__init__.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/tracing.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/__init__.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/health.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/ingest.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/metrics.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/README.md
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/__init__.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-2025.11.26.dev20251126 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|