nv-ingest 2025.8.8.dev20250808__tar.gz → 2026.1.12.dev20260112__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/PKG-INFO +6 -3
- nv_ingest-2026.1.12.dev20260112/nv_ingest/api/__init__.py +9 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/main.py +3 -1
- nv_ingest-2026.1.12.dev20260112/nv_ingest/api/tracing.py +82 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v2/README.md +203 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v2/ingest.py +1305 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/execution/helpers.py +85 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/execution/options.py +112 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/execution.py +501 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/strategies.py +218 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/termination.py +147 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +12 -12
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py +64 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +166 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -14
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +140 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/__init__.py +3 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/config/__init__.py +3 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/config/loaders.py +229 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/config/replica_resolver.py +237 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/default_libmode_pipeline_impl.py +529 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/default_pipeline_impl.py +558 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/ingest_pipeline.py +389 -0
- nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/pipeline_schema.py +398 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/PKG-INFO +6 -3
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/SOURCES.txt +25 -3
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/requires.txt +5 -2
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/pyproject.toml +8 -2
- nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -98
- nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
- nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -393
- nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/LICENSE +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/MANIFEST.in +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/api → nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v1}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/v1/metrics.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/api/v1 → nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v2}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/execution}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/edges → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/examples → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/primitives → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/edges}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/examples}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/extractors → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/primitives}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/injectors → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/meta → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/extractors}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/mutate → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/injectors}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/sinks → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/meta}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/sources → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/mutate}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/storage → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/sinks}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/telemetry → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/sources}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/transforms → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/storage}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/utility → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/telemetry}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/transforms}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/utility}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/system_tools → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util/pipeline}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util/system_tools}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/impl → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/impl/ingest → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/meta → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/impl}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/meta → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/impl}/ingest/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/telemetry → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/meta}/__init__.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/version.py +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2026.1.12.dev20260112
|
|
4
4
|
Summary: Python module for multimodal document ingestion
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -219,16 +219,19 @@ Requires-Dist: diskcache>=5.6.3
|
|
|
219
219
|
Requires-Dist: fastapi>=0.115.6
|
|
220
220
|
Requires-Dist: fastparquet>=2024.11.0
|
|
221
221
|
Requires-Dist: fsspec>=2024.10.0
|
|
222
|
+
Requires-Dist: universal_pathlib>=0.2.6
|
|
223
|
+
Requires-Dist: s3fs>=2024.10.0
|
|
222
224
|
Requires-Dist: gunicorn
|
|
223
225
|
Requires-Dist: h11>=0.16.0
|
|
224
226
|
Requires-Dist: httpx>=0.28.1
|
|
225
227
|
Requires-Dist: isodate>=0.7.2
|
|
226
228
|
Requires-Dist: langdetect>=1.0.9
|
|
227
229
|
Requires-Dist: minio>=7.2.12
|
|
228
|
-
Requires-Dist:
|
|
230
|
+
Requires-Dist: librosa==0.10.2
|
|
229
231
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
230
232
|
Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
|
|
231
233
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
234
|
+
Requires-Dist: psutil>=7.1.0
|
|
232
235
|
Requires-Dist: pydantic>2.0.0
|
|
233
236
|
Requires-Dist: pydantic-settings>2.0.0
|
|
234
237
|
Requires-Dist: pypdfium2==4.30.0
|
|
@@ -240,7 +243,7 @@ Requires-Dist: python-docx>=1.1.2
|
|
|
240
243
|
Requires-Dist: python-dotenv>=1.0.1
|
|
241
244
|
Requires-Dist: python-pptx>=1.0.2
|
|
242
245
|
Requires-Dist: prometheus-client
|
|
243
|
-
Requires-Dist: ray[all]>=2.
|
|
246
|
+
Requires-Dist: ray[all]>=2.49.0
|
|
244
247
|
Requires-Dist: redis>=5.2.1
|
|
245
248
|
Requires-Dist: requests>=2.28.2
|
|
246
249
|
Requires-Dist: scikit-learn>=1.6.0
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""nv_ingest.api package."""
|
|
6
|
+
|
|
7
|
+
from .tracing import traced_endpoint # re-export for convenience
|
|
8
|
+
|
|
9
|
+
__all__ = ["traced_endpoint"]
|
|
@@ -15,6 +15,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
|
15
15
|
from .v1.health import router as HealthApiRouter
|
|
16
16
|
from .v1.ingest import router as IngestApiRouter
|
|
17
17
|
from .v1.metrics import router as MetricsApiRouter
|
|
18
|
+
from .v2.ingest import router as IngestApiRouterV2
|
|
18
19
|
|
|
19
20
|
logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
@@ -22,7 +23,7 @@ logger = logging.getLogger(__name__)
|
|
|
22
23
|
app = FastAPI(
|
|
23
24
|
title="NV-Ingest Microservice",
|
|
24
25
|
description="Service for ingesting heterogenous datatypes",
|
|
25
|
-
version="
|
|
26
|
+
version="26.1.0",
|
|
26
27
|
contact={
|
|
27
28
|
"name": "NVIDIA Corporation",
|
|
28
29
|
"url": "https://nvidia.com",
|
|
@@ -33,6 +34,7 @@ app = FastAPI(
|
|
|
33
34
|
app.include_router(IngestApiRouter, prefix="/v1")
|
|
34
35
|
app.include_router(HealthApiRouter, prefix="/v1/health")
|
|
35
36
|
app.include_router(MetricsApiRouter, prefix="/v1")
|
|
37
|
+
app.include_router(IngestApiRouterV2, prefix="/v2")
|
|
36
38
|
|
|
37
39
|
# Set up the tracer provider and add a processor for exporting traces
|
|
38
40
|
resource = Resource(attributes={"service.name": "nv-ingest"})
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""HTTP endpoint tracing utilities."""
|
|
6
|
+
|
|
7
|
+
from functools import wraps
|
|
8
|
+
from inspect import iscoroutinefunction
|
|
9
|
+
from typing import Any, Callable, Optional, TypeVar
|
|
10
|
+
|
|
11
|
+
from fastapi import Request, Response
|
|
12
|
+
from opentelemetry import trace
|
|
13
|
+
|
|
14
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
15
|
+
|
|
16
|
+
tracer = trace.get_tracer(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def traced_endpoint(name: Optional[str] = None) -> Callable[[F], F]:
    """Create a decorator that executes an endpoint inside a tracing span.

    Parameters
    ----------
    name : Optional[str]
        Span name to use. When omitted (or otherwise falsy), the decorated
        function's ``__name__`` is used instead.

    Returns
    -------
    Callable[[F], F]
        A decorator that returns an ``async`` wrapper for coroutine functions
        and a plain wrapper for everything else. Both wrappers are built with
        ``functools.wraps`` so the wrapped callable's metadata is carried over.
    """

    def decorator(func: F) -> F:
        label = name if name else func.__name__

        def _tag_span(active_span: Any, call_args: tuple, call_kwargs: dict) -> None:
            # Shared preamble of both wrappers: identify the endpoint, then
            # record request details if a Request object was passed in.
            active_span.set_attribute("nv_ingest.endpoint", func.__qualname__)
            _record_http_request(active_span, call_args, call_kwargs)

        if not iscoroutinefunction(func):

            @wraps(func)
            def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
                with tracer.start_as_current_span(label) as active_span:
                    _tag_span(active_span, args, kwargs)
                    outcome = func(*args, **kwargs)
                    _record_http_response(active_span, outcome)
                    return outcome

            return sync_wrapper  # type: ignore[return-value]

        @wraps(func)
        async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
            with tracer.start_as_current_span(label) as active_span:
                _tag_span(active_span, args, kwargs)
                # The await happens inside the span so the span stays open
                # for the full duration of the endpoint coroutine.
                outcome = await func(*args, **kwargs)
                _record_http_response(active_span, outcome)
                return outcome

        return async_wrapper  # type: ignore[return-value]

    return decorator
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _record_http_request(span, args: tuple[Any, ...], kwargs: dict[str, Any]) -> None:
    """Copy the HTTP method and URL of the call's ``Request`` onto ``span``.

    The first ``Request`` instance found among ``args`` and then ``kwargs``
    is used. When the call carries no ``Request``, the span is left untouched.
    """
    incoming = _find_type(Request, args, kwargs)
    if incoming is not None:
        span.set_attribute("http.method", incoming.method)
        # The URL is stringified before being stored as the attribute value.
        span.set_attribute("http.url", str(incoming.url))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _record_http_response(span, response: Any) -> None:
    """Record the HTTP status code on ``span`` when the endpoint returned a ``Response``.

    Parameters
    ----------
    span
        The active span; only ``set_attribute`` is called on it.
    response : Any
        Whatever the wrapped endpoint returned. Values that are not
        ``Response`` instances (dicts, models, ``None``, ...) are ignored and
        no ``http.status_code`` attribute is written for them.
    """
    # A single isinstance check is sufficient: the previous fallback search
    # over ``(response,)`` re-tested the same object against the same type
    # and therefore could never find anything the first check had missed.
    if not isinstance(response, Response):
        return
    span.set_attribute("http.status_code", response.status_code)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _find_type(expected_type: type, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Optional[Any]:
|
|
74
|
+
"""Return the first argument matching ``expected_type`` from args or kwargs."""
|
|
75
|
+
|
|
76
|
+
for arg in args:
|
|
77
|
+
if isinstance(arg, expected_type):
|
|
78
|
+
return arg
|
|
79
|
+
for value in kwargs.values():
|
|
80
|
+
if isinstance(value, expected_type):
|
|
81
|
+
return value
|
|
82
|
+
return None
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# NV-Ingest V2 API
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
The V2 API introduces automatic PDF splitting at the REST layer to improve processing throughput. When a multi-page PDF is submitted, it is automatically split into multi-page chunks of configurable size (default: 32 pages) before being sent to the Redis service, which in turn communicates with the Ray processing backend.
|
|
6
|
+
|
|
7
|
+
## Key Changes from V1
|
|
8
|
+
|
|
9
|
+
1. **Automatic PDF Splitting**: PDFs with more pages than the configured `PDF_SPLIT_PAGE_COUNT` are automatically split into multi-page chunks
|
|
10
|
+
2. **Parent-Child Job Tracking**: Parent jobs maintain relationships with their subjobs via Redis
|
|
11
|
+
3. **Transparent Aggregation**: Results are automatically aggregated when fetching parent jobs
|
|
12
|
+
4. **Backward Compatible**: PDFs with page counts ≤ `PDF_SPLIT_PAGE_COUNT` behave identically to V1
|
|
13
|
+
|
|
14
|
+
## How It Works
|
|
15
|
+
|
|
16
|
+
1. **Submit**: When a PDF whose page count exceeds `PDF_SPLIT_PAGE_COUNT` is submitted to `/v2/submit_job`:
|
|
17
|
+
- The PDF is split into page chunks (size determined by `PDF_SPLIT_PAGE_COUNT`)
|
|
18
|
+
- Each chunk becomes a subjob with deterministic IDs derived from the parent
|
|
19
|
+
- Source IDs are modified to maintain association: `document.pdf#page_1`
|
|
20
|
+
- Parent-child mapping is stored in Redis
|
|
21
|
+
|
|
22
|
+
2. **Processing**: Each subjob is processed independently by Ray, appearing as a chunk-sized PDF that honors the configured `PDF_SPLIT_PAGE_COUNT`
|
|
23
|
+
|
|
24
|
+
3. **Fetch**: When fetching the parent job via `/v2/fetch_job/{parent_id}`:
|
|
25
|
+
- Subjob states and results are retrieved concurrently (bounded by the Redis connection pool)
|
|
26
|
+
- If all complete, results are aggregated in original page order
|
|
27
|
+
- Pending work returns 202 (processing)
|
|
28
|
+
- Failed chunks are noted without failing the entire job; metadata records which chunks failed
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
## Client Library Features
|
|
32
|
+
|
|
33
|
+
### Accessing Trace Metrics
|
|
34
|
+
|
|
35
|
+
The Python client library provides convenient access to trace metrics via the `return_traces` parameter:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from nv_ingest_client.client import Ingestor
|
|
39
|
+
|
|
40
|
+
ingestor = Ingestor(
|
|
41
|
+
message_client_hostname="localhost",
|
|
42
|
+
message_client_port=7670,
|
|
43
|
+
message_client_kwargs={"api_version": "v2"}
|
|
44
|
+
).files("/path/to/pdfs").extract().embed()
|
|
45
|
+
|
|
46
|
+
# Get results with trace metrics
|
|
47
|
+
results, traces = ingestor.ingest(return_traces=True)
|
|
48
|
+
|
|
49
|
+
# Access timing for first document
|
|
50
|
+
pdf_time = traces[0]["trace::resident_time::pdf_extractor"] / 1e9
|
|
51
|
+
table_time = traces[0]["trace::resident_time::table_extractor"] / 1e9
|
|
52
|
+
print(f"PDF: {pdf_time:.2f}s, Tables: {table_time:.2f}s")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**Note:** For split PDFs, `resident_time` represents aggregated compute time across all chunks. For non-split PDFs, it is computed client-side from entry/exit pairs.
|
|
56
|
+
|
|
57
|
+
### Aggregated response
|
|
58
|
+
|
|
59
|
+
The fetch endpoint returns a JSON body shaped like the following:
|
|
60
|
+
|
|
61
|
+
```json
|
|
62
|
+
{
|
|
63
|
+
"data": [...],
|
|
64
|
+
"status": "success",
|
|
65
|
+
"trace": {
|
|
66
|
+
"trace::entry::pdf_extractor": 1000,
|
|
67
|
+
"trace::exit::pdf_extractor": 2150,
|
|
68
|
+
"trace::resident_time::pdf_extractor": 250,
|
|
69
|
+
"trace::entry::table_extractor": 1200,
|
|
70
|
+
"trace::exit::table_extractor": 2300,
|
|
71
|
+
"trace::resident_time::table_extractor": 300
|
|
72
|
+
// ... parent-level aggregated traces only (clean, V1-compatible)
|
|
73
|
+
},
|
|
74
|
+
"annotations": {
|
|
75
|
+
"annotation::uuid-1": {"task_id": "pdf_extractor", "task_result": "SUCCESS"},
|
|
76
|
+
"annotation::uuid-2": {"task_id": "table_extractor", "task_result": "SUCCESS"}
|
|
77
|
+
// ... all annotations from all chunks (annotations have unique UUIDs)
|
|
78
|
+
},
|
|
79
|
+
"metadata": {
|
|
80
|
+
"parent_job_id": "<uuid>",
|
|
81
|
+
"total_pages": 320,
|
|
82
|
+
"pages_per_chunk": 32,
|
|
83
|
+
"original_source_id": "document.pdf",
|
|
84
|
+
"subjob_ids": ["...", "..."],
|
|
85
|
+
"subjobs_failed": 0,
|
|
86
|
+
"failed_subjobs": [],
|
|
87
|
+
"chunks": [
|
|
88
|
+
{
|
|
89
|
+
"job_id": "...",
|
|
90
|
+
"chunk_index": 1,
|
|
91
|
+
"start_page": 1,
|
|
92
|
+
"end_page": 32,
|
|
93
|
+
"page_count": 32
|
|
94
|
+
}
|
|
95
|
+
// ... additional chunks ...
|
|
96
|
+
],
|
|
97
|
+
"trace_segments": [
|
|
98
|
+
{
|
|
99
|
+
"job_id": "...",
|
|
100
|
+
"chunk_index": 1,
|
|
101
|
+
"start_page": 1,
|
|
102
|
+
"end_page": 32,
|
|
103
|
+
"trace": {"trace::entry::pdf_extractor": 1.7599e18, ...}
|
|
104
|
+
}
|
|
105
|
+
// ... per-chunk trace details
|
|
106
|
+
],
|
|
107
|
+
"annotation_segments": [
|
|
108
|
+
{
|
|
109
|
+
"job_id": "...",
|
|
110
|
+
"chunk_index": 1,
|
|
111
|
+
"start_page": 1,
|
|
112
|
+
"end_page": 32,
|
|
113
|
+
"annotations": {"annotation::uuid": {...}, ...}
|
|
114
|
+
}
|
|
115
|
+
// ... per-chunk annotation details
|
|
116
|
+
]
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Top-level trace and annotations** (V1 compatibility):
|
|
122
|
+
- `trace`: Contains **only parent-level aggregated traces** for clean V1 compatibility
|
|
123
|
+
- `trace::entry::<stage>` - Earliest entry time across all chunks
|
|
124
|
+
- `trace::exit::<stage>` - Latest exit time across all chunks
|
|
125
|
+
- `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time)
|
|
126
|
+
- `annotations`: Merged annotations from all chunks (annotations have unique UUIDs so merge safely)
|
|
127
|
+
- These fields match V1 structure, allowing existing client code to work without modification
|
|
128
|
+
|
|
129
|
+
**Note:** Chunk-level trace details are available in `metadata.trace_segments[]` for granular analysis
|
|
130
|
+
|
|
131
|
+
**Parent-Level Trace Aggregation:**
|
|
132
|
+
|
|
133
|
+
For split PDFs, parent-level metrics are automatically computed for each stage (including nested stages):
|
|
134
|
+
|
|
135
|
+
- `trace::entry::<stage>` - Earliest entry time across all chunks (when first chunk entered stage)
|
|
136
|
+
- `trace::exit::<stage>` - Latest exit time across all chunks (when last chunk exited stage)
|
|
137
|
+
- `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time in stage)
|
|
138
|
+
|
|
139
|
+
**Supports arbitrary nesting depth:**
|
|
140
|
+
- Simple: `trace::entry::pdf_extractor`
|
|
141
|
+
- Nested: `trace::entry::pdf_extractor::pdf_extraction::pdfium_pages_to_numpy_0`
|
|
142
|
+
|
|
143
|
+
**Example:**
|
|
144
|
+
```json
|
|
145
|
+
{
|
|
146
|
+
"trace": {
|
|
147
|
+
"trace::entry::pdf_extractor": 1000,
|
|
148
|
+
"trace::exit::pdf_extractor": 2150,
|
|
149
|
+
"trace::resident_time::pdf_extractor": 250
|
|
150
|
+
// ... only parent-level aggregations (clean, concise)
|
|
151
|
+
},
|
|
152
|
+
"metadata": {
|
|
153
|
+
"trace_segments": [
|
|
154
|
+
{
|
|
155
|
+
"chunk_index": 1,
|
|
156
|
+
"start_page": 1,
|
|
157
|
+
"end_page": 32,
|
|
158
|
+
"trace": {
|
|
159
|
+
"trace::entry::pdf_extractor": 1000,
|
|
160
|
+
"trace::exit::pdf_extractor": 1100
|
|
161
|
+
}
|
|
162
|
+
},
|
|
163
|
+
{
|
|
164
|
+
"chunk_index": 2,
|
|
165
|
+
"trace": {
|
|
166
|
+
"trace::entry::pdf_extractor": 2000,
|
|
167
|
+
"trace::exit::pdf_extractor": 2150
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
]
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Note:** `resident_time` represents total compute time (sum of chunk durations), while `exit - entry` shows wall-clock span.
|
|
176
|
+
|
|
177
|
+
**Detailed metadata** (V2-specific):
|
|
178
|
+
- `trace_segments`: **Chunk-level trace data** with page ranges for granular per-chunk analysis
|
|
179
|
+
- `annotation_segments`: Per-chunk annotation data with page ranges
|
|
180
|
+
- Clients can correlate chunk data by matching `job_id` or `chunk_index` across arrays
|
|
181
|
+
- Failed chunk entries remain in `failed_subjobs`; missing chunks indicate the sink did not emit telemetry
|
|
182
|
+
- **To access chunk traces:** Use `metadata.trace_segments[]` - each segment contains the full trace dict for that chunk
|
|
183
|
+
|
|
184
|
+
### Advanced: Accessing Full Metadata
|
|
185
|
+
|
|
186
|
+
For advanced use cases requiring per-chunk trace breakdown or full metadata, use `include_parent_trace_ids`:
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
results, traces, parent_trace_ids = ingestor.ingest(
|
|
190
|
+
return_traces=True,
|
|
191
|
+
include_parent_trace_ids=True
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
# Fetch full parent job metadata (including trace_segments)
|
|
195
|
+
import requests
|
|
196
|
+
response = requests.get(f"http://localhost:7670/v2/fetch_job/{parent_trace_ids[0]}")
|
|
197
|
+
metadata = response.json()["metadata"]
|
|
198
|
+
|
|
199
|
+
# Access per-chunk traces
|
|
200
|
+
for segment in metadata["trace_segments"]:
|
|
201
|
+
print(f"Chunk {segment['chunk_index']}: pages {segment['start_page']}-{segment['end_page']}")
|
|
202
|
+
print(f" Traces: {len(segment['trace'])} entries")
|
|
203
|
+
```
|