nv-ingest 2025.10.8.dev20251008__tar.gz → 2025.10.10.dev20251010__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/PKG-INFO +2 -1
- nv_ingest-2025.10.10.dev20251010/nv_ingest/api/__init__.py +9 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/api/main.py +2 -0
- nv_ingest-2025.10.10.dev20251010/nv_ingest/api/tracing.py +82 -0
- nv_ingest-2025.10.10.dev20251010/nv_ingest/api/v2/README.md +104 -0
- nv_ingest-2025.10.10.dev20251010/nv_ingest/api/v2/ingest.py +816 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +192 -10
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest.egg-info/PKG-INFO +2 -1
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest.egg-info/SOURCES.txt +4 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest.egg-info/requires.txt +1 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/pyproject.toml +1 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/LICENSE +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/MANIFEST.in +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008/nv_ingest/api → nv_ingest-2025.10.10.dev20251010/nv_ingest/api/v1}/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/api/v1/metrics.py +0 -0
- {nv_ingest-2025.10.8.dev20251008/nv_ingest/api/v1 → nv_ingest-2025.10.10.dev20251010/nv_ingest/api/v2}/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/execution/options.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/process/execution.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/process/termination.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/service/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/config/__init__.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/config/loaders.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/config/replica_resolver.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/default_pipeline_impl.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/pipeline/pipeline_schema.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest/version.py +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-2025.10.8.dev20251008 → nv_ingest-2025.10.10.dev20251010}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest
|
|
3
|
-
Version: 2025.10.8.dev20251008
|
|
3
|
+
Version: 2025.10.10.dev20251010
|
|
4
4
|
Summary: Python module for multimodal document ingestion
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -230,6 +230,7 @@ Requires-Dist: openai>=1.82.0
|
|
|
230
230
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
231
231
|
Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
|
|
232
232
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
|
233
|
+
Requires-Dist: psutil>=7.1.0
|
|
233
234
|
Requires-Dist: pydantic>2.0.0
|
|
234
235
|
Requires-Dist: pydantic-settings>2.0.0
|
|
235
236
|
Requires-Dist: pypdfium2==4.30.0
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""nv_ingest.api package."""
|
|
6
|
+
|
|
7
|
+
from .tracing import traced_endpoint # re-export for convenience
|
|
8
|
+
|
|
9
|
+
__all__ = ["traced_endpoint"]
|
|
@@ -15,6 +15,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
|
15
15
|
from .v1.health import router as HealthApiRouter
|
|
16
16
|
from .v1.ingest import router as IngestApiRouter
|
|
17
17
|
from .v1.metrics import router as MetricsApiRouter
|
|
18
|
+
from .v2.ingest import router as IngestApiRouterV2
|
|
18
19
|
|
|
19
20
|
logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
@@ -33,6 +34,7 @@ app = FastAPI(
|
|
|
33
34
|
app.include_router(IngestApiRouter, prefix="/v1")
|
|
34
35
|
app.include_router(HealthApiRouter, prefix="/v1/health")
|
|
35
36
|
app.include_router(MetricsApiRouter, prefix="/v1")
|
|
37
|
+
app.include_router(IngestApiRouterV2, prefix="/v2")
|
|
36
38
|
|
|
37
39
|
# Set up the tracer provider and add a processor for exporting traces
|
|
38
40
|
resource = Resource(attributes={"service.name": "nv-ingest"})
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
"""HTTP endpoint tracing utilities."""
|
|
6
|
+
|
|
7
|
+
from functools import wraps
|
|
8
|
+
from inspect import iscoroutinefunction
|
|
9
|
+
from typing import Any, Callable, Optional, TypeVar
|
|
10
|
+
|
|
11
|
+
from fastapi import Request, Response
|
|
12
|
+
from opentelemetry import trace
|
|
13
|
+
|
|
14
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
15
|
+
|
|
16
|
+
tracer = trace.get_tracer(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def traced_endpoint(name: Optional[str] = None) -> Callable[[F], F]:
|
|
20
|
+
"""Wrap a FastAPI endpoint with a span whose name defaults to the function name.
|
|
21
|
+
|
|
22
|
+
The decorator preserves the wrapped callable's signature so FastAPI can continue
|
|
23
|
+
to perform dependency injection and generate OpenAPI documentation correctly.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
def decorator(func: F) -> F:
|
|
27
|
+
span_name = name or func.__name__
|
|
28
|
+
|
|
29
|
+
if iscoroutinefunction(func):
|
|
30
|
+
|
|
31
|
+
@wraps(func)
|
|
32
|
+
async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
33
|
+
with tracer.start_as_current_span(span_name) as span:
|
|
34
|
+
span.set_attribute("nv_ingest.endpoint", func.__qualname__)
|
|
35
|
+
_record_http_request(span, args, kwargs)
|
|
36
|
+
response = await func(*args, **kwargs)
|
|
37
|
+
_record_http_response(span, response)
|
|
38
|
+
return response
|
|
39
|
+
|
|
40
|
+
return async_wrapper # type: ignore[return-value]
|
|
41
|
+
|
|
42
|
+
@wraps(func)
|
|
43
|
+
def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
44
|
+
with tracer.start_as_current_span(span_name) as span:
|
|
45
|
+
span.set_attribute("nv_ingest.endpoint", func.__qualname__)
|
|
46
|
+
_record_http_request(span, args, kwargs)
|
|
47
|
+
result = func(*args, **kwargs)
|
|
48
|
+
_record_http_response(span, result)
|
|
49
|
+
return result
|
|
50
|
+
|
|
51
|
+
return sync_wrapper # type: ignore[return-value]
|
|
52
|
+
|
|
53
|
+
return decorator
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _record_http_request(span, args: tuple[Any, ...], kwargs: dict[str, Any]) -> None:
|
|
57
|
+
request = _find_type(Request, args, kwargs)
|
|
58
|
+
if request is None:
|
|
59
|
+
return
|
|
60
|
+
span.set_attribute("http.method", request.method)
|
|
61
|
+
span.set_attribute("http.url", str(request.url))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _record_http_response(span, response: Any) -> None:
|
|
65
|
+
maybe_response = response if isinstance(response, Response) else None
|
|
66
|
+
if maybe_response is None:
|
|
67
|
+
maybe_response = _find_type(Response, (response,), {})
|
|
68
|
+
if maybe_response is None:
|
|
69
|
+
return
|
|
70
|
+
span.set_attribute("http.status_code", maybe_response.status_code)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _find_type(expected_type: type, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Optional[Any]:
|
|
74
|
+
"""Return the first argument matching ``expected_type`` from args or kwargs."""
|
|
75
|
+
|
|
76
|
+
for arg in args:
|
|
77
|
+
if isinstance(arg, expected_type):
|
|
78
|
+
return arg
|
|
79
|
+
for value in kwargs.values():
|
|
80
|
+
if isinstance(value, expected_type):
|
|
81
|
+
return value
|
|
82
|
+
return None
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# NV-Ingest V2 API
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
The V2 API introduces automatic PDF splitting at the REST layer to improve processing throughput. When a multi-page PDF is submitted, it's automatically split into configurable multi-page chunks (default 32 pages) before being sent to the Redis service that then communicates with our Ray processing backend.
|
|
6
|
+
|
|
7
|
+
## Key Changes from V1
|
|
8
|
+
|
|
9
|
+
1. **Automatic PDF Splitting**: PDFs over the configured `PDF_SPLIT_PAGE_COUNT` are automatically split into multi-page chunks
|
|
10
|
+
2. **Parent-Child Job Tracking**: Parent jobs maintain relationships with their subjobs via Redis
|
|
11
|
+
3. **Transparent Aggregation**: Results are automatically aggregated when fetching parent jobs
|
|
12
|
+
4. **Backward Compatible**: PDFs with page counts ≤ `PDF_SPLIT_PAGE_COUNT` behave identically to V1
|
|
13
|
+
|
|
14
|
+
## Tracing & Aggregated Metadata
|
|
15
|
+
|
|
16
|
+
- V2 endpoints open an OpenTelemetry span using the shared `traced_endpoint` decorator. The span name defaults to the function name, or can be overridden when applying the decorator.
|
|
17
|
+
- `submit_job_v2` records the parent span's `trace_id` into each subjob's `tracing_options`, enabling downstream Ray stages (e.g., the message broker sink) to attach chunk-level telemetry consistently.
|
|
18
|
+
- Response headers still return `x-trace-id` derived from the active span context, allowing clients to correlate downstream work.
|
|
19
|
+
- When `/v2/fetch_job/{parent_id}` aggregates completed chunks, it captures any `trace` / `annotations` dictionaries emitted by the sink for each subjob and includes them in the response payload (see "Aggregated response" below).
|
|
20
|
+
|
|
21
|
+
This behaviour matches the V1 tracing model and sets the foundation for adding W3C `traceparent` propagation in future changes.
|
|
22
|
+
|
|
23
|
+
## How It Works
|
|
24
|
+
|
|
25
|
+
1. **Submit**: When a PDF whose page count exceeds `PDF_SPLIT_PAGE_COUNT` is submitted to `/v2/submit_job`:
|
|
26
|
+
- The PDF is split into page chunks (size determined by `PDF_SPLIT_PAGE_COUNT`)
|
|
27
|
+
- Each chunk becomes a subjob with deterministic IDs derived from the parent
|
|
28
|
+
- Source IDs are modified to maintain association: `document.pdf#page_1`
|
|
29
|
+
- Parent-child mapping is stored in Redis
|
|
30
|
+
|
|
31
|
+
2. **Processing**: Each subjob is processed independently by Ray, which sees it as a chunk-sized PDF that honors the configured `PDF_SPLIT_PAGE_COUNT`
|
|
32
|
+
|
|
33
|
+
3. **Fetch**: When fetching the parent job via `/v2/fetch_job/{parent_id}`:
|
|
34
|
+
- Subjob states and results are retrieved concurrently (bounded by the Redis connection pool)
|
|
35
|
+
- If all complete, results are aggregated in original page order
|
|
36
|
+
- Pending work returns 202 (processing)
|
|
37
|
+
- Failed chunks are noted without failing the entire job; metadata records which chunks failed
|
|
38
|
+
|
|
39
|
+
### Aggregated response
|
|
40
|
+
|
|
41
|
+
The fetch endpoint returns a JSON body shaped like the following:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
{
|
|
45
|
+
"data": [...],
|
|
46
|
+
"status": "success",
|
|
47
|
+
"metadata": {
|
|
48
|
+
"parent_job_id": "<uuid>",
|
|
49
|
+
"total_pages": 320,
|
|
50
|
+
"pages_per_chunk": 32,
|
|
51
|
+
"original_source_id": "document.pdf",
|
|
52
|
+
"subjob_ids": ["...", "..."],
|
|
53
|
+
"subjobs_failed": 0,
|
|
54
|
+
"failed_subjobs": [],
|
|
55
|
+
"chunks": [
|
|
56
|
+
{
|
|
57
|
+
"job_id": "...",
|
|
58
|
+
"chunk_index": 1,
|
|
59
|
+
"start_page": 1,
|
|
60
|
+
"end_page": 32,
|
|
61
|
+
"page_count": 32
|
|
62
|
+
}
|
|
63
|
+
// ... additional chunks ...
|
|
64
|
+
],
|
|
65
|
+
"trace_segments": [
|
|
66
|
+
{
|
|
67
|
+
"job_id": "...",
|
|
68
|
+
"chunk_index": 1,
|
|
69
|
+
"start_page": 1,
|
|
70
|
+
"end_page": 32,
|
|
71
|
+
"trace": {"trace::sink_push": 1.7285796e+18, ...}
|
|
72
|
+
}
|
|
73
|
+
// ...
|
|
74
|
+
],
|
|
75
|
+
"annotation_segments": [
|
|
76
|
+
{
|
|
77
|
+
"job_id": "...",
|
|
78
|
+
"chunk_index": 1,
|
|
79
|
+
"start_page": 1,
|
|
80
|
+
"end_page": 32,
|
|
81
|
+
"annotations": {"annotation::stage": "sink", ...}
|
|
82
|
+
}
|
|
83
|
+
// ...
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
- `trace_segments` and `annotation_segments` appear only when the sink emits telemetry for a given chunk.
|
|
90
|
+
- Clients can correlate chunk data by matching `job_id` or `chunk_index` across `chunks`, `trace_segments`, and `annotation_segments`.
|
|
91
|
+
- Failed chunk entries remain in `failed_subjobs`; if a chunk is missing from the telemetry arrays, the sink did not emit trace/annotation payloads for that chunk.
|
|
92
|
+
|
|
93
|
+
## Testing
|
|
94
|
+
|
|
95
|
+
Use the V2 test script with the `DATASET_DIR` environment variable:
|
|
96
|
+
```bash
|
|
97
|
+
# Run with V2 endpoints
|
|
98
|
+
DATASET_DIR=/data/splits python scripts/tests/cases/dc20_v2_e2e.py
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Or set the API version for any existing code:
|
|
102
|
+
```bash
|
|
103
|
+
export NV_INGEST_API_VERSION=v2
|
|
104
|
+
```
|