nv-ingest 2025.11.4.dev20251104__tar.gz → 2025.11.15.dev20251115__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/PKG-INFO +1 -1
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v2/ingest.py +171 -61
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
- nv_ingest-2025.11.15.dev20251115/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +2 -2
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/default_pipeline_impl.py +25 -1
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest.egg-info/PKG-INFO +1 -1
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest.egg-info/SOURCES.txt +1 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/LICENSE +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/MANIFEST.in +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/main.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/tracing.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/metrics.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v2/README.md +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v2/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/execution/options.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/process/execution.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/process/termination.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/service/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/config/__init__.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/config/loaders.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/config/replica_resolver.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/pipeline_schema.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/version.py +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest.egg-info/requires.txt +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/pyproject.toml +0 -0
- {nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/setup.cfg +0 -0
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v2/ingest.py
RENAMED
|
@@ -13,6 +13,8 @@ import os
|
|
|
13
13
|
import time
|
|
14
14
|
import uuid
|
|
15
15
|
import random
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
import fsspec
|
|
16
18
|
|
|
17
19
|
from fastapi import APIRouter, Request, Response
|
|
18
20
|
from fastapi import HTTPException
|
|
@@ -21,6 +23,8 @@ from redis import RedisError
|
|
|
21
23
|
|
|
22
24
|
from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
|
|
23
25
|
from nv_ingest_api.util.service_clients.client_base import FetchMode
|
|
26
|
+
from nv_ingest_api.util.dataloader.dataloader import DataLoader
|
|
27
|
+
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
|
|
24
28
|
|
|
25
29
|
# For PDF splitting
|
|
26
30
|
import pypdfium2 as pdfium
|
|
@@ -188,28 +192,42 @@ def get_pdf_page_count(pdf_content: bytes) -> int:
|
|
|
188
192
|
return 1 # Assume single page on error
|
|
189
193
|
|
|
190
194
|
|
|
191
|
-
def
|
|
195
|
+
def _create_subjob_dict(
|
|
196
|
+
job_id: str,
|
|
197
|
+
job_payload: Dict[str, Any],
|
|
192
198
|
job_spec_template: Dict[str, Any],
|
|
193
|
-
chunk: Dict[str, Any],
|
|
194
|
-
*,
|
|
195
|
-
parent_uuid: uuid.UUID,
|
|
196
|
-
parent_job_id: str,
|
|
197
199
|
current_trace_id: int,
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
) ->
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
chunk_number = chunk["chunk_index"] + 1
|
|
204
|
-
start_page = chunk["start_page"]
|
|
205
|
-
end_page = chunk["end_page"]
|
|
206
|
-
|
|
207
|
-
subjob_spec = {
|
|
200
|
+
parent_job_id: str,
|
|
201
|
+
start_key: Dict[str, Any],
|
|
202
|
+
) -> Dict[str, Any]:
|
|
203
|
+
job_spec = {
|
|
208
204
|
key: value
|
|
209
205
|
for key, value in job_spec_template.items()
|
|
210
206
|
if key not in {"job_payload", "job_id", "tracing_options"}
|
|
211
207
|
}
|
|
208
|
+
job_spec["job_payload"] = job_payload
|
|
209
|
+
job_spec["job_id"] = job_id
|
|
212
210
|
|
|
211
|
+
base_tracing_options = job_spec_template.get("tracing_options") or {}
|
|
212
|
+
tracing_options = dict(base_tracing_options)
|
|
213
|
+
tracing_options.setdefault("trace", True)
|
|
214
|
+
tracing_options["trace_id"] = str(current_trace_id)
|
|
215
|
+
tracing_options["ts_send"] = int(time.time() * 1000)
|
|
216
|
+
tracing_options["parent_job_id"] = parent_job_id
|
|
217
|
+
for key, value in start_key.items():
|
|
218
|
+
tracing_options[key] = value
|
|
219
|
+
|
|
220
|
+
job_spec["tracing_options"] = tracing_options
|
|
221
|
+
return job_spec
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _create_payload_dict(
|
|
225
|
+
job_spec_template: Dict[str, Any],
|
|
226
|
+
content: str,
|
|
227
|
+
source_id: str,
|
|
228
|
+
source_name: str,
|
|
229
|
+
document_type: str,
|
|
230
|
+
) -> Dict[str, Any]:
|
|
213
231
|
subjob_payload_template = job_spec_template.get("job_payload", {})
|
|
214
232
|
subjob_payload = {
|
|
215
233
|
key: value
|
|
@@ -217,27 +235,40 @@ def _prepare_chunk_submission(
|
|
|
217
235
|
if key not in {"content", "source_id", "source_name"}
|
|
218
236
|
}
|
|
219
237
|
|
|
220
|
-
|
|
221
|
-
subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
|
|
238
|
+
subjob_payload["content"] = [content]
|
|
222
239
|
|
|
223
|
-
|
|
224
|
-
subjob_payload["
|
|
225
|
-
subjob_payload["
|
|
240
|
+
subjob_payload["source_id"] = [source_id]
|
|
241
|
+
subjob_payload["source_name"] = [source_name]
|
|
242
|
+
subjob_payload["document_type"] = [document_type]
|
|
243
|
+
return subjob_payload
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _prepare_chunk_submission(
|
|
247
|
+
job_spec_template: Dict[str, Any],
|
|
248
|
+
chunk: Dict[str, Any],
|
|
249
|
+
*,
|
|
250
|
+
parent_uuid: uuid.UUID,
|
|
251
|
+
parent_job_id: str,
|
|
252
|
+
current_trace_id: int,
|
|
253
|
+
source_id: str,
|
|
254
|
+
source_name: str,
|
|
255
|
+
document_type: str,
|
|
256
|
+
) -> Tuple[str, MessageWrapper]:
|
|
257
|
+
"""Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
|
|
258
|
+
|
|
259
|
+
chunk_number = chunk["chunk_index"] + 1
|
|
226
260
|
|
|
227
261
|
subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
|
|
228
262
|
subjob_id = str(subjob_uuid)
|
|
229
|
-
subjob_spec["job_payload"] = subjob_payload
|
|
230
|
-
subjob_spec["job_id"] = subjob_id
|
|
231
263
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
tracing_options["ts_send"] = int(time.time() * 1000)
|
|
237
|
-
tracing_options["parent_job_id"] = parent_job_id
|
|
238
|
-
tracing_options["page_num"] = start_page
|
|
264
|
+
subjob_payload_template = job_spec_template.get("job_payload", {})
|
|
265
|
+
chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
|
|
266
|
+
subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
|
|
267
|
+
start = chunk["start_page"] if "start_page" in chunk else chunk["start"]
|
|
239
268
|
|
|
240
|
-
subjob_spec
|
|
269
|
+
subjob_spec = _create_subjob_dict(
|
|
270
|
+
subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
|
|
271
|
+
)
|
|
241
272
|
|
|
242
273
|
return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
|
|
243
274
|
|
|
@@ -801,6 +832,8 @@ async def submit_job_v2(
|
|
|
801
832
|
request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
|
|
802
833
|
):
|
|
803
834
|
span = trace.get_current_span()
|
|
835
|
+
source_id = None
|
|
836
|
+
document_type = None
|
|
804
837
|
try:
|
|
805
838
|
span.add_event("Submitting file for processing (V2)")
|
|
806
839
|
|
|
@@ -827,7 +860,19 @@ async def submit_job_v2(
|
|
|
827
860
|
|
|
828
861
|
# Track page count for all PDFs (used for both splitting logic and metadata)
|
|
829
862
|
pdf_page_count_cache = None
|
|
830
|
-
|
|
863
|
+
submission_items: List[Tuple[str, MessageWrapper]] = []
|
|
864
|
+
subjob_ids: List[str] = []
|
|
865
|
+
subjob_descriptors: List[Dict[str, Any]] = []
|
|
866
|
+
parent_metadata: Dict[str, Any] = {}
|
|
867
|
+
submission_items: List[Tuple[str, MessageWrapper]] = []
|
|
868
|
+
try:
|
|
869
|
+
parent_uuid = uuid.UUID(parent_job_id)
|
|
870
|
+
except ValueError:
|
|
871
|
+
logger.warning(
|
|
872
|
+
"Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
|
|
873
|
+
parent_job_id,
|
|
874
|
+
)
|
|
875
|
+
parent_uuid = uuid.uuid4()
|
|
831
876
|
# Check if this is a PDF that needs splitting
|
|
832
877
|
if document_types and payloads and document_types[0].lower() == "pdf":
|
|
833
878
|
# Decode the payload to check page count
|
|
@@ -836,6 +881,7 @@ async def submit_job_v2(
|
|
|
836
881
|
pdf_page_count_cache = page_count # Cache for later use
|
|
837
882
|
qos_tier = get_qos_tier_for_page_count(page_count)
|
|
838
883
|
pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
|
|
884
|
+
document_type = DocumentTypeEnum.PDF
|
|
839
885
|
|
|
840
886
|
# Split if the document has more pages than our chunk size
|
|
841
887
|
if page_count > pages_per_chunk:
|
|
@@ -846,13 +892,11 @@ async def submit_job_v2(
|
|
|
846
892
|
page_count,
|
|
847
893
|
qos_tier,
|
|
848
894
|
)
|
|
849
|
-
|
|
850
895
|
chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
|
|
851
896
|
|
|
852
897
|
subjob_ids: List[str] = []
|
|
853
898
|
subjob_descriptors: List[Dict[str, Any]] = []
|
|
854
899
|
submission_items: List[Tuple[str, MessageWrapper]] = []
|
|
855
|
-
|
|
856
900
|
try:
|
|
857
901
|
parent_uuid = uuid.UUID(parent_job_id)
|
|
858
902
|
except ValueError:
|
|
@@ -863,14 +907,20 @@ async def submit_job_v2(
|
|
|
863
907
|
parent_uuid = uuid.uuid4()
|
|
864
908
|
|
|
865
909
|
for chunk in chunks:
|
|
910
|
+
start = chunk["start_page"]
|
|
911
|
+
end = chunk["end_page"]
|
|
912
|
+
page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
|
|
913
|
+
source_id = f"{original_source_id}#{page_suffix}"
|
|
914
|
+
source_name = f"{original_source_name}#{page_suffix}"
|
|
866
915
|
subjob_id, subjob_wrapper = _prepare_chunk_submission(
|
|
867
916
|
job_spec_dict,
|
|
868
917
|
chunk,
|
|
918
|
+
document_type=DocumentTypeEnum.PDF,
|
|
869
919
|
parent_uuid=parent_uuid,
|
|
870
920
|
parent_job_id=parent_job_id,
|
|
871
921
|
current_trace_id=current_trace_id,
|
|
872
|
-
|
|
873
|
-
|
|
922
|
+
source_id=source_id,
|
|
923
|
+
source_name=source_name,
|
|
874
924
|
)
|
|
875
925
|
|
|
876
926
|
# Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
|
|
@@ -895,38 +945,98 @@ async def submit_job_v2(
|
|
|
895
945
|
"page_count": chunk.get("page_count"),
|
|
896
946
|
}
|
|
897
947
|
)
|
|
948
|
+
parent_metadata.update(
|
|
949
|
+
{
|
|
950
|
+
"total_pages": page_count,
|
|
951
|
+
"pages_per_chunk": pages_per_chunk,
|
|
952
|
+
"original_source_id": original_source_id,
|
|
953
|
+
"original_source_name": original_source_name,
|
|
954
|
+
"document_type": document_types[0] if document_types else "pdf",
|
|
955
|
+
"subjob_order": subjob_ids,
|
|
956
|
+
}
|
|
957
|
+
)
|
|
958
|
+
elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
|
|
959
|
+
document_type = document_types[0]
|
|
960
|
+
upload_path = f"./{Path(original_source_id).name}"
|
|
961
|
+
# dump the payload to a file, just came from client
|
|
962
|
+
with fsspec.open(upload_path, "wb") as f:
|
|
963
|
+
f.write(base64.b64decode(payloads[0]))
|
|
964
|
+
dataloader = DataLoader(
|
|
965
|
+
path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
|
|
966
|
+
)
|
|
967
|
+
document_type = DocumentTypeEnum.MP3
|
|
968
|
+
|
|
969
|
+
parent_uuid = uuid.UUID(parent_job_id)
|
|
970
|
+
for task in job_spec_dict["tasks"]:
|
|
971
|
+
if "task_properties" in task and "document_type" in task["task_properties"]:
|
|
972
|
+
task["task_properties"]["document_type"] = document_type
|
|
973
|
+
end = 0
|
|
974
|
+
for idx, (file_path, duration) in enumerate(dataloader.files_completed):
|
|
975
|
+
start = end
|
|
976
|
+
end = int(start + duration)
|
|
977
|
+
chunk = {
|
|
978
|
+
"bytes": file_path.encode("utf-8"),
|
|
979
|
+
"chunk_index": idx,
|
|
980
|
+
"start": start,
|
|
981
|
+
"end": end,
|
|
982
|
+
}
|
|
898
983
|
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
984
|
+
subjob_id, subjob_wrapper = _prepare_chunk_submission(
|
|
985
|
+
job_spec_dict,
|
|
986
|
+
chunk,
|
|
987
|
+
parent_uuid=parent_uuid,
|
|
988
|
+
parent_job_id=parent_job_id,
|
|
989
|
+
current_trace_id=current_trace_id,
|
|
990
|
+
source_id=file_path,
|
|
991
|
+
source_name=upload_path,
|
|
992
|
+
document_type=document_type,
|
|
993
|
+
)
|
|
908
994
|
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
995
|
+
submission_items.append((subjob_id, subjob_wrapper))
|
|
996
|
+
subjob_ids.append(subjob_id)
|
|
997
|
+
subjob_descriptors.append(
|
|
998
|
+
{
|
|
999
|
+
"job_id": subjob_id,
|
|
1000
|
+
"chunk_index": idx + 1,
|
|
1001
|
+
"start_page": chunk.get("start"),
|
|
1002
|
+
"end_page": chunk.get("end"),
|
|
1003
|
+
"page_count": chunk.get("page_count", 0),
|
|
1004
|
+
}
|
|
1005
|
+
)
|
|
1006
|
+
logger.error(f"Removing uploaded file {upload_path}")
|
|
1007
|
+
os.remove(upload_path)
|
|
1008
|
+
|
|
1009
|
+
if submission_items:
|
|
1010
|
+
burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
|
|
1011
|
+
await _submit_subjobs_in_bursts(
|
|
1012
|
+
submission_items,
|
|
1013
|
+
ingest_service,
|
|
1014
|
+
burst_size=burst_size,
|
|
1015
|
+
pause_ms=pause_ms,
|
|
1016
|
+
jitter_ms=jitter_ms,
|
|
1017
|
+
)
|
|
1018
|
+
|
|
1019
|
+
parent_metadata.update(
|
|
1020
|
+
{
|
|
912
1021
|
"original_source_id": original_source_id,
|
|
913
1022
|
"original_source_name": original_source_name,
|
|
914
|
-
"document_type":
|
|
1023
|
+
"document_type": document_type,
|
|
915
1024
|
"subjob_order": subjob_ids,
|
|
916
1025
|
}
|
|
1026
|
+
)
|
|
1027
|
+
# raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
|
|
1028
|
+
await ingest_service.set_parent_job_mapping(
|
|
1029
|
+
parent_job_id,
|
|
1030
|
+
subjob_ids,
|
|
1031
|
+
parent_metadata,
|
|
1032
|
+
subjob_descriptors=subjob_descriptors,
|
|
1033
|
+
)
|
|
917
1034
|
|
|
918
|
-
|
|
919
|
-
parent_job_id,
|
|
920
|
-
subjob_ids,
|
|
921
|
-
parent_metadata,
|
|
922
|
-
subjob_descriptors=subjob_descriptors,
|
|
923
|
-
)
|
|
924
|
-
|
|
925
|
-
await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
|
|
1035
|
+
await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
|
|
926
1036
|
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
1037
|
+
span.add_event(f"Split into {len(subjob_ids)} subjobs")
|
|
1038
|
+
response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
|
|
1039
|
+
return parent_job_id
|
|
930
1040
|
|
|
931
1041
|
# For non-PDFs or cases where splitting is not required, submit as normal
|
|
932
1042
|
if "tracing_options" not in job_spec_dict:
|
|
@@ -982,8 +1092,8 @@ async def submit_job_v2(
|
|
|
982
1092
|
return parent_job_id
|
|
983
1093
|
|
|
984
1094
|
except Exception as ex:
|
|
985
|
-
logger.exception(f"Error submitting job: {str(ex)}")
|
|
986
|
-
raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
|
|
1095
|
+
logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
|
|
1096
|
+
raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")
|
|
987
1097
|
|
|
988
1098
|
|
|
989
1099
|
# GET /v2/fetch_job
|
|
@@ -152,11 +152,11 @@ if __name__ == "__main__":
|
|
|
152
152
|
os.environ["OCR_MODEL_NAME"] = "paddle"
|
|
153
153
|
os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
154
154
|
os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
155
|
-
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/
|
|
155
|
+
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
|
|
156
156
|
logger.info("Environment variables set.")
|
|
157
157
|
|
|
158
158
|
image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
159
|
-
model_name = "nvidia/
|
|
159
|
+
model_name = "nvidia/nemotron-nano-12b-v2-vl"
|
|
160
160
|
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
161
161
|
(
|
|
162
162
|
yolox_table_structure_grpc,
|
|
@@ -5,7 +5,6 @@
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
from typing import Optional
|
|
8
|
-
|
|
9
8
|
import ray
|
|
10
9
|
|
|
11
10
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
|
|
|
67
66
|
# Extract the DataFrame payload.
|
|
68
67
|
df_ledger = control_message.payload()
|
|
69
68
|
self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
|
|
70
|
-
|
|
71
69
|
# Remove the "extract" task from the message to obtain task-specific configuration.
|
|
72
70
|
task_config = remove_task_by_type(control_message, "extract")
|
|
73
71
|
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import ray
|
|
7
|
+
|
|
8
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
9
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
10
|
+
from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
|
|
11
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
12
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
13
|
+
from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
|
|
14
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@ray.remote
|
|
23
|
+
class OCRExtractorStage(RayActorStage):
|
|
24
|
+
"""
|
|
25
|
+
A Ray actor stage that extracts text data from image content.
|
|
26
|
+
|
|
27
|
+
It expects an IngestControlMessage containing a DataFrame with image data. It then:
|
|
28
|
+
1. Removes the "ocr_data_extract" task from the message.
|
|
29
|
+
2. Calls the text extraction logic using a validated configuration.
|
|
30
|
+
3. Updates the message payload with the extracted text DataFrame.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
34
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
35
|
+
try:
|
|
36
|
+
self.validated_config = config
|
|
37
|
+
self._logger.info("OCRExtractorStage configuration validated successfully.")
|
|
38
|
+
except Exception as e:
|
|
39
|
+
self._logger.exception(f"Error validating Text extractor config: {e}")
|
|
40
|
+
raise
|
|
41
|
+
|
|
42
|
+
@nv_ingest_node_failure_try_except()
|
|
43
|
+
@traceable()
|
|
44
|
+
@udf_intercept_hook()
|
|
45
|
+
@filter_by_task(required_tasks=["ocr_data_extract"])
|
|
46
|
+
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
|
+
# Extract DataFrame payload
|
|
48
|
+
df_ledger = control_message.payload()
|
|
49
|
+
if df_ledger.empty:
|
|
50
|
+
return control_message
|
|
51
|
+
|
|
52
|
+
# Remove the "ocr_data_extract" task from the message to obtain its task-specific configuration
|
|
53
|
+
task_config = remove_task_by_type(control_message, "ocr_data_extract")
|
|
54
|
+
|
|
55
|
+
execution_trace_log = {}
|
|
56
|
+
new_df, extraction_info = extract_text_data_from_image_internal(
|
|
57
|
+
df_extraction_ledger=df_ledger,
|
|
58
|
+
task_config=task_config,
|
|
59
|
+
extraction_config=self.validated_config,
|
|
60
|
+
execution_trace_log=execution_trace_log,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
control_message.payload(new_df)
|
|
64
|
+
control_message.set_metadata("ocr_extraction_info", extraction_info)
|
|
65
|
+
|
|
66
|
+
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
67
|
+
if do_trace_tagging and execution_trace_log:
|
|
68
|
+
parent_name = self.stage_name if self.stage_name else "ocr_extractor"
|
|
69
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
70
|
+
|
|
71
|
+
return control_message
|
|
@@ -318,8 +318,8 @@ stages:
|
|
|
318
318
|
actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
|
|
319
319
|
config:
|
|
320
320
|
api_key: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
321
|
-
endpoint_url: $VLM_CAPTION_ENDPOINT|"
|
|
322
|
-
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/
|
|
321
|
+
endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
|
|
322
|
+
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
|
|
323
323
|
prompt: "Caption the content of this image:"
|
|
324
324
|
replicas:
|
|
325
325
|
min_replicas: 0
|
|
@@ -192,6 +192,27 @@ stages:
|
|
|
192
192
|
strategy: "static"
|
|
193
193
|
value: 1
|
|
194
194
|
|
|
195
|
+
- name: "ocr_extractor"
|
|
196
|
+
type: "stage"
|
|
197
|
+
phase: 1 # EXTRACTION
|
|
198
|
+
actor: "nv_ingest.framework.orchestration.ray.stages.extractors.ocr_extractor:OCRExtractorStage"
|
|
199
|
+
config:
|
|
200
|
+
endpoint_config:
|
|
201
|
+
ocr_endpoints: [
|
|
202
|
+
$OCR_GRPC_ENDPOINT|"ocr:8001",
|
|
203
|
+
$OCR_HTTP_ENDPOINT|"http://ocr:8000/v1/infer",
|
|
204
|
+
]
|
|
205
|
+
ocr_infer_protocol: $OCR_INFER_PROTOCOL|grpc
|
|
206
|
+
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
207
|
+
replicas:
|
|
208
|
+
min_replicas: 0
|
|
209
|
+
max_replicas:
|
|
210
|
+
strategy: "static"
|
|
211
|
+
value: 4
|
|
212
|
+
static_replicas:
|
|
213
|
+
strategy: "static"
|
|
214
|
+
value: 3
|
|
215
|
+
|
|
195
216
|
- name: "infographic_extractor"
|
|
196
217
|
type: "stage"
|
|
197
218
|
phase: 1 # EXTRACTION
|
|
@@ -317,7 +338,7 @@ stages:
|
|
|
317
338
|
actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
|
|
318
339
|
config:
|
|
319
340
|
api_key: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
320
|
-
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/
|
|
341
|
+
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
|
|
321
342
|
endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
|
|
322
343
|
prompt: "Caption the content of this image:"
|
|
323
344
|
replicas:
|
|
@@ -461,6 +482,9 @@ edges:
|
|
|
461
482
|
to: "chart_extractor"
|
|
462
483
|
queue_size: 4
|
|
463
484
|
- from: "chart_extractor"
|
|
485
|
+
to: "ocr_extractor"
|
|
486
|
+
queue_size: 8
|
|
487
|
+
- from: "ocr_extractor"
|
|
464
488
|
to: "image_filter"
|
|
465
489
|
queue_size: 4
|
|
466
490
|
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest.egg-info/SOURCES.txt
RENAMED
|
@@ -52,6 +52,7 @@ nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py
|
|
|
52
52
|
nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py
|
|
53
53
|
nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py
|
|
54
54
|
nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py
|
|
55
|
+
nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py
|
|
55
56
|
nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py
|
|
56
57
|
nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py
|
|
57
58
|
nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/tracing.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/__init__.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/health.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/ingest.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v1/metrics.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v2/README.md
RENAMED
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/api/v2/__init__.py
RENAMED
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/framework/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest/pipeline/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-2025.11.4.dev20251104 → nv_ingest-2025.11.15.dev20251115}/nv_ingest.egg-info/requires.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|