nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from abc import ABC
|
|
7
|
+
from typing import Optional, Any
|
|
8
|
+
|
|
9
|
+
import ray
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class RayActorSinkStage(RayActorStage, ABC):
    """
    Abstract base class for sink stages in a RayPipeline.

    A sink stage terminates a pipeline branch: it rejects any attempt to attach
    an output queue, and its processing loop hands every message it reads to
    `on_data` without forwarding a result anywhere.
    """

    @ray.method(num_returns=1)
    def set_output_queue(self, queue_handle: Any) -> bool:
        """
        Reject any attempt to attach an output queue to a sink stage.

        Parameters
        ----------
        queue_handle : Any
            The queue the caller tried to attach; it is never used.

        Raises
        ------
        NotImplementedError
            Always, because sink stages have no downstream queue.
        """
        raise NotImplementedError("Sink stages do not support an output queue.")

    def _processing_loop(self) -> None:
        """
        Read messages and hand them to `on_data` until `self._running` is falsy.

        Each iteration:

        1. Calls `self._read_input()`. A `None` result means "nothing to do" and
           the loop re-checks `self._running`.
        2. Otherwise increments `stats["successful_queue_reads"]`, sets
           `self._active_processing`, calls `self.on_data(message)` and
           increments `stats["processed"]`. The return value of `on_data` is
           discarded; nothing is written to an output queue.

        Exceptions raised while reading or processing a single message are
        logged and the loop continues after a 0.1 s pause (skipped when the
        stage is already stopping). `self._active_processing` is reset after
        every iteration, whatever its outcome.

        When the loop ends, normally or through an unexpected error, this method
        sets `self._shutdown_signal_complete = True`. It does not itself call or
        schedule any actor-exit routine.
        """
        actor_id_str = self._get_actor_id_str()
        logger.debug(f"{actor_id_str}: Processing loop thread starting.")

        try:
            # Loop continues as long as the actor is marked as running
            while self._running:
                control_message: Optional[Any] = None
                try:
                    # Step 1: Attempt to get work from the input source
                    control_message = self._read_input()

                    # No message available: go back and re-check self._running
                    if control_message is None:
                        continue

                    self.stats["successful_queue_reads"] += 1

                    # Step 2: Process the retrieved message
                    self._active_processing = True  # Mark as busy
                    self.on_data(control_message)

                    self.stats["processed"] += 1

                except Exception as e:
                    # Log per-item failures but keep the loop alive. Compare against
                    # None explicitly so a falsy-but-real message still has its type
                    # reported.
                    cm_info = (
                        f" (message type: {type(control_message).__name__})" if control_message is not None else ""
                    )
                    logger.exception(f"{actor_id_str}: Error processing item{cm_info}: {e}")

                    # Avoid busy-spinning in case of persistent errors reading or processing
                    if self._running:
                        time.sleep(0.1)
                finally:
                    # Always clear the busy flag, including on the `continue` path
                    self._active_processing = False

            # --- Loop Exit ---
            logger.debug(
                f"{actor_id_str}: Graceful exit condition met (self._running is False). Processing loop terminating."
            )

        except Exception as e:
            # Catch unexpected errors in the loop structure itself.
            # NOTE(review): this handler and the `finally` below use `self._logger`
            # while the rest of the method uses the module-level `logger`; kept
            # as-is because they may be configured differently - confirm intent.
            self._logger.exception(f"{actor_id_str}: Unexpected error caused processing loop termination: {e}")
        finally:
            self._logger.debug(f"{actor_id_str}: Processing loop thread finished.")
            # Record that the loop has fully finished
            self._shutdown_signal_complete = True
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import Any
|
|
7
|
+
import ray
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RayActorSourceStage(RayActorStage, ABC):
    """
    Abstract base class for source stages in a RayPipeline.

    A source stage has no upstream queue: attaching one is rejected, and
    concrete subclasses supply incoming control messages by implementing
    `_read_input`. The class also keeps a `paused` flag that can be toggled
    through `pause()` and `resume()`.
    """

    def __init__(self, config: Any, log_to_stdout=False) -> None:
        """
        Initialise the base stage and start in the un-paused state.

        Parameters
        ----------
        config : Any
            Stage configuration, forwarded unchanged to the base class.
        log_to_stdout : bool, optional
            Forwarded unchanged to the base class. Defaults to False.
        """
        super().__init__(config, log_to_stdout=log_to_stdout)
        # Toggled by pause()/resume(). This class only stores the flag; acting
        # on it is left to code outside this block.
        self.paused = False

    @ray.method(num_returns=1)
    def set_input_queue(self, queue_handle: Any) -> bool:
        """
        Reject any attempt to attach an input queue to a source stage.

        Raises
        ------
        NotImplementedError
            Always, because source stages have no upstream queue.
        """
        raise NotImplementedError("Source stages do not support an input queue.")

    def get_input(self) -> Any:
        """
        Optional hook for fetching a control message from an external source.

        The base implementation does nothing and returns None. It is not
        abstract, so subclasses may override it but are not forced to.
        """
        return None

    @abstractmethod
    def _read_input(self) -> Any:
        """
        Produce the next control message for this stage.

        Subclasses must override this method. The base body is a no-op that
        returns None.
        """
        return None

    @ray.method(num_returns=1)
    def pause(self) -> bool:
        """
        Mark the source stage as paused.

        Returns
        -------
        bool
            Always True.
        """
        self.paused = True
        logger.info("Source stage paused.")
        return True

    @ray.method(num_returns=1)
    def resume(self) -> bool:
        """
        Clear the paused mark on the source stage.

        Returns
        -------
        bool
            Always True.
        """
        self.paused = False
        logger.info("Source stage resumed.")
        return True
|