nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (100) hide show
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
@@ -0,0 +1,82 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import time
6
+ from abc import ABC
7
+ from typing import Optional, Any
8
+
9
+ import ray
10
+ import logging
11
+
12
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class RayActorSinkStage(RayActorStage, ABC):
    """
    Abstract base class for sink stages in a RayPipeline.

    Sink stages do not support an output queue: `set_output_queue` always raises.
    Messages read from the input are handed to `on_data`, and nothing is forwarded
    downstream by this class.

    NOTE(review): the original documentation says subclasses implement `write_output`
    to deliver their final processed messages; that hook is not referenced anywhere
    in this class -- confirm against the concrete sink implementations.
    """

    @ray.method(num_returns=1)
    def set_output_queue(self, queue_handle: Any) -> bool:
        """
        Reject any attempt to attach an output queue.

        Parameters
        ----------
        queue_handle : Any
            Ignored; accepted only to keep the signature call-compatible.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("Sink stages do not support an output queue.")

    def _processing_loop(self) -> None:
        """
        The main processing loop of the sink (intended to run in a background thread).

        While `self._running` is truthy, each iteration:

        1. Reads one control message via `self._read_input()`. A `None` result means
           there is no work; the loop re-checks `self._running` and tries again.
        2. Increments `self.stats["successful_queue_reads"]`, sets
           `self._active_processing`, calls `self.on_data(control_message)` and then
           increments `self.stats["processed"]`.

        Exceptions raised while reading or processing a single message are logged and
        do not terminate the loop; a 0.1 s sleep follows to avoid busy-spinning on
        persistent errors. `self._active_processing` is reset to False after every
        iteration.

        When the loop ends for any reason, `self._shutdown_signal_complete` is set to
        True. No actor-exit call is made from this method.
        """
        actor_id_str = self._get_actor_id_str()
        logger.debug(f"{actor_id_str}: Processing loop thread starting.")

        try:
            # Loop continues as long as the actor is marked as running
            while self._running:
                control_message: Optional[Any] = None
                try:
                    # Step 1: Attempt to get work from the input queue
                    control_message = self._read_input()

                    # If no message, loop back and check self._running again
                    if control_message is None:
                        continue

                    self.stats["successful_queue_reads"] += 1

                    # Step 2: Process the retrieved message
                    self._active_processing = True  # Mark as busy
                    self.on_data(control_message)

                    self.stats["processed"] += 1

                except Exception as e:
                    # Log exceptions during item processing but continue the loop.
                    # Test against None (not truthiness) so that a falsy-but-valid
                    # message still has its type reported, matching the `is None`
                    # check used above.
                    cm_info = (
                        f" (message type: {type(control_message).__name__})" if control_message is not None else ""
                    )
                    logger.exception(f"{actor_id_str}: Error processing item{cm_info}: {e}")

                    # Avoid busy-spinning in case of persistent errors reading or processing
                    if self._running:
                        time.sleep(0.1)
                finally:
                    # Ensure active_processing is reset regardless of success/failure;
                    # this also runs on the `continue` path above.
                    self._active_processing = False

            # --- Loop Exit ---
            logger.debug(
                f"{actor_id_str}: Graceful exit condition met (self._running is False). Processing loop terminating."
            )

        except Exception as e:
            # Catch unexpected errors in the loop structure itself.
            # NOTE(review): this and the `finally` below use `self._logger` while the
            # rest of the method uses the module-level `logger`; `_logger` is assumed
            # to be provided by RayActorStage -- confirm.
            self._logger.exception(f"{actor_id_str}: Unexpected error caused processing loop termination: {e}")
        finally:
            self._logger.debug(f"{actor_id_str}: Processing loop thread finished.")
            # Flag that the loop thread is done, whether it ended cleanly or not.
            self._shutdown_signal_complete = True
@@ -0,0 +1,59 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any
7
+ import ray
8
+ import logging
9
+
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class RayActorSourceStage(RayActorStage, ABC):
    """
    Abstract base class for source stages in a RayPipeline.

    A source stage has no input queue (`set_input_queue` always raises). Concrete
    subclasses must provide `_read_input`, the only abstract method declared here.
    `get_input` is an overridable hook whose default returns None. The class also
    carries a `paused` flag toggled through `pause` / `resume`.
    """

    def __init__(self, config: Any, log_to_stdout=False) -> None:
        """
        Initialise the base stage and start in the un-paused state.

        Parameters
        ----------
        config : Any
            Stage configuration, forwarded unchanged to the parent initialiser.
        log_to_stdout : bool, optional
            Forwarded unchanged to the parent initialiser. Defaults to False.
        """
        super().__init__(config, log_to_stdout=log_to_stdout)
        # Flag flipped by pause()/resume(); nothing in this class reads it.
        self.paused = False

    @ray.method(num_returns=1)
    def set_input_queue(self, queue_handle: Any) -> bool:
        """
        Reject any attempt to attach an input queue.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("Source stages do not support an input queue.")

    def get_input(self) -> Any:
        """
        Hook for fetching a control message from an external source.

        The default implementation does nothing and returns None. It is not marked
        abstract, so overriding it is not enforced.
        """
        return None

    @abstractmethod
    def _read_input(self) -> Any:
        """
        Produce the next input for the stage; must be implemented by subclasses.

        NOTE(review): the original documentation states that this "simply calls
        get_input()", but no such delegation exists here -- confirm the intended
        relationship in the concrete subclasses.
        """

    @ray.method(num_returns=1)
    def pause(self) -> bool:
        """
        Mark the source stage as paused.

        Only sets `self.paused = True` and logs the transition. Actually withholding
        writes to the output queue is left to code that consults the flag, which is
        not visible in this class.

        Returns
        -------
        bool
            Always True.
        """
        self.paused = True
        logger.info("Source stage paused.")
        return True

    @ray.method(num_returns=1)
    def resume(self) -> bool:
        """
        Clear the paused mark set by `pause`.

        Only sets `self.paused = False` and logs the transition.

        Returns
        -------
        bool
            Always True.
        """
        self.paused = False
        logger.info("Source stage resumed.")
        return True