nv-ingest 25.6.26.dev20250626__tar.gz → 25.6.28.dev20250628__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest might be problematic. Click here for more details.
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/PKG-INFO +1 -1
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +33 -5
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -6
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +1 -10
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +47 -2
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +1 -3
- nv_ingest-25.6.28.dev20250628/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +203 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/PKG-INFO +1 -1
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/SOURCES.txt +1 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/LICENSE +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/MANIFEST.in +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/api/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/api/main.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/api/v1/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/api/v1/metrics.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/service/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/version.py +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/requires.txt +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/pyproject.toml +0 -0
- {nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/setup.cfg +0 -0
|
@@ -9,6 +9,7 @@ import threading
|
|
|
9
9
|
from abc import ABC, abstractmethod
|
|
10
10
|
from collections import defaultdict
|
|
11
11
|
from dataclasses import dataclass
|
|
12
|
+
from types import FunctionType
|
|
12
13
|
|
|
13
14
|
import psutil
|
|
14
15
|
import uuid
|
|
@@ -24,6 +25,9 @@ import time
|
|
|
24
25
|
from nv_ingest.framework.orchestration.ray.primitives.pipeline_topology import PipelineTopology, StageInfo
|
|
25
26
|
from nv_ingest.framework.orchestration.ray.primitives.ray_stat_collector import RayStatsCollector
|
|
26
27
|
from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import PIDController, ResourceConstraintManager
|
|
28
|
+
from nv_ingest.framework.orchestration.ray.util.pipeline.tools import wrap_callable_as_stage
|
|
29
|
+
from nv_ingest_api.util.imports.callable_signatures import ingest_stage_callable_signature
|
|
30
|
+
from nv_ingest_api.util.imports.dynamic_resolvers import resolve_callable_from_path
|
|
27
31
|
|
|
28
32
|
logger = logging.getLogger(__name__)
|
|
29
33
|
|
|
@@ -43,7 +47,7 @@ class PipelineInterface(ABC):
|
|
|
43
47
|
Parameters
|
|
44
48
|
----------
|
|
45
49
|
monitor_poll_interval : float
|
|
46
|
-
Interval in seconds for monitoring poll (default: 5.0).
|
|
50
|
+
Interval in seconds for the monitoring poll (default: 5.0).
|
|
47
51
|
scaling_poll_interval : float
|
|
48
52
|
Interval in seconds for scaling decisions (default: 30.0).
|
|
49
53
|
"""
|
|
@@ -270,7 +274,7 @@ class RayPipeline(PipelineInterface):
|
|
|
270
274
|
|
|
271
275
|
logger.info("RayStatsCollector initialized using StatsConfig.")
|
|
272
276
|
|
|
273
|
-
# --- Accessor Methods for
|
|
277
|
+
# --- Accessor Methods for Stat Collector (and internal use) ---
|
|
274
278
|
|
|
275
279
|
def __del__(self):
|
|
276
280
|
try:
|
|
@@ -428,15 +432,39 @@ class RayPipeline(PipelineInterface):
|
|
|
428
432
|
return self
|
|
429
433
|
|
|
430
434
|
def add_stage(
|
|
431
|
-
self,
|
|
435
|
+
self,
|
|
436
|
+
*,
|
|
437
|
+
name: str,
|
|
438
|
+
stage_actor: Any,
|
|
439
|
+
config: BaseModel,
|
|
440
|
+
min_replicas: int = 0,
|
|
441
|
+
max_replicas: int = 1,
|
|
432
442
|
) -> "RayPipeline":
|
|
433
443
|
if min_replicas < 0:
|
|
434
444
|
logger.warning(f"Stage '{name}': min_replicas cannot be negative. Overriding to 0.")
|
|
435
445
|
min_replicas = 0
|
|
446
|
+
|
|
447
|
+
resolved_actor = stage_actor
|
|
448
|
+
|
|
449
|
+
# Support module path (e.g., "mypkg.mymodule:my_lambda")
|
|
450
|
+
if isinstance(stage_actor, str):
|
|
451
|
+
resolved_actor = resolve_callable_from_path(
|
|
452
|
+
callable_path=stage_actor, signature_schema=ingest_stage_callable_signature
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
# Wrap callables
|
|
456
|
+
if isinstance(resolved_actor, FunctionType):
|
|
457
|
+
schema_type = type(config)
|
|
458
|
+
resolved_actor = wrap_callable_as_stage(resolved_actor, schema_type)
|
|
459
|
+
|
|
436
460
|
stage_info = StageInfo(
|
|
437
|
-
name=name,
|
|
461
|
+
name=name,
|
|
462
|
+
callable=resolved_actor,
|
|
463
|
+
config=config,
|
|
464
|
+
min_replicas=min_replicas,
|
|
465
|
+
max_replicas=max_replicas,
|
|
438
466
|
)
|
|
439
|
-
self.topology.add_stage(stage_info)
|
|
467
|
+
self.topology.add_stage(stage_info)
|
|
440
468
|
|
|
441
469
|
return self
|
|
442
470
|
|
|
@@ -23,16 +23,13 @@ class RayActorSourceStage(RayActorStage, ABC):
|
|
|
23
23
|
super().__init__(config, log_to_stdout=log_to_stdout)
|
|
24
24
|
self.paused = False
|
|
25
25
|
|
|
26
|
+
def on_data(self, IngestControlMessage):
|
|
27
|
+
return NotImplemented("Source stages do not implement on_data().")
|
|
28
|
+
|
|
26
29
|
@ray.method(num_returns=1)
|
|
27
30
|
def set_input_queue(self, queue_handle: Any) -> bool:
|
|
28
31
|
raise NotImplementedError("Source stages do not support an input queue.")
|
|
29
32
|
|
|
30
|
-
def get_input(self) -> Any:
|
|
31
|
-
"""
|
|
32
|
-
Source stages must implement get_input() to fetch control messages from an external source.
|
|
33
|
-
"""
|
|
34
|
-
pass
|
|
35
|
-
|
|
36
33
|
@abstractmethod
|
|
37
34
|
def _read_input(self) -> Any:
|
|
38
35
|
"""
|
|
@@ -304,14 +304,6 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
304
304
|
|
|
305
305
|
return control_message
|
|
306
306
|
|
|
307
|
-
def on_data(self, control_message: any) -> any:
|
|
308
|
-
"""
|
|
309
|
-
Process the control message.
|
|
310
|
-
For this source stage, no additional processing is done, so simply return it.
|
|
311
|
-
"""
|
|
312
|
-
self._logger.debug("on_data: Received control message for processing")
|
|
313
|
-
return control_message
|
|
314
|
-
|
|
315
307
|
# In the processing loop, instead of checking a boolean, we wait on the event.
|
|
316
308
|
def _processing_loop(self) -> None:
|
|
317
309
|
"""
|
|
@@ -336,7 +328,6 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
336
328
|
self._active_processing = True
|
|
337
329
|
|
|
338
330
|
self._logger.debug("Control message received; processing data")
|
|
339
|
-
updated_cm = self.on_data(control_message)
|
|
340
331
|
|
|
341
332
|
# Block until not paused using the pause event.
|
|
342
333
|
if self.output_queue is not None:
|
|
@@ -349,7 +340,7 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
349
340
|
|
|
350
341
|
while True:
|
|
351
342
|
try:
|
|
352
|
-
self.output_queue.put(updated_cm)
|
|
343
|
+
self.output_queue.put(control_message)
|
|
353
344
|
self.stats["successful_queue_writes"] += 1
|
|
354
345
|
break
|
|
355
346
|
except Exception:
|
|
@@ -1,11 +1,14 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
1
5
|
import logging
|
|
2
6
|
from typing import Any
|
|
3
7
|
import ray
|
|
4
8
|
|
|
5
|
-
# Assume these imports come from your project:
|
|
6
9
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
7
10
|
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
8
|
-
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type
|
|
11
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
|
|
9
12
|
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
10
13
|
from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema
|
|
11
14
|
from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal
|
|
@@ -72,3 +75,45 @@ class TextSplitterStage(RayActorStage):
|
|
|
72
75
|
logger.info("TextSplitterStage.on_data: Finished processing, returning updated message.")
|
|
73
76
|
|
|
74
77
|
return message
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def text_splitter_fn(control_message: IngestControlMessage, stage_config: TextSplitterSchema) -> IngestControlMessage:
|
|
81
|
+
"""
|
|
82
|
+
Process an incoming IngestControlMessage by splitting and tokenizing its text.
|
|
83
|
+
|
|
84
|
+
Parameters
|
|
85
|
+
----------
|
|
86
|
+
control_message : IngestControlMessage
|
|
87
|
+
The incoming message containing the payload DataFrame.
|
|
88
|
+
|
|
89
|
+
stage_config : BaseModel
|
|
90
|
+
The stage level configuration object
|
|
91
|
+
|
|
92
|
+
Returns
|
|
93
|
+
-------
|
|
94
|
+
IngestControlMessage
|
|
95
|
+
The updated message with its payload transformed.
|
|
96
|
+
"""
|
|
97
|
+
|
|
98
|
+
# Extract the DataFrame payload.
|
|
99
|
+
df_payload = control_message.payload()
|
|
100
|
+
logger.debug("Extracted payload with %d rows.", len(df_payload))
|
|
101
|
+
|
|
102
|
+
# Remove the "split" task to obtain task-specific configuration.
|
|
103
|
+
task_config = remove_task_by_type(control_message, "split")
|
|
104
|
+
logger.debug("Extracted task config: %s", task_config)
|
|
105
|
+
|
|
106
|
+
# Transform the DataFrame (split text and tokenize).
|
|
107
|
+
df_updated = transform_text_split_and_tokenize_internal(
|
|
108
|
+
df_transform_ledger=df_payload,
|
|
109
|
+
task_config=task_config,
|
|
110
|
+
transform_config=stage_config,
|
|
111
|
+
execution_trace_log=None,
|
|
112
|
+
)
|
|
113
|
+
logger.info("TextSplitterStage.on_data: Transformation complete. Updated payload has %d rows.", len(df_updated))
|
|
114
|
+
|
|
115
|
+
# Update the message payload.
|
|
116
|
+
control_message.payload(df_updated)
|
|
117
|
+
logger.info("TextSplitterStage.on_data: Finished processing, returning updated message.")
|
|
118
|
+
|
|
119
|
+
return control_message
|
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
# All rights reserved.
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
|
-
# TODO(Devin)
|
|
6
|
-
# flake8: noqa
|
|
7
5
|
import os
|
|
8
6
|
|
|
9
7
|
import click
|
|
@@ -11,6 +9,7 @@ import logging
|
|
|
11
9
|
|
|
12
10
|
from nv_ingest.framework.orchestration.ray.stages.sinks.default_drain import DefaultDrainSink
|
|
13
11
|
from nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer import OpenTelemetryTracerStage
|
|
12
|
+
from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
|
|
14
13
|
from nv_ingest.framework.schemas.framework_otel_tracer_schema import OpenTelemetryTracerSchema
|
|
15
14
|
from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema
|
|
16
15
|
|
|
@@ -41,7 +40,6 @@ from nv_ingest.framework.orchestration.ray.stages.storage.image_storage import I
|
|
|
41
40
|
from nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings import EmbeddingStorageStage
|
|
42
41
|
from nv_ingest.framework.orchestration.ray.stages.transforms.image_caption import ImageCaptionTransformStage
|
|
43
42
|
from nv_ingest.framework.orchestration.ray.stages.transforms.text_embed import TextEmbeddingTransformStage
|
|
44
|
-
from nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter import TextSplitterStage
|
|
45
43
|
from nv_ingest.framework.schemas.framework_metadata_injector_schema import MetadataInjectorSchema
|
|
46
44
|
from nv_ingest_api.internal.schemas.extract.extract_audio_schema import AudioExtractorSchema
|
|
47
45
|
from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import uuid
|
|
7
|
+
from typing import Callable, Optional, Union, Dict, List, Type
|
|
8
|
+
|
|
9
|
+
import ray
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
13
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
14
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable
|
|
15
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def wrap_callable_as_stage(
|
|
21
|
+
fn: Callable[[object, BaseModel], object],
|
|
22
|
+
schema_type: Type[BaseModel],
|
|
23
|
+
*,
|
|
24
|
+
required_tasks: Optional[List[str]] = None,
|
|
25
|
+
trace_id: Optional[str] = None,
|
|
26
|
+
):
|
|
27
|
+
"""
|
|
28
|
+
Factory to wrap a user-supplied function into a Ray actor, returning a proxy
|
|
29
|
+
for unique, isolated dynamic actor creation.
|
|
30
|
+
|
|
31
|
+
Parameters
|
|
32
|
+
----------
|
|
33
|
+
fn : Callable[[IngestControlMessage, BaseModel], IngestControlMessage]
|
|
34
|
+
The processing function to be wrapped in the Ray actor.
|
|
35
|
+
schema_type : Type[BaseModel]
|
|
36
|
+
Pydantic schema used to validate and pass the stage config.
|
|
37
|
+
required_tasks : Optional[List[str]], optional
|
|
38
|
+
Task names this stage should filter on. If None, no filtering is applied.
|
|
39
|
+
trace_id : Optional[str], optional
|
|
40
|
+
Optional name for trace annotation; defaults to the function name.
|
|
41
|
+
|
|
42
|
+
Returns
|
|
43
|
+
-------
|
|
44
|
+
StageProxy : object
|
|
45
|
+
A factory-like proxy exposing `.remote()` and `.options()` for Ray-idiomatic
|
|
46
|
+
actor creation. Direct instantiation or class method use is not supported.
|
|
47
|
+
|
|
48
|
+
Notes
|
|
49
|
+
-----
|
|
50
|
+
- Each call to `.remote()` or `.options()` generates a new, dynamically created class
|
|
51
|
+
(using `type()`), ensuring Ray treats each as a unique actor type and preventing
|
|
52
|
+
class/actor name collisions or registry issues. This is essential when running
|
|
53
|
+
dynamic or parallel pipelines and tests.
|
|
54
|
+
- Only `.remote(config)` and `.options(...)` (chained with `.remote(config)`) are supported.
|
|
55
|
+
All other class/actor patterns will raise `NotImplementedError`.
|
|
56
|
+
"""
|
|
57
|
+
trace_name = trace_id or fn.__name__
|
|
58
|
+
|
|
59
|
+
def make_actor_class():
|
|
60
|
+
"""
|
|
61
|
+
Dynamically constructs a unique Ray actor class for every call.
|
|
62
|
+
|
|
63
|
+
Engineering Note
|
|
64
|
+
----------------
|
|
65
|
+
This pattern uses Python's `type()` to create a new class object for each actor instance,
|
|
66
|
+
guaranteeing a unique type each time. Ray's internal registry identifies actor types
|
|
67
|
+
by their Python class object. If you use the same class (even with different logic or
|
|
68
|
+
@ray.remote), Ray may reuse or overwrite them, causing hard-to-diagnose bugs in
|
|
69
|
+
parallel or test code. By generating a fresh class each time, we fully isolate state,
|
|
70
|
+
serialization, and Ray's registry—avoiding actor collisions and test pollution.
|
|
71
|
+
|
|
72
|
+
Returns
|
|
73
|
+
-------
|
|
74
|
+
new_class : type
|
|
75
|
+
The dynamically constructed RayActorStage subclass.
|
|
76
|
+
"""
|
|
77
|
+
class_name = f"LambdaStage_{fn.__name__}_{uuid.uuid4().hex[:8]}"
|
|
78
|
+
|
|
79
|
+
def __init__(self, config: Union[Dict, BaseModel]) -> None:
|
|
80
|
+
"""
|
|
81
|
+
Parameters
|
|
82
|
+
----------
|
|
83
|
+
config : Union[Dict, BaseModel]
|
|
84
|
+
Stage configuration, validated against `schema_type`.
|
|
85
|
+
"""
|
|
86
|
+
validated_config = schema_type(**config) if not isinstance(config, schema_type) else config
|
|
87
|
+
super(new_class, self).__init__(validated_config, log_to_stdout=True)
|
|
88
|
+
self.validated_config = validated_config
|
|
89
|
+
self._logger.info(f"{self.__class__.__name__} initialized with validated config.")
|
|
90
|
+
|
|
91
|
+
@traceable(trace_name)
|
|
92
|
+
@nv_ingest_node_failure_try_except(annotation_id=trace_name, raise_on_failure=False)
|
|
93
|
+
@filter_by_task(required_tasks=required_tasks) if required_tasks else (lambda f: f)
|
|
94
|
+
def on_data(self, control_message):
|
|
95
|
+
"""
|
|
96
|
+
Processes a control message using the wrapped function.
|
|
97
|
+
|
|
98
|
+
Parameters
|
|
99
|
+
----------
|
|
100
|
+
control_message : IngestControlMessage
|
|
101
|
+
The message to be processed.
|
|
102
|
+
|
|
103
|
+
Returns
|
|
104
|
+
-------
|
|
105
|
+
IngestControlMessage
|
|
106
|
+
The processed message, or the original on failure.
|
|
107
|
+
"""
|
|
108
|
+
try:
|
|
109
|
+
return fn(control_message, self.validated_config)
|
|
110
|
+
except Exception as e:
|
|
111
|
+
self._logger.exception(f"{self.__class__.__name__} failed: {e}")
|
|
112
|
+
self.stats["errors"] += 1
|
|
113
|
+
return control_message
|
|
114
|
+
|
|
115
|
+
# --- ENGINEERING NOTE ---
|
|
116
|
+
# The `class_dict` collects all the methods and attributes for the dynamic class.
|
|
117
|
+
# This allows us to build a fresh class object per call, preventing Ray from
|
|
118
|
+
# reusing or overwriting global actor types. It is the critical piece for
|
|
119
|
+
# robust dynamic actor creation in Ray!
|
|
120
|
+
# ------------------------
|
|
121
|
+
|
|
122
|
+
class_dict = {
|
|
123
|
+
"__init__": __init__,
|
|
124
|
+
"on_data": on_data,
|
|
125
|
+
}
|
|
126
|
+
bases = (RayActorStage,)
|
|
127
|
+
new_class = type(class_name, bases, class_dict)
|
|
128
|
+
return new_class
|
|
129
|
+
|
|
130
|
+
class StageProxy:
    """
    Factory/proxy for dynamic Ray actor creation; not itself a Ray actor.

    The class is never instantiated (``__new__`` raises); use it through its
    static methods only.

    Methods
    -------
    remote(config)
        Instantiate a Ray actor with a unique dynamic class and name.
    options(*args, **kwargs)
        Advanced Ray actor configuration (chain with `.remote(config)`).
    actor_class()
        Generates and returns a fresh actor class (for introspection/testing only).
    """

    _USAGE_ERROR = "StageProxy is a factory, not a Ray actor or class. Use .remote() or .options()."

    @staticmethod
    def _unique_actor_name():
        """Return ``<wrapped function name>_<8 hex chars>`` for naming a new actor."""
        return f"{fn.__name__}_{str(uuid.uuid4())[:8]}"

    @staticmethod
    def remote(config):
        """
        Instantiate a Ray actor with a unique dynamic class and name.

        A fresh class is generated by ``make_actor_class()`` on every call and
        wrapped with ``ray.remote`` before being started.

        Parameters
        ----------
        config : Union[Dict, BaseModel]
            Stage configuration to pass to the actor.

        Returns
        -------
        ray.actor.ActorHandle
            Handle to the started Ray actor.
        """
        remote_cls = ray.remote(make_actor_class())
        return remote_cls.options(name=StageProxy._unique_actor_name()).remote(config)

    @staticmethod
    def options(*args, **kwargs):
        """
        Return a Ray actor class with the specified options set.
        Must call `.remote(config)` on the result.

        Parameters
        ----------
        *args
            Positional arguments for Ray actor options.
        **kwargs
            Keyword arguments for Ray actor options (e.g., resources). When
            ``name`` is not supplied, a unique name is generated.

        Returns
        -------
        ray.actor.ActorClass
            Ray actor class, requires .remote(config) to instantiate.
        """
        remote_cls = ray.remote(make_actor_class())
        if "name" not in kwargs:
            kwargs["name"] = StageProxy._unique_actor_name()
        return remote_cls.options(*args, **kwargs)

    @staticmethod
    def actor_class():
        """
        Generate and return a fresh actor class (for testing or introspection only).

        The class is produced by ``make_actor_class()`` and is NOT wrapped with
        ``ray.remote``. The lookup is deferred to call time so that defining
        ``StageProxy`` never triggers class generation.
        """
        return make_actor_class()

    def __new__(cls, *a, **k):
        """Reject instantiation: the proxy is only used through its static methods."""
        raise NotImplementedError(StageProxy._USAGE_ERROR)

    def __call__(self, *a, **k):
        """Reject calls on an instance for the same reason as ``__new__``."""
        raise NotImplementedError(StageProxy._USAGE_ERROR)

    def __getattr__(self, name):
        """
        Reject access to anything that is not a public member.

        ``__getattr__`` only runs after normal attribute lookup has failed, so
        ``remote``, ``options`` and ``actor_class`` never reach this method.
        Re-dispatching through ``getattr(self, name)`` here would recurse
        without bound, hence the unconditional raise.
        """
        raise NotImplementedError(
            f"StageProxy does not implement '{name}'. Only .remote(), .options(), .actor_class() are available."
        )
|
|
202
|
+
|
|
203
|
+
return StageProxy
|
{nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/SOURCES.txt
RENAMED
|
@@ -76,6 +76,7 @@ nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py
|
|
|
76
76
|
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py
|
|
77
77
|
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py
|
|
78
78
|
nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py
|
|
79
|
+
nv_ingest/framework/orchestration/ray/util/pipeline/tools.py
|
|
79
80
|
nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py
|
|
80
81
|
nv_ingest/framework/orchestration/ray/util/system_tools/memory.py
|
|
81
82
|
nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/api/v1/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest/framework/util/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/requires.txt
RENAMED
|
File without changes
|
{nv_ingest-25.6.26.dev20250626 → nv_ingest-25.6.28.dev20250628}/nv_ingest.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|