nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (100) hide show
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py
@@ -0,0 +1,195 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import json
6
+ import logging
7
+ import math
8
+ import os
9
+ from typing import Dict, Any
10
+
11
+ import ray
12
+ from pydantic import BaseModel
13
+
14
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline
15
+ from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
16
+ add_source_stage,
17
+ add_metadata_injector_stage,
18
+ add_pdf_extractor_stage,
19
+ add_image_extractor_stage,
20
+ add_docx_extractor_stage,
21
+ add_audio_extractor_stage,
22
+ add_image_dedup_stage,
23
+ add_image_filter_stage,
24
+ add_table_extractor_stage,
25
+ add_chart_extractor_stage,
26
+ add_image_caption_stage,
27
+ add_text_splitter_stage,
28
+ add_text_embedding_stage,
29
+ add_embedding_storage_stage,
30
+ add_image_storage_stage,
31
+ add_message_broker_response_stage,
32
+ add_pptx_extractor_stage,
33
+ add_infographic_extractor_stage,
34
+ add_otel_tracer_stage,
35
+ add_default_drain_stage,
36
+ )
37
+ from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
38
+
39
+ logger = logging.getLogger("uvicorn")
40
+
41
+
42
def export_config_to_env(ingest_config: Any) -> None:
    """
    Export an ingest configuration into the process environment.

    Every key is upper-cased and written to ``os.environ``.

    Parameters
    ----------
    ingest_config : Any
        Either a pydantic ``BaseModel`` (converted with ``model_dump()``) or a
        dict-like object exposing ``items()``.

    Notes
    -----
    ``os.environ`` only accepts string values, so each value is converted with
    ``str()`` before export. Entries whose value is ``None`` are skipped rather
    than exported as the literal string ``"None"``.
    """
    # Plain dicts never need dumping; only pydantic models do.
    if not isinstance(ingest_config, dict) and isinstance(ingest_config, BaseModel):
        ingest_config = ingest_config.model_dump()

    os.environ.update({key.upper(): str(val) for key, val in ingest_config.items() if val is not None})
47
+
48
+
49
def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any] = None):
    """
    Initialize Ray, register every ingestion stage on ``pipeline``, chain the stages, and build it.

    Parameters
    ----------
    pipeline : RayPipeline
        Pipeline that receives the stages and edges; ``pipeline.build()`` is called last.
    ingest_config : Dict[str, Any], optional
        When truthy, exported to environment variables via ``export_config_to_env``
        before Ray is initialized.
    """
    # Publish the configuration as environment variables first.
    if ingest_config:
        export_config_to_env(ingest_config)

    # Object spilling settings; Ray receives them as a JSON string.
    spill_config = {
        "type": "filesystem",
        "params": {
            "directory_path": [
                "/tmp/ray_spill_testing_0",
                "/tmp/ray_spill_testing_1",
                "/tmp/ray_spill_testing_2",
                "/tmp/ray_spill_testing_3",
            ],
            "buffer_size": 100_000_000,
        },
    }
    ray.init(
        namespace="nv_ingest_ray",
        logging_level=logging.getLogger().getEffectiveLevel(),
        ignore_reinit_error=True,
        dashboard_host="0.0.0.0",
        dashboard_port=8265,
        _system_config={
            "local_fs_capacity_threshold": 0.9,
            "object_spilling_config": json.dumps(spill_config),
        },
    )

    # Value handed to every stage builder: NV_INGEST_MAX_UTIL when set,
    # otherwise the probed effective core count (never below 1).
    probe = SystemResourceProbe()
    effective_cores = probe.get_effective_cores()
    fallback_cpu_count = int(max(1, math.floor(effective_cores)))
    default_cpu_count = int(os.environ.get("NV_INGEST_MAX_UTIL", fallback_cpu_count))

    add_meter_stage = os.environ.get("MESSAGE_CLIENT_TYPE") != "simple"
    _ = add_meter_stage  # TODO(Devin)

    logger.debug("Setting up ingestion pipeline")

    # --- Insertion and pre-processing stages ---
    source_id = add_source_stage(pipeline, default_cpu_count)
    # TODO(Devin): Job counter used a global stats object that isn't ray compatible, need to update.
    # submitted_job_counter_stage = add_submitted_job_counter_stage(pipe, morpheus_pipeline_config, ingest_config)
    metadata_injector_id = add_metadata_injector_stage(pipeline, default_cpu_count)

    # --- Primitive extraction ---
    pdf_id = add_pdf_extractor_stage(pipeline, default_cpu_count)
    image_id = add_image_extractor_stage(pipeline, default_cpu_count)
    docx_id = add_docx_extractor_stage(pipeline, default_cpu_count)
    pptx_id = add_pptx_extractor_stage(pipeline, default_cpu_count)
    audio_id = add_audio_extractor_stage(pipeline, default_cpu_count)

    # --- Post-processing ---
    image_dedup_id = add_image_dedup_stage(pipeline, default_cpu_count)
    image_filter_id = add_image_filter_stage(pipeline, default_cpu_count)
    table_id = add_table_extractor_stage(pipeline, default_cpu_count)
    chart_id = add_chart_extractor_stage(pipeline, default_cpu_count)
    infographic_id = add_infographic_extractor_stage(pipeline, default_cpu_count)
    image_caption_id = add_image_caption_stage(pipeline, default_cpu_count)

    # --- Transforms and data synthesis ---
    text_splitter_id = add_text_splitter_stage(pipeline, default_cpu_count)
    text_embedding_id = add_text_embedding_stage(pipeline, default_cpu_count)

    # --- Storage and output ---
    embedding_storage_id = add_embedding_storage_stage(pipeline, default_cpu_count)
    image_storage_id = add_image_storage_stage(pipeline, default_cpu_count)
    # vdb_task_sink_stage = add_vdb_task_sink_stage(pipe, morpheus_pipeline_config, ingest_config)
    broker_response_id = add_message_broker_response_stage(pipeline, default_cpu_count)

    # --- Telemetry ---
    # Note: everything after the sync stage is out of the hot path, please keep it that way.
    otel_tracer_id = add_otel_tracer_stage(pipeline, default_cpu_count)

    # TODO(devin)
    # if add_meter_stage:
    #     otel_meter_stage = add_otel_meter_stage(pipe, morpheus_pipeline_config, ingest_config)
    # else:
    #     otel_meter_stage = None
    # completed_job_counter_stage = add_completed_job_counter_stage(pipe, morpheus_pipeline_config, ingest_config)

    # Add a drain stage to the pipeline -- flushes and deletes control messages
    drain_id = add_default_drain_stage(pipeline, default_cpu_count)

    edge_queue_size = int(os.environ.get("INGEST_EDGE_BUFFER_SIZE", 32))

    # The stages form one linear chain; each entry is wired to the entry that follows it.
    stage_chain = [
        # Intake stages
        source_id,
        metadata_injector_id,
        # Document extractors
        pdf_id,
        audio_id,
        docx_id,
        pptx_id,
        image_id,
        # Primitive extractors
        infographic_id,
        table_id,
        chart_id,
        # Primitive mutators
        image_filter_id,
        image_dedup_id,
        # Primitive transforms
        text_splitter_id,
        text_embedding_id,
        image_caption_id,
        # Primitive storage
        image_storage_id,
        embedding_storage_id,
        # Response and telemetry
        broker_response_id,
        otel_tracer_id,
        drain_id,
    ]
    for upstream_id, downstream_id in zip(stage_chain, stage_chain[1:]):
        pipeline.make_edge(upstream_id, downstream_id, queue_size=edge_queue_size)

    pipeline.build()
187
+
188
+ # TODO(devin)
189
+ # if add_meter_stage:
190
+ # pipe.add_edge(sink_stage, otel_meter_stage)
191
+ # pipe.add_edge(otel_meter_stage, otel_tracer_stage)
192
+ # else:
193
+ # pipe.add_edge(sink_stage, otel_tracer_stage)
194
+
195
+ # pipe.add_edge(otel_tracer_stage, completed_job_counter_stage)
nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py
@@ -0,0 +1,170 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ import os
7
+ import time
8
+ from datetime import datetime
9
+ from typing import Union, Tuple
10
+
11
+ import ray
12
+ from pydantic import BaseModel, ConfigDict
13
+
14
+ from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline, ScalingConfig
15
+ from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def str_to_bool(value: str) -> bool:
    """
    Interpret a string as a boolean flag.

    Returns True when ``value``, stripped of surrounding whitespace and
    lower-cased, is one of "1", "true", "yes" or "on"; False otherwise.
    """
    normalized = value.strip().lower()
    return normalized in ("1", "true", "yes", "on")
22
+
23
+
24
# Module-level scaling defaults, read from the environment once at import time.
# _launch_pipeline falls back to these when its matching arguments are not supplied.
DISABLE_DYNAMIC_SCALING = str_to_bool(os.getenv("INGEST_DISABLE_DYNAMIC_SCALING", "false"))
DYNAMIC_MEMORY_THRESHOLD = float(os.getenv("INGEST_DYNAMIC_MEMORY_THRESHOLD", 0.75))
26
+
27
+
28
class PipelineCreationSchema(BaseModel):
    """
    Schema for pipeline creation configuration.

    Contains all parameters required to set up and execute the pipeline,
    including endpoints, API keys, and processing options.

    Every field is declared as ``str``. Most defaults come from ``os.getenv``
    calls in the class body, so the environment is read once, when this class
    is defined (i.e. at module import), not each time an instance is created.
    Values passed explicitly to the constructor take the place of those defaults.
    """

    # Audio processing settings
    audio_grpc_endpoint: str = os.getenv("AUDIO_GRPC_ENDPOINT", "grpc.nvcf.nvidia.com:443")
    audio_function_id: str = os.getenv("AUDIO_FUNCTION_ID", "1598d209-5e27-4d3c-8079-4751568b1081")
    audio_infer_protocol: str = os.getenv("AUDIO_INFER_PROTOCOL", "grpc")

    # Embedding model settings
    embedding_nim_endpoint: str = os.getenv("EMBEDDING_NIM_ENDPOINT", "https://integrate.api.nvidia.com/v1")
    embedding_nim_model_name: str = os.getenv("EMBEDDING_NIM_MODEL_NAME", "nvidia/llama-3.2-nv-embedqa-1b-v2")

    # General pipeline settings
    ingest_log_level: str = os.getenv("INGEST_LOG_LEVEL", "INFO")
    # Numeric in meaning but kept as a string, like every other field here.
    max_ingest_process_workers: str = os.getenv("MAX_INGEST_PROCESS_WORKERS", "16")

    # Messaging configuration
    message_client_host: str = os.getenv("MESSAGE_CLIENT_HOST", "localhost")
    # The port is also carried as a string.
    message_client_port: str = os.getenv("MESSAGE_CLIENT_PORT", "7671")
    message_client_type: str = os.getenv("MESSAGE_CLIENT_TYPE", "simple")

    # NeMo Retriever settings
    nemoretriever_parse_http_endpoint: str = os.getenv(
        "NEMORETRIEVER_PARSE_HTTP_ENDPOINT", "https://integrate.api.nvidia.com/v1/chat/completions"
    )
    nemoretriever_parse_infer_protocol: str = os.getenv("NEMORETRIEVER_PARSE_INFER_PROTOCOL", "http")
    nemoretriever_parse_model_name: str = os.getenv("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")

    # API keys (default to the empty string when the variables are not set)
    ngc_api_key: str = os.getenv("NGC_API_KEY", "")
    nvidia_build_api_key: str = os.getenv("NVIDIA_BUILD_API_KEY", "")

    # Observability settings
    otel_exporter_otlp_endpoint: str = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317")

    # OCR settings
    paddle_http_endpoint: str = os.getenv("PADDLE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr")
    paddle_infer_protocol: str = os.getenv("PADDLE_INFER_PROTOCOL", "http")

    # Task queue settings
    # Unlike the other fields, this default is a fixed literal and is not read from the environment.
    REDIS_INGEST_TASK_QUEUE: str = "ingest_task_queue"

    # Vision language model settings
    vlm_caption_endpoint: str = os.getenv(
        "VLM_CAPTION_ENDPOINT", "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
    )
    vlm_caption_model_name: str = os.getenv("VLM_CAPTION_MODEL_NAME", "meta/llama-3.2-11b-vision-instruct")

    # YOLOX image processing settings
    yolox_graphic_elements_http_endpoint: str = os.getenv(
        "YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT",
        "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1",
    )
    yolox_graphic_elements_infer_protocol: str = os.getenv("YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL", "http")

    # YOLOX page elements settings
    yolox_http_endpoint: str = os.getenv(
        "YOLOX_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
    )
    yolox_infer_protocol: str = os.getenv("YOLOX_INFER_PROTOCOL", "http")

    # YOLOX table structure settings
    yolox_table_structure_http_endpoint: str = os.getenv(
        "YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT", "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1"
    )
    yolox_table_structure_infer_protocol: str = os.getenv("YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL", "http")

    # pydantic setting: fields that are not declared above are rejected at validation time.
    model_config = ConfigDict(extra="forbid")
101
+
102
+
103
def _launch_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool,
    disable_dynamic_scaling: Union[bool, None] = None,
    dynamic_memory_threshold: Union[float, None] = None,
) -> Tuple[Union[RayPipeline, None], float]:
    """
    Build and start the ingestion pipeline, optionally blocking until interrupted.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        Configuration; dumped with ``model_dump()`` and passed to ``setup_ingestion_pipeline``.
    block : bool
        When True, sleep in a loop until ``KeyboardInterrupt``, then stop the
        pipeline and shut Ray down. When False, return right after ``pipeline.start()``.
    disable_dynamic_scaling : bool, optional
        Overrides the module default ``DISABLE_DYNAMIC_SCALING`` when not None.
    dynamic_memory_threshold : float, optional
        Overrides the module default ``DYNAMIC_MEMORY_THRESHOLD`` when not None.

    Returns
    -------
    Tuple[Union[RayPipeline, None], float]
        ``(pipeline, 0.0)`` in non-blocking mode; ``(None, total_elapsed_seconds)``
        in blocking mode, after the pipeline has been stopped.
    """
    logger.info("Starting pipeline setup")

    # None means "not supplied": fall back to the environment-derived module defaults.
    if disable_dynamic_scaling is None:
        disable_dynamic_scaling = DISABLE_DYNAMIC_SCALING
    # Test against None rather than truthiness so an explicit 0.0 is not replaced by the default.
    if dynamic_memory_threshold is None:
        dynamic_memory_threshold = DYNAMIC_MEMORY_THRESHOLD

    scaling_config = ScalingConfig(
        dynamic_memory_scaling=not disable_dynamic_scaling,
        dynamic_memory_threshold=dynamic_memory_threshold,
    )

    pipeline = RayPipeline(scaling_config=scaling_config)
    start_abs = datetime.now()

    # Set up the ingestion pipeline
    setup_ingestion_pipeline(pipeline, ingest_config.model_dump())

    # Record setup time; the run clock starts at the same instant.
    end_setup = start_run = datetime.now()
    setup_elapsed = (end_setup - start_abs).total_seconds()
    logger.info(f"Pipeline setup completed in {setup_elapsed:.2f} seconds")

    # Run the pipeline
    logger.debug("Running pipeline")
    pipeline.start()

    if not block:
        return pipeline, 0.0

    try:
        # Keep this thread idle while the pipeline runs; only an interrupt ends the loop.
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        logger.info("Interrupt received, shutting down pipeline.")
        pipeline.stop()
        ray.shutdown()
        logger.info("Ray shutdown complete.")

    # Record execution times
    end_run = datetime.now()
    run_elapsed = (end_run - start_run).total_seconds()
    total_elapsed = (end_run - start_abs).total_seconds()

    logger.info(f"Pipeline run completed in {run_elapsed:.2f} seconds")
    logger.info(f"Total time elapsed: {total_elapsed:.2f} seconds")

    return None, total_elapsed
157
+
158
+
159
def run_pipeline(
    ingest_config: PipelineCreationSchema,
    block: bool = True,
    disable_dynamic_scaling: Union[bool, None] = None,
    dynamic_memory_threshold: Union[float, None] = None,
) -> Union[RayPipeline, float]:
    """
    Launch the ingestion pipeline.

    Parameters
    ----------
    ingest_config : PipelineCreationSchema
        Configuration forwarded to ``_launch_pipeline``.
    block : bool, optional
        When True (default), this call returns only after the pipeline has been
        interrupted and shut down. When False, it returns once the pipeline is started.
    disable_dynamic_scaling : bool, optional
        Forwarded to ``_launch_pipeline``.
    dynamic_memory_threshold : float, optional
        Forwarded to ``_launch_pipeline``.

    Returns
    -------
    Union[RayPipeline, float]
        The running ``RayPipeline`` when ``block`` is False; the total elapsed
        time in seconds when ``block`` is True.
    """
    pipeline, total_elapsed = _launch_pipeline(ingest_config, block, disable_dynamic_scaling, dynamic_memory_threshold)

    if block:
        logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
        # In blocking mode there is no live pipeline to hand back (it is None here);
        # return the elapsed time, as the declared return type advertises.
        return total_elapsed

    return pipeline