nv-ingest 2025.5.21.dev20250521__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest has been flagged as potentially problematic.
- nv_ingest/__init__.py +20 -0
- nv_ingest/api/__init__.py +3 -0
- nv_ingest/api/main.py +43 -0
- nv_ingest/api/v1/__init__.py +3 -0
- nv_ingest/api/v1/health.py +114 -0
- nv_ingest/api/v1/ingest.py +454 -0
- nv_ingest/framework/__init__.py +3 -0
- nv_ingest/framework/orchestration/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
- nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
- nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
- nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
- nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
- nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
- nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
- nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
- nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
- nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
- nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
- nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
- nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
- nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
- nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
- nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
- nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
- nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
- nv_ingest/framework/schemas/__init__.py +0 -0
- nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
- nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
- nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
- nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
- nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
- nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
- nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
- nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
- nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
- nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
- nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
- nv_ingest/framework/util/__init__.py +3 -0
- nv_ingest/framework/util/flow_control/__init__.py +8 -0
- nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
- nv_ingest/framework/util/service/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
- nv_ingest/framework/util/service/meta/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
- nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
- nv_ingest/framework/util/telemetry/__init__.py +3 -0
- nv_ingest/framework/util/telemetry/global_stats.py +145 -0
- nv_ingest/version.py +38 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
- nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/util/flow_control/filter_by_task.py (new file)
@@ -0,0 +1,227 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import asyncio
import functools
import logging
import re
from typing import Dict, List, Any, Union, Tuple, Optional, Callable

from pydantic import BaseModel

logger = logging.getLogger(__name__)


def filter_by_task(
    required_tasks: List[Union[str, Tuple[Any, ...]]],
    forward_func: Optional[Callable[[Any], Any]] = None,
) -> Callable:
    """
    Decorator that checks whether an IngestControlMessage contains any of the required tasks.
    Supports both synchronous and asynchronous functions as well as class methods (with 'self').
    If no required task is found, the original message is returned (or forward_func is called if provided).

    Parameters
    ----------
    required_tasks : list[Union[str, Tuple[Any, ...]]]
        A list of required tasks. Each element is either a string representing a task name
        or a tuple/list where the first element is the task name and the remaining elements
        specify required task properties.
    forward_func : Optional[Callable[[Any], Any]]
        A function to be called with the IngestControlMessage if no required task is found.

    Returns
    -------
    Callable
        A decorator wrapping a function that expects an IngestControlMessage as one of its first arguments.
    """

    def decorator(func: Callable) -> Callable:
        # Helper to extract the IngestControlMessage from the arguments.
        def extract_message(args: Tuple) -> Any:
            if args and hasattr(args[0], "get_tasks"):
                return args[0]
            elif len(args) > 1 and hasattr(args[1], "get_tasks"):
                return args[1]
            else:
                raise ValueError(
                    "The first or second argument must be an IngestControlMessage with task handling capabilities."
                )

        if asyncio.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                message = extract_message(args)
                tasks: Dict[str, List[Any]] = {}
                for task in message.get_tasks():
                    tasks.setdefault(task.type, []).append(task.properties)
                for required_task in required_tasks:
                    # Case 1: required task is a simple string.
                    if isinstance(required_task, str):
                        if required_task in tasks:
                            logger.debug(
                                "Task '%s' found. Proceeding with async function '%s'.",
                                required_task,
                                func.__name__,
                            )
                            return await func(*args, **kwargs)
                        else:
                            logger.debug("Required task '%s' not found.", required_task)
                    # Case 2: required task is a tuple/list with properties.
                    elif isinstance(required_task, (tuple, list)):
                        required_task_name, *required_task_props_list = required_task
                        if required_task_name not in tasks:
                            continue
                        for task_props in tasks.get(required_task_name, []):
                            orig_task_props = task_props
                            if isinstance(task_props, BaseModel):
                                task_props = task_props.model_dump()
                            all_match = True
                            for required_props in required_task_props_list:
                                if not _is_subset(task_props, required_props):
                                    logger.debug(
                                        "For task '%s', task properties %s do not match required subset %s.",
                                        required_task_name,
                                        orig_task_props,
                                        required_props,
                                    )
                                    all_match = False
                                    break
                            if all_match:
                                logger.debug(
                                    "Task '%s' with properties %s matched for function '%s'.",
                                    required_task_name,
                                    orig_task_props,
                                    func.__name__,
                                )
                                return await func(*args, **kwargs)
                    else:
                        logger.debug("Invalid required task type: %s", type(required_task))
                logger.debug("No required task matched for function '%s'.", func.__name__)
                if forward_func:
                    logger.debug("Calling forward function for IngestControlMessage.")
                    return await forward_func(message)
                else:
                    logger.debug("Returning original IngestControlMessage without processing.")
                    return message

            return async_wrapper
        else:

            @functools.wraps(func)
            def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
                message = extract_message(args)
                tasks: Dict[str, List[Any]] = {}
                for task in message.get_tasks():
                    tasks.setdefault(task.type, []).append(task.properties)
                for required_task in required_tasks:
                    if isinstance(required_task, str):
                        if required_task in tasks:
                            logger.debug(
                                "Task '%s' found. Proceeding with function '%s'.", required_task, func.__name__
                            )
                            return func(*args, **kwargs)
                        else:
                            logger.debug("Required task '%s' not found.", required_task)
                    elif isinstance(required_task, (tuple, list)):
                        required_task_name, *required_task_props_list = required_task
                        if required_task_name not in tasks:
                            continue
                        for task_props in tasks.get(required_task_name, []):
                            orig_task_props = task_props
                            if isinstance(task_props, BaseModel):
                                task_props = task_props.model_dump()
                            all_match = True
                            for required_props in required_task_props_list:
                                if not _is_subset(task_props, required_props):
                                    logger.debug(
                                        "For task '%s', task properties %s do not match required subset %s.",
                                        required_task_name,
                                        orig_task_props,
                                        required_props,
                                    )
                                    all_match = False
                                    break
                            if all_match:
                                logger.debug(
                                    "Task '%s' with properties %s matched for function '%s'.",
                                    required_task_name,
                                    orig_task_props,
                                    func.__name__,
                                )
                                return func(*args, **kwargs)
                    else:
                        logger.debug("Invalid required task type: %s", type(required_task))
                logger.debug("No required task matched for function '%s'.", func.__name__)
                if forward_func:
                    logger.debug("Calling forward function for IngestControlMessage.")
                    return forward_func(message)
                else:
                    logger.debug("Returning original IngestControlMessage without processing.")
                    return message

            return sync_wrapper

    return decorator


def _is_subset(superset: Any, subset: Any) -> bool:
    """
    Recursively checks whether 'subset' is contained within 'superset'. Supports dictionaries,
    lists, strings (including regex patterns), and basic types.

    Parameters
    ----------
    superset : Any
        The data structure (or value) that is expected to contain the subset.
    subset : Any
        The data structure (or value) to be checked for being a subset of 'superset'. A special
        value "*" matches any value, and strings prefixed with "regex:" are treated as regular
        expression patterns.

    Returns
    -------
    bool
        True if 'subset' is contained within 'superset', False otherwise.
    """
    if subset == "*":
        return True
    if isinstance(superset, dict) and isinstance(subset, dict):
        for key, val in subset.items():
            if key not in superset:
                logger.debug("Key '%s' not found in superset dictionary: %s", key, superset)
                return False
            if not _is_subset(superset[key], val):
                logger.debug("Value for key '%s' (%s) does not match expected subset (%s).", key, superset[key], val)
                return False
        return True
    if isinstance(subset, str) and subset.startswith("regex:"):
        pattern = subset[len("regex:") :]
        if isinstance(superset, list):
            for sup_item in superset:
                if re.match(pattern, sup_item):
                    return True
            logger.debug("No items in list %s match regex pattern '%s'.", superset, pattern)
            return False
        else:
            if re.match(pattern, superset) is None:
                logger.debug("Value '%s' does not match regex pattern '%s'.", superset, pattern)
                return False
            return True
    if isinstance(superset, list) and not isinstance(subset, list):
        for sup_item in superset:
            if _is_subset(sup_item, subset):
                return True
        logger.debug("None of the items in list %s match the value '%s'.", superset, subset)
        return False
    if isinstance(superset, (list, set)) and isinstance(subset, list):
        for sub_item in subset:
            if not any(_is_subset(sup_item, sub_item) for sup_item in superset):
                logger.debug("No element in %s matches subset element '%s'.", superset, sub_item)
                return False
        return True
    if superset != subset:
        logger.debug("Direct comparison failed: %s != %s", superset, subset)
    return superset == subset
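The decorator above gates a stage handler on the tasks attached to a control message, with `_is_subset` supporting a `"*"` wildcard and `regex:`-prefixed patterns. The following is a minimal usage sketch; the message and task classes, the task names ("split", "embed"), and the "tokenizer" property are stand-ins for this package's IngestControlMessage, not its real API.

# Hypothetical usage sketch for filter_by_task (illustration only).
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class _Task:
    type: str
    properties: Dict[str, Any]


@dataclass
class _ControlMessage:
    tasks: List[_Task] = field(default_factory=list)

    def get_tasks(self) -> List[_Task]:
        return self.tasks


@filter_by_task([("split", {"tokenizer": "regex:^intfloat/.*"}), "embed"])
def on_data(message: _ControlMessage) -> _ControlMessage:
    # Reached only when the message carries an "embed" task, or a "split"
    # task whose "tokenizer" property matches the regex above.
    return message


msg = _ControlMessage(tasks=[_Task("split", {"tokenizer": "intfloat/e5-large-v2"})])
on_data(msg)  # the ("split", ...) requirement matches, so the wrapped handler runs

Messages that carry none of the required tasks fall through to `forward_func` when one is supplied, otherwise the original message is returned unmodified.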
nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py (new file)
@@ -0,0 +1,395 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import asyncio
import json
import logging
import os
from json import JSONDecodeError
from typing import Optional, Dict, Any

from typing import List

import redis

from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
from nv_ingest.framework.schemas.framework_processing_job_schema import ProcessingJob
from nv_ingest.framework.util.service.meta.ingest.ingest_service_meta import IngestServiceMeta
from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_ingest_job
from nv_ingest_api.util.service_clients.client_base import FetchMode
from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient

logger = logging.getLogger("uvicorn")


def get_fetch_mode_from_env() -> "FetchMode":
    """
    Retrieves the fetch mode from the environment variable FETCH_MODE.

    Returns
    -------
    FetchMode
        The fetch mode as specified by the environment variable, or NON_DESTRUCTIVE by default.
    """
    mode_str: str = os.getenv("FETCH_MODE", "NON_DESTRUCTIVE").upper()
    try:
        return FetchMode[mode_str]
    except KeyError:
        logger.warning(f"Invalid FETCH_MODE '{mode_str}' in environment. Defaulting to DESTRUCTIVE.")
        return FetchMode.DESTRUCTIVE


class RedisIngestService(IngestServiceMeta):
    """
    Submits jobs and fetches results via Redis, supporting multiple fetch modes
    and state management with TTLs. Operates asynchronously using asyncio.to_thread
    for synchronous Redis client operations.
    """

    _concurrency_level: int = int(os.getenv("CONCURRENCY_LEVEL", "10"))
    __shared_instance: Optional["RedisIngestService"] = None

    @staticmethod
    def get_instance() -> "RedisIngestService":
        """
        Static access method implementing the Singleton pattern.

        Returns
        -------
        RedisIngestService
            The singleton instance of the RedisIngestService.
        """
        if RedisIngestService.__shared_instance is None:
            redis_host: str = os.getenv("MESSAGE_CLIENT_HOST", "localhost")
            redis_port: int = int(os.getenv("MESSAGE_CLIENT_PORT", "6379"))
            redis_task_queue: str = os.getenv("REDIS_INGEST_TASK_QUEUE", "ingest_task_queue")

            fetch_mode: "FetchMode" = get_fetch_mode_from_env()
            result_data_ttl: int = int(os.getenv("RESULT_DATA_TTL_SECONDS", "3600"))
            state_ttl: int = int(os.getenv("STATE_TTL_SECONDS", "7200"))

            cache_config: Dict[str, Any] = {
                "directory": os.getenv("FETCH_CACHE_DIR", "./.fetch_cache"),
                "ttl": int(os.getenv("FETCH_CACHE_TTL_SECONDS", "3600")),
            }
            use_ssl: bool = os.getenv("REDIS_USE_SSL", "false").lower() == "true"

            RedisIngestService.__shared_instance = RedisIngestService(
                redis_hostname=redis_host,
                redis_port=redis_port,
                redis_task_queue=redis_task_queue,
                fetch_mode=fetch_mode,
                result_data_ttl_seconds=result_data_ttl if result_data_ttl > 0 else None,
                state_ttl_seconds=state_ttl if state_ttl > 0 else None,
                cache_config=cache_config,
                use_ssl=use_ssl,
            )
            logger.debug(f"RedisIngestService configured with FetchMode: {fetch_mode.name}")
        else:
            logger.debug("Returning existing RedisIngestService Singleton instance.")
        return RedisIngestService.__shared_instance

    def __init__(
        self,
        redis_hostname: str,
        redis_port: int,
        redis_task_queue: str,
        fetch_mode: "FetchMode",
        result_data_ttl_seconds: Optional[int],
        state_ttl_seconds: Optional[int],
        cache_config: Optional[Dict[str, Any]],
        use_ssl: bool,
    ) -> None:
        """
        Initializes the service and the underlying RedisClient.

        Parameters
        ----------
        redis_hostname : str
            Redis server hostname.
        redis_port : int
            Redis server port.
        redis_task_queue : str
            The Redis queue name for tasks.
        fetch_mode : FetchMode
            The fetch mode configuration.
        result_data_ttl_seconds : int or None
            TTL for result data in seconds, or None to disable.
        state_ttl_seconds : int or None
            TTL for the job state record, or None to disable.
        cache_config : dict or None
            Configuration for caching.
        use_ssl : bool
            Whether to use SSL for the Redis connection.
        """
        self._redis_hostname: str = redis_hostname
        self._redis_port: int = redis_port
        self._redis_task_queue: str = redis_task_queue
        self._fetch_mode: "FetchMode" = fetch_mode
        self._result_data_ttl_seconds: Optional[int] = result_data_ttl_seconds
        self._state_ttl_seconds: Optional[int] = state_ttl_seconds

        self._bulk_vdb_cache_prefix: str = "vdb_bulk_upload_cache:"
        self._cache_prefix: str = "processing_cache:"
        self._state_prefix: str = "job_state:"

        self._ingest_client = RedisClient(
            host=self._redis_hostname,
            port=self._redis_port,
            max_pool_size=self._concurrency_level,
            fetch_mode=self._fetch_mode,
            cache_config=cache_config,
            message_ttl_seconds=self._result_data_ttl_seconds,
            use_ssl=use_ssl,
            max_retries=int(os.getenv("REDIS_MAX_RETRIES", "3")),
            max_backoff=int(os.getenv("REDIS_MAX_BACKOFF", "32")),
            connection_timeout=int(os.getenv("REDIS_CONNECTION_TIMEOUT", "300")),
        )
        logger.debug(
            f"RedisClient initialized for service. Host: {redis_hostname}:{redis_port}, "
            f"FetchMode: {fetch_mode.name}, ResultTTL: {result_data_ttl_seconds}, StateTTL: {state_ttl_seconds}"
        )

    async def submit_job(self, job_spec_wrapper: "MessageWrapper", trace_id: str) -> str:
        """
        Validates, prepares, and submits a job specification to the Redis task queue.
        Sets result data TTL if configured for NON_DESTRUCTIVE mode.

        Parameters
        ----------
        job_spec_wrapper : MessageWrapper
            A wrapper containing the job specification payload.
        trace_id : str
            A unique identifier for the job.

        Returns
        -------
        str
            The job trace_id.

        Raises
        ------
        ValueError
            If the payload is missing or invalid.
        JSONDecodeError, TypeError
            For payload parsing errors.
        RedisError, ConnectionError
            For Redis-related errors.
        """
        try:
            json_data = job_spec_wrapper.model_dump(mode="json").get("payload")
            if not json_data:
                raise ValueError("MessageWrapper payload is missing or empty.")
            if isinstance(json_data, str):
                job_spec = json.loads(json_data)
            elif isinstance(json_data, dict):
                job_spec = json_data
            else:
                raise TypeError(f"Unexpected payload type: {type(json_data)}")

            validate_ingest_job(job_spec)
            job_spec["job_id"] = trace_id
            tasks = job_spec.get("tasks", [])
            updated_tasks = []
            for task in tasks:
                task_prop = task.get("task_properties", {})
                if hasattr(task_prop, "model_dump") and callable(task_prop.model_dump):
                    task["task_properties"] = task_prop.model_dump(mode="json")
                elif not isinstance(task_prop, dict):
                    try:
                        task["task_properties"] = dict(task_prop)
                    except (TypeError, ValueError):
                        logger.error(f"Cannot convert task_properties to dict: {task_prop}. Skipping properties.")
                        task["task_properties"] = {}
                updated_tasks.append(task)
            job_spec["tasks"] = updated_tasks
            job_spec_json = json.dumps(job_spec)
            ttl_for_result: Optional[int] = (
                self._result_data_ttl_seconds if self._fetch_mode == FetchMode.NON_DESTRUCTIVE else None
            )
            logger.debug(
                f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
            )
            await asyncio.to_thread(
                self._ingest_client.submit_message,
                channel_name=self._redis_task_queue,
                message=job_spec_json,
                ttl_seconds=ttl_for_result,
            )
            logger.debug(f"Successfully submitted job {trace_id}")
            return trace_id
        except (JSONDecodeError, TypeError, ValueError) as err:
            logger.exception(f"Data validation or parsing error for job {trace_id}: {err}")
            raise ValueError(f"Invalid job specification: {err}") from err
        except (redis.RedisError, ConnectionError) as err:
            logger.exception(f"Redis error submitting job {trace_id}: {err}")
            raise err
        except Exception as err:
            logger.exception(f"Unexpected error submitting job {trace_id}: {err}")
            raise

    async def fetch_job(self, job_id: str) -> Optional[Dict]:
        """
        Fetches the job result using the configured RedisClient fetch mode and timeout.
        Executes the synchronous client call asynchronously.

        Parameters
        ----------
        job_id : str
            The unique identifier of the job.

        Returns
        -------
        dict or None
            The job result message.

        Raises
        ------
        TimeoutError, RedisError, ConnectionError, ValueError, RuntimeError
            If the fetch operation fails.
        """
        try:
            result_channel: str = f"{job_id}"
            logger.debug(f"Attempting to fetch job result for {job_id} using mode {self._fetch_mode.name}")
            message = await asyncio.to_thread(
                self._ingest_client.fetch_message,
                channel_name=result_channel,
                timeout=10,
            )
            if message is not None:
                logger.debug(f"Successfully fetched result for job {job_id}.")
                return message
            else:
                logger.warning(f"fetch_message for {job_id} returned None unexpectedly.")
                raise TimeoutError("No data found (unexpected None response).")
        except (TimeoutError, redis.RedisError, ConnectionError, ValueError, RuntimeError) as e:
            logger.info(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
            raise e
        except Exception as e:
            logger.exception(f"Unexpected error during async fetch_job for {job_id}: {e}")
            raise RuntimeError(f"Unexpected error fetching job {job_id}") from e

    async def set_job_state(self, job_id: str, state: str) -> None:
        """
        Sets the explicit state of a job and refreshes its TTL.

        Parameters
        ----------
        job_id : str
            The unique identifier of the job.
        state : str
            The state to be assigned to the job.

        Returns
        -------
        None
        """
        state_key: str = f"{self._state_prefix}{job_id}"
        ttl_to_set: Optional[int] = self._state_ttl_seconds
        try:
            logger.debug(f"Setting state for {job_id} to {state} with TTL {ttl_to_set}")
            await asyncio.to_thread(
                self._ingest_client.get_client().set,
                state_key,
                state,
                ex=ttl_to_set,
            )
            logger.debug(f"Successfully set state for {job_id}.")
        except (redis.RedisError, ConnectionError) as err:
            logger.error(f"Failed to set state for {state_key}: {err}")
        except Exception as err:
            logger.exception(f"Unexpected error setting state for {state_key}: {err}")

    async def get_job_state(self, job_id: str) -> Optional[str]:
        """
        Retrieves the explicit state of a job.

        Parameters
        ----------
        job_id : str
            The unique identifier of the job.

        Returns
        -------
        str or None
            The state of the job, or None if not found or upon error.
        """
        state_key: str = f"{self._state_prefix}{job_id}"
        try:
            data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, state_key)
            if data_bytes:
                state: str = data_bytes.decode("utf-8")
                logger.debug(f"Retrieved state for {job_id}: {state}")
                return state
            else:
                logger.debug(f"No state found for {job_id} (key: {state_key})")
                return None
        except (redis.RedisError, ConnectionError) as err:
            logger.error(f"Redis error getting state for {state_key}: {err}")
            return None
        except Exception as err:
            logger.exception(f"Unexpected error getting state for {state_key}: {err}")
            return None

    async def set_processing_cache(self, job_id: str, jobs_data: List["ProcessingJob"]) -> None:
        """
        Stores processing jobs data in a simple key-value cache.

        Parameters
        ----------
        job_id : str
            The unique identifier of the job.
        jobs_data : list of ProcessingJob
            The processing job data to be cached.

        Returns
        -------
        None
        """
        cache_key: str = f"{self._cache_prefix}{job_id}"
        try:
            data_to_store: str = json.dumps([job.model_dump(mode="json") for job in jobs_data])
            await asyncio.to_thread(
                self._ingest_client.get_client().set,
                cache_key,
                data_to_store,
                ex=3600,
            )
        except Exception as err:
            logger.exception(f"Error setting cache for {cache_key}: {err}")

    async def get_processing_cache(self, job_id: str) -> List["ProcessingJob"]:
        """
        Retrieves processing jobs data from the simple key-value cache.

        Parameters
        ----------
        job_id : str
            The unique identifier of the job.

        Returns
        -------
        list of ProcessingJob
            A list of processing jobs, or an empty list if not found or upon error.
        """
        cache_key: str = f"{self._cache_prefix}{job_id}"
        try:
            data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, cache_key)
            if data_bytes is None:
                return []
            return [ProcessingJob(**job) for job in json.loads(data_bytes)]
        except Exception as err:
            logger.exception(f"Error getting cache for {cache_key}: {err}")
            return []

    async def get_fetch_mode(self) -> "FetchMode":
        """
        Returns the configured fetch mode for the service.

        Returns
        -------
        FetchMode
            The current fetch mode.
        """
        return self._fetch_mode
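The service is configured entirely from environment variables the first time get_instance() is called. A brief sketch of standing it up follows; the values are illustrative defaults and a reachable Redis instance is assumed.

# Minimal sketch, assuming Redis is reachable at the given host/port (illustration only).
import asyncio
import os

os.environ.setdefault("MESSAGE_CLIENT_HOST", "localhost")
os.environ.setdefault("MESSAGE_CLIENT_PORT", "6379")
os.environ.setdefault("FETCH_MODE", "NON_DESTRUCTIVE")  # unrecognized values fall back to DESTRUCTIVE
os.environ.setdefault("RESULT_DATA_TTL_SECONDS", "3600")
os.environ.setdefault("STATE_TTL_SECONDS", "7200")

service = RedisIngestService.get_instance()  # singleton; later calls reuse the same RedisClient


async def check_state(job_id: str) -> None:
    # Every Redis round-trip runs in a worker thread via asyncio.to_thread.
    await service.set_job_state(job_id, "SUBMITTED")
    print(await service.get_job_state(job_id))


asyncio.run(check_state("example-trace-id"))

Note that result-data TTLs are only applied when the fetch mode is NON_DESTRUCTIVE; in DESTRUCTIVE mode results are consumed on fetch.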
nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py (new file)
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from abc import ABC
from abc import abstractmethod
from typing import List, Optional

from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
from nv_ingest.framework.schemas.framework_processing_job_schema import ProcessingJob
from nv_ingest_api.util.service_clients.client_base import FetchMode


class IngestServiceMeta(ABC):
    @abstractmethod
    async def submit_job(self, job_spec: MessageWrapper, trace_id: str) -> str:
        """Abstract method for submitting one or more jobs to the ingestion pipeline"""

    @abstractmethod
    async def fetch_job(self, job_id: str):
        """Abstract method for fetching job from ingestion service based on job_id"""

    @abstractmethod
    async def set_processing_cache(self, job_id: str, jobs_data: List[ProcessingJob]) -> None:
        """Abstract method for setting processing cache"""

    @abstractmethod
    async def get_processing_cache(self, job_id: str) -> List[ProcessingJob]:
        """Abstract method for getting processing cache"""

    @abstractmethod
    async def set_job_state(self, job_id: str, state: str, ttl: int = 86400):
        """Abstract method for setting job state"""

    @abstractmethod
    async def get_job_state(self, job_id: str) -> Optional[str]:
        """Abstract method for getting job state"""

    @abstractmethod
    async def get_fetch_mode(self) -> FetchMode:
        """Abstract method for getting fetch mode"""
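For reference, a hypothetical in-memory implementation of this interface is sketched below. It is not part of the wheel; it only illustrates the async contract that RedisIngestService fulfills.

# Hypothetical in-memory implementation of IngestServiceMeta (illustration only).
from typing import Dict, List, Optional


class InMemoryIngestService(IngestServiceMeta):
    def __init__(self) -> None:
        self._jobs: Dict[str, MessageWrapper] = {}
        self._states: Dict[str, str] = {}
        self._cache: Dict[str, List[ProcessingJob]] = {}

    async def submit_job(self, job_spec: MessageWrapper, trace_id: str) -> str:
        self._jobs[trace_id] = job_spec  # record the job under its trace id
        return trace_id

    async def fetch_job(self, job_id: str):
        return self._jobs.get(job_id)  # None when the job is unknown

    async def set_processing_cache(self, job_id: str, jobs_data: List[ProcessingJob]) -> None:
        self._cache[job_id] = jobs_data

    async def get_processing_cache(self, job_id: str) -> List[ProcessingJob]:
        return self._cache.get(job_id, [])

    async def set_job_state(self, job_id: str, state: str, ttl: int = 86400):
        self._states[job_id] = state  # ttl ignored in this sketch

    async def get_job_state(self, job_id: str) -> Optional[str]:
        return self._states.get(job_id)

    async def get_fetch_mode(self) -> FetchMode:
        return FetchMode.NON_DESTRUCTIVE  # fixed mode for the sketch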