nv-ingest 2025.5.21.dev20250521__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest might be problematic.

Files changed (100)
  1. nv_ingest/__init__.py +20 -0
  2. nv_ingest/api/__init__.py +3 -0
  3. nv_ingest/api/main.py +43 -0
  4. nv_ingest/api/v1/__init__.py +3 -0
  5. nv_ingest/api/v1/health.py +114 -0
  6. nv_ingest/api/v1/ingest.py +454 -0
  7. nv_ingest/framework/__init__.py +3 -0
  8. nv_ingest/framework/orchestration/__init__.py +3 -0
  9. nv_ingest/framework/orchestration/ray/__init__.py +3 -0
  10. nv_ingest/framework/orchestration/ray/edges/__init__.py +3 -0
  11. nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +63 -0
  12. nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +73 -0
  13. nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +72 -0
  14. nv_ingest/framework/orchestration/ray/examples/__init__.py +3 -0
  15. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +408 -0
  16. nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +63 -0
  17. nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +94 -0
  18. nv_ingest/framework/orchestration/ray/primitives/__init__.py +3 -0
  19. nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  20. nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +239 -0
  21. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +574 -0
  22. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +1187 -0
  23. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +346 -0
  24. nv_ingest/framework/orchestration/ray/stages/__init__.py +3 -0
  25. nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +3 -0
  26. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +82 -0
  27. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +92 -0
  28. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +81 -0
  29. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +85 -0
  30. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +57 -0
  31. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +113 -0
  32. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +85 -0
  33. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +90 -0
  34. nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +3 -0
  35. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +97 -0
  36. nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +3 -0
  37. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +70 -0
  38. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +82 -0
  39. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +59 -0
  40. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +652 -0
  41. nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +3 -0
  42. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +85 -0
  43. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +84 -0
  44. nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +3 -0
  45. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +41 -0
  46. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +268 -0
  47. nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +3 -0
  48. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +502 -0
  49. nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +3 -0
  50. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +98 -0
  51. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +81 -0
  52. nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +3 -0
  53. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +66 -0
  54. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +3 -0
  55. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +205 -0
  56. nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +3 -0
  57. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +81 -0
  58. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +81 -0
  59. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +74 -0
  60. nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +3 -0
  61. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +65 -0
  62. nv_ingest/framework/orchestration/ray/util/__init__.py +3 -0
  63. nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +3 -0
  64. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +989 -0
  65. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +195 -0
  66. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +170 -0
  67. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +609 -0
  68. nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +3 -0
  69. nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +59 -0
  70. nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +309 -0
  71. nv_ingest/framework/schemas/__init__.py +0 -0
  72. nv_ingest/framework/schemas/framework_ingest_config_schema.py +54 -0
  73. nv_ingest/framework/schemas/framework_job_counter_schema.py +12 -0
  74. nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +18 -0
  75. nv_ingest/framework/schemas/framework_message_broker_source_schema.py +19 -0
  76. nv_ingest/framework/schemas/framework_message_wrapper_schema.py +5 -0
  77. nv_ingest/framework/schemas/framework_metadata_injector_schema.py +15 -0
  78. nv_ingest/framework/schemas/framework_otel_meter_schema.py +16 -0
  79. nv_ingest/framework/schemas/framework_otel_tracer_schema.py +12 -0
  80. nv_ingest/framework/schemas/framework_processing_job_schema.py +25 -0
  81. nv_ingest/framework/schemas/framework_task_injection_schema.py +15 -0
  82. nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +112 -0
  83. nv_ingest/framework/util/__init__.py +3 -0
  84. nv_ingest/framework/util/flow_control/__init__.py +8 -0
  85. nv_ingest/framework/util/flow_control/filter_by_task.py +227 -0
  86. nv_ingest/framework/util/service/__init__.py +3 -0
  87. nv_ingest/framework/util/service/impl/__init__.py +3 -0
  88. nv_ingest/framework/util/service/impl/ingest/__init__.py +3 -0
  89. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +395 -0
  90. nv_ingest/framework/util/service/meta/__init__.py +3 -0
  91. nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  92. nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +41 -0
  93. nv_ingest/framework/util/telemetry/__init__.py +3 -0
  94. nv_ingest/framework/util/telemetry/global_stats.py +145 -0
  95. nv_ingest/version.py +38 -0
  96. nv_ingest-2025.5.21.dev20250521.dist-info/METADATA +263 -0
  97. nv_ingest-2025.5.21.dev20250521.dist-info/RECORD +100 -0
  98. nv_ingest-2025.5.21.dev20250521.dist-info/WHEEL +5 -0
  99. nv_ingest-2025.5.21.dev20250521.dist-info/licenses/LICENSE +201 -0
  100. nv_ingest-2025.5.21.dev20250521.dist-info/top_level.txt +1 -0
nv_ingest/framework/util/flow_control/filter_by_task.py
@@ -0,0 +1,227 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import asyncio
+ import functools
+ import logging
+ import re
+ from typing import Dict, List, Any, Union, Tuple, Optional, Callable
+
+ from pydantic import BaseModel
+
+ logger = logging.getLogger(__name__)
+
+
+ def filter_by_task(
+     required_tasks: List[Union[str, Tuple[Any, ...]]],
+     forward_func: Optional[Callable[[Any], Any]] = None,
+ ) -> Callable:
+     """
+     Decorator that checks whether an IngestControlMessage contains any of the required tasks.
+     Supports both synchronous and asynchronous functions as well as class methods (with 'self').
+     If no required task is found, the original message is returned (or forward_func is called if provided).
+
+     Parameters
+     ----------
+     required_tasks : list[Union[str, Tuple[Any, ...]]]
+         A list of required tasks. Each element is either a string representing a task name
+         or a tuple/list where the first element is the task name and the remaining elements
+         specify required task properties.
+     forward_func : Optional[Callable[[Any], Any]]
+         A function to be called with the IngestControlMessage if no required task is found.
+
+     Returns
+     -------
+     Callable
+         A decorator wrapping a function that expects an IngestControlMessage as one of its first arguments.
+     """
+
+     def decorator(func: Callable) -> Callable:
+         # Helper to extract the IngestControlMessage from the arguments.
+         def extract_message(args: Tuple) -> Any:
+             if args and hasattr(args[0], "get_tasks"):
+                 return args[0]
+             elif len(args) > 1 and hasattr(args[1], "get_tasks"):
+                 return args[1]
+             else:
+                 raise ValueError(
+                     "The first or second argument must be an IngestControlMessage with task handling capabilities."
+                 )
+
+         if asyncio.iscoroutinefunction(func):
+
+             @functools.wraps(func)
+             async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
+                 message = extract_message(args)
+                 tasks: Dict[str, List[Any]] = {}
+                 for task in message.get_tasks():
+                     tasks.setdefault(task.type, []).append(task.properties)
+                 for required_task in required_tasks:
+                     # Case 1: required task is a simple string.
+                     if isinstance(required_task, str):
+                         if required_task in tasks:
+                             logger.debug(
+                                 "Task '%s' found. Proceeding with async function '%s'.",
+                                 required_task,
+                                 func.__name__,
+                             )
+                             return await func(*args, **kwargs)
+                         else:
+                             logger.debug("Required task '%s' not found.", required_task)
+                     # Case 2: required task is a tuple/list with properties.
+                     elif isinstance(required_task, (tuple, list)):
+                         required_task_name, *required_task_props_list = required_task
+                         if required_task_name not in tasks:
+                             continue
+                         for task_props in tasks.get(required_task_name, []):
+                             orig_task_props = task_props
+                             if isinstance(task_props, BaseModel):
+                                 task_props = task_props.model_dump()
+                             all_match = True
+                             for required_props in required_task_props_list:
+                                 if not _is_subset(task_props, required_props):
+                                     logger.debug(
+                                         "For task '%s', task properties %s do not match required subset %s.",
+                                         required_task_name,
+                                         orig_task_props,
+                                         required_props,
+                                     )
+                                     all_match = False
+                                     break
+                             if all_match:
+                                 logger.debug(
+                                     "Task '%s' with properties %s matched for function '%s'.",
+                                     required_task_name,
+                                     orig_task_props,
+                                     func.__name__,
+                                 )
+                                 return await func(*args, **kwargs)
+                     else:
+                         logger.debug("Invalid required task type: %s", type(required_task))
+                 logger.debug("No required task matched for function '%s'.", func.__name__)
+                 if forward_func:
+                     logger.debug("Calling forward function for IngestControlMessage.")
+                     return await forward_func(message)
+                 else:
+                     logger.debug("Returning original IngestControlMessage without processing.")
+                     return message
+
+             return async_wrapper
+         else:
+
+             @functools.wraps(func)
+             def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
+                 message = extract_message(args)
+                 tasks: Dict[str, List[Any]] = {}
+                 for task in message.get_tasks():
+                     tasks.setdefault(task.type, []).append(task.properties)
+                 for required_task in required_tasks:
+                     if isinstance(required_task, str):
+                         if required_task in tasks:
+                             logger.debug(
+                                 "Task '%s' found. Proceeding with function '%s'.", required_task, func.__name__
+                             )
+                             return func(*args, **kwargs)
+                         else:
+                             logger.debug("Required task '%s' not found.", required_task)
+                     elif isinstance(required_task, (tuple, list)):
+                         required_task_name, *required_task_props_list = required_task
+                         if required_task_name not in tasks:
+                             continue
+                         for task_props in tasks.get(required_task_name, []):
+                             orig_task_props = task_props
+                             if isinstance(task_props, BaseModel):
+                                 task_props = task_props.model_dump()
+                             all_match = True
+                             for required_props in required_task_props_list:
+                                 if not _is_subset(task_props, required_props):
+                                     logger.debug(
+                                         "For task '%s', task properties %s do not match required subset %s.",
+                                         required_task_name,
+                                         orig_task_props,
+                                         required_props,
+                                     )
+                                     all_match = False
+                                     break
+                             if all_match:
+                                 logger.debug(
+                                     "Task '%s' with properties %s matched for function '%s'.",
+                                     required_task_name,
+                                     orig_task_props,
+                                     func.__name__,
+                                 )
+                                 return func(*args, **kwargs)
+                     else:
+                         logger.debug("Invalid required task type: %s", type(required_task))
+                 logger.debug("No required task matched for function '%s'.", func.__name__)
+                 if forward_func:
+                     logger.debug("Calling forward function for IngestControlMessage.")
+                     return forward_func(message)
+                 else:
+                     logger.debug("Returning original IngestControlMessage without processing.")
+                     return message
+
+             return sync_wrapper
+
+     return decorator
+
+
+ def _is_subset(superset: Any, subset: Any) -> bool:
+     """
+     Recursively checks whether 'subset' is contained within 'superset'. Supports dictionaries,
+     lists, strings (including regex patterns), and basic types.
+
+     Parameters
+     ----------
+     superset : Any
+         The data structure (or value) that is expected to contain the subset.
+     subset : Any
+         The data structure (or value) to be checked for being a subset of 'superset'. A special
+         value "*" matches any value, and strings prefixed with "regex:" are treated as regular
+         expression patterns.
+
+     Returns
+     -------
+     bool
+         True if 'subset' is contained within 'superset', False otherwise.
+     """
+     if subset == "*":
+         return True
+     if isinstance(superset, dict) and isinstance(subset, dict):
+         for key, val in subset.items():
+             if key not in superset:
+                 logger.debug("Key '%s' not found in superset dictionary: %s", key, superset)
+                 return False
+             if not _is_subset(superset[key], val):
+                 logger.debug("Value for key '%s' (%s) does not match expected subset (%s).", key, superset[key], val)
+                 return False
+         return True
+     if isinstance(subset, str) and subset.startswith("regex:"):
+         pattern = subset[len("regex:") :]
+         if isinstance(superset, list):
+             for sup_item in superset:
+                 if re.match(pattern, sup_item):
+                     return True
+             logger.debug("No items in list %s match regex pattern '%s'.", superset, pattern)
+             return False
+         else:
+             if re.match(pattern, superset) is None:
+                 logger.debug("Value '%s' does not match regex pattern '%s'.", superset, pattern)
+                 return False
+             return True
+     if isinstance(superset, list) and not isinstance(subset, list):
+         for sup_item in superset:
+             if _is_subset(sup_item, subset):
+                 return True
+         logger.debug("None of the items in list %s match the value '%s'.", superset, subset)
+         return False
+     if isinstance(superset, (list, set)) and isinstance(subset, list):
+         for sub_item in subset:
+             if not any(_is_subset(sup_item, sub_item) for sup_item in superset):
+                 logger.debug("No element in %s matches subset element '%s'.", superset, sub_item)
+                 return False
+         return True
+     if superset != subset:
+         logger.debug("Direct comparison failed: %s != %s", superset, subset)
+     return superset == subset
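
The decorator above gates a stage function on the tasks carried by an IngestControlMessage, with _is_subset handling wildcard ("*") and "regex:"-prefixed property matching. A minimal usage sketch follows; FakeTask and FakeMessage are hypothetical stand-ins, not package classes — filter_by_task only needs a get_tasks() method yielding objects with type and properties attributes:

# Hedged sketch, not part of the package: FakeTask/FakeMessage are
# hypothetical stand-ins satisfying the interface filter_by_task expects.
from dataclasses import dataclass, field
from typing import Any, Dict, List

from nv_ingest.framework.util.flow_control.filter_by_task import filter_by_task


@dataclass
class FakeTask:
    type: str
    properties: Dict[str, Any]


@dataclass
class FakeMessage:
    tasks: List[FakeTask] = field(default_factory=list)

    def get_tasks(self) -> List[FakeTask]:
        return self.tasks


# Run the stage only when an "extract" task with document_type "pdf" is
# present; "*" and "regex:..." property values are also accepted by _is_subset.
@filter_by_task([("extract", {"document_type": "pdf"})])
def extract_stage(message: FakeMessage) -> FakeMessage:
    print("running PDF extraction")
    return message


extract_stage(FakeMessage([FakeTask("extract", {"document_type": "pdf"})]))  # stage body runs
extract_stage(FakeMessage())  # no matching task: message is returned unchanged
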
nv_ingest/framework/util/service/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/util/service/impl/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/util/service/impl/ingest/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py
@@ -0,0 +1,395 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import asyncio
+ import json
+ import logging
+ import os
+ from json import JSONDecodeError
+ from typing import Optional, Dict, Any
+
+ from typing import List
+
+ import redis
+
+ from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
+ from nv_ingest.framework.schemas.framework_processing_job_schema import ProcessingJob
+ from nv_ingest.framework.util.service.meta.ingest.ingest_service_meta import IngestServiceMeta
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_ingest_job
+ from nv_ingest_api.util.service_clients.client_base import FetchMode
+ from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
+
+ logger = logging.getLogger("uvicorn")
+
+
+ def get_fetch_mode_from_env() -> "FetchMode":
+     """
+     Retrieves the fetch mode from the environment variable FETCH_MODE.
+
+     Returns
+     -------
+     FetchMode
+         The fetch mode from FETCH_MODE: NON_DESTRUCTIVE when unset, or DESTRUCTIVE if the value is invalid.
+     """
+     mode_str: str = os.getenv("FETCH_MODE", "NON_DESTRUCTIVE").upper()
+     try:
+         return FetchMode[mode_str]
+     except KeyError:
+         logger.warning(f"Invalid FETCH_MODE '{mode_str}' in environment. Defaulting to DESTRUCTIVE.")
+         return FetchMode.DESTRUCTIVE
+
+
+ class RedisIngestService(IngestServiceMeta):
+     """
+     Submits jobs and fetches results via Redis, supporting multiple fetch modes
+     and state management with TTLs. Operates asynchronously using asyncio.to_thread
+     for synchronous Redis client operations.
+     """
+
+     _concurrency_level: int = int(os.getenv("CONCURRENCY_LEVEL", "10"))
+     __shared_instance: Optional["RedisIngestService"] = None
+
+     @staticmethod
+     def get_instance() -> "RedisIngestService":
+         """
+         Static access method implementing the Singleton pattern.
+
+         Returns
+         -------
+         RedisIngestService
+             The singleton instance of the RedisIngestService.
+         """
+         if RedisIngestService.__shared_instance is None:
+             redis_host: str = os.getenv("MESSAGE_CLIENT_HOST", "localhost")
+             redis_port: int = int(os.getenv("MESSAGE_CLIENT_PORT", "6379"))
+             redis_task_queue: str = os.getenv("REDIS_INGEST_TASK_QUEUE", "ingest_task_queue")
+
+             fetch_mode: "FetchMode" = get_fetch_mode_from_env()
+             result_data_ttl: int = int(os.getenv("RESULT_DATA_TTL_SECONDS", "3600"))
+             state_ttl: int = int(os.getenv("STATE_TTL_SECONDS", "7200"))
+
+             cache_config: Dict[str, Any] = {
+                 "directory": os.getenv("FETCH_CACHE_DIR", "./.fetch_cache"),
+                 "ttl": int(os.getenv("FETCH_CACHE_TTL_SECONDS", "3600")),
+             }
+             use_ssl: bool = os.getenv("REDIS_USE_SSL", "false").lower() == "true"
+
+             RedisIngestService.__shared_instance = RedisIngestService(
+                 redis_hostname=redis_host,
+                 redis_port=redis_port,
+                 redis_task_queue=redis_task_queue,
+                 fetch_mode=fetch_mode,
+                 result_data_ttl_seconds=result_data_ttl if result_data_ttl > 0 else None,
+                 state_ttl_seconds=state_ttl if state_ttl > 0 else None,
+                 cache_config=cache_config,
+                 use_ssl=use_ssl,
+             )
+             logger.debug(f"RedisIngestService configured with FetchMode: {fetch_mode.name}")
+         else:
+             logger.debug("Returning existing RedisIngestService Singleton instance.")
+         return RedisIngestService.__shared_instance
+
+     def __init__(
+         self,
+         redis_hostname: str,
+         redis_port: int,
+         redis_task_queue: str,
+         fetch_mode: "FetchMode",
+         result_data_ttl_seconds: Optional[int],
+         state_ttl_seconds: Optional[int],
+         cache_config: Optional[Dict[str, Any]],
+         use_ssl: bool,
+     ) -> None:
+         """
+         Initializes the service and the underlying RedisClient.
+
+         Parameters
+         ----------
+         redis_hostname : str
+             Redis server hostname.
+         redis_port : int
+             Redis server port.
+         redis_task_queue : str
+             The Redis queue name for tasks.
+         fetch_mode : FetchMode
+             The fetch mode configuration.
+         result_data_ttl_seconds : int or None
+             TTL for result data in seconds, or None to disable.
+         state_ttl_seconds : int or None
+             TTL for the job state record, or None to disable.
+         cache_config : dict or None
+             Configuration for caching.
+         use_ssl : bool
+             Whether to use SSL for the Redis connection.
+         """
+         self._redis_hostname: str = redis_hostname
+         self._redis_port: int = redis_port
+         self._redis_task_queue: str = redis_task_queue
+         self._fetch_mode: "FetchMode" = fetch_mode
+         self._result_data_ttl_seconds: Optional[int] = result_data_ttl_seconds
+         self._state_ttl_seconds: Optional[int] = state_ttl_seconds
+
+         self._bulk_vdb_cache_prefix: str = "vdb_bulk_upload_cache:"
+         self._cache_prefix: str = "processing_cache:"
+         self._state_prefix: str = "job_state:"
+
+         self._ingest_client = RedisClient(
+             host=self._redis_hostname,
+             port=self._redis_port,
+             max_pool_size=self._concurrency_level,
+             fetch_mode=self._fetch_mode,
+             cache_config=cache_config,
+             message_ttl_seconds=self._result_data_ttl_seconds,
+             use_ssl=use_ssl,
+             max_retries=int(os.getenv("REDIS_MAX_RETRIES", "3")),
+             max_backoff=int(os.getenv("REDIS_MAX_BACKOFF", "32")),
+             connection_timeout=int(os.getenv("REDIS_CONNECTION_TIMEOUT", "300")),
+         )
+         logger.debug(
+             f"RedisClient initialized for service. Host: {redis_hostname}:{redis_port}, "
+             f"FetchMode: {fetch_mode.name}, ResultTTL: {result_data_ttl_seconds}, StateTTL: {state_ttl_seconds}"
+         )
+
+     async def submit_job(self, job_spec_wrapper: "MessageWrapper", trace_id: str) -> str:
+         """
+         Validates, prepares, and submits a job specification to the Redis task queue.
+         Sets result data TTL if configured for NON_DESTRUCTIVE mode.
+
+         Parameters
+         ----------
+         job_spec_wrapper : MessageWrapper
+             A wrapper containing the job specification payload.
+         trace_id : str
+             A unique identifier for the job.
+
+         Returns
+         -------
+         str
+             The job trace_id.
+
+         Raises
+         ------
+         ValueError
+             If the payload is missing or invalid.
+         JSONDecodeError, TypeError
+             For payload parsing errors.
+         RedisError, ConnectionError
+             For Redis-related errors.
+         """
+         try:
+             json_data = job_spec_wrapper.model_dump(mode="json").get("payload")
+             if not json_data:
+                 raise ValueError("MessageWrapper payload is missing or empty.")
+             if isinstance(json_data, str):
+                 job_spec = json.loads(json_data)
+             elif isinstance(json_data, dict):
+                 job_spec = json_data
+             else:
+                 raise TypeError(f"Unexpected payload type: {type(json_data)}")
+
+             validate_ingest_job(job_spec)
+             job_spec["job_id"] = trace_id
+             tasks = job_spec.get("tasks", [])
+             updated_tasks = []
+             for task in tasks:
+                 task_prop = task.get("task_properties", {})
+                 if hasattr(task_prop, "model_dump") and callable(task_prop.model_dump):
+                     task["task_properties"] = task_prop.model_dump(mode="json")
+                 elif not isinstance(task_prop, dict):
+                     try:
+                         task["task_properties"] = dict(task_prop)
+                     except (TypeError, ValueError):
+                         logger.error(f"Cannot convert task_properties to dict: {task_prop}. Skipping properties.")
+                         task["task_properties"] = {}
+                 updated_tasks.append(task)
+             job_spec["tasks"] = updated_tasks
+             job_spec_json = json.dumps(job_spec)
+             ttl_for_result: Optional[int] = (
+                 self._result_data_ttl_seconds if self._fetch_mode == FetchMode.NON_DESTRUCTIVE else None
+             )
+             logger.debug(
+                 f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
+             )
+             await asyncio.to_thread(
+                 self._ingest_client.submit_message,
+                 channel_name=self._redis_task_queue,
+                 message=job_spec_json,
+                 ttl_seconds=ttl_for_result,
+             )
+             logger.debug(f"Successfully submitted job {trace_id}")
+             return trace_id
+         except (JSONDecodeError, TypeError, ValueError) as err:
+             logger.exception(f"Data validation or parsing error for job {trace_id}: {err}")
+             raise ValueError(f"Invalid job specification: {err}") from err
+         except (redis.RedisError, ConnectionError) as err:
+             logger.exception(f"Redis error submitting job {trace_id}: {err}")
+             raise err
+         except Exception as err:
+             logger.exception(f"Unexpected error submitting job {trace_id}: {err}")
+             raise
+
+     async def fetch_job(self, job_id: str) -> Optional[Dict]:
+         """
+         Fetches the job result using the configured RedisClient fetch mode and timeout.
+         Executes the synchronous client call asynchronously.
+
+         Parameters
+         ----------
+         job_id : str
+             The unique identifier of the job.
+
+         Returns
+         -------
+         dict or None
+             The job result message.
+
+         Raises
+         ------
+         TimeoutError, RedisError, ConnectionError, ValueError, RuntimeError
+             If the fetch operation fails.
+         """
+         try:
+             result_channel: str = f"{job_id}"
+             logger.debug(f"Attempting to fetch job result for {job_id} using mode {self._fetch_mode.name}")
+             message = await asyncio.to_thread(
+                 self._ingest_client.fetch_message,
+                 channel_name=result_channel,
+                 timeout=10,
+             )
+             if message is not None:
+                 logger.debug(f"Successfully fetched result for job {job_id}.")
+                 return message
+             else:
+                 logger.warning(f"fetch_message for {job_id} returned None unexpectedly.")
+                 raise TimeoutError("No data found (unexpected None response).")
+         except (TimeoutError, redis.RedisError, ConnectionError, ValueError, RuntimeError) as e:
+             logger.info(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
+             raise e
+         except Exception as e:
+             logger.exception(f"Unexpected error during async fetch_job for {job_id}: {e}")
+             raise RuntimeError(f"Unexpected error fetching job {job_id}") from e
+
+     async def set_job_state(self, job_id: str, state: str) -> None:
+         """
+         Sets the explicit state of a job and refreshes its TTL.
+
+         Parameters
+         ----------
+         job_id : str
+             The unique identifier of the job.
+         state : str
+             The state to be assigned to the job.
+
+         Returns
+         -------
+         None
+         """
+         state_key: str = f"{self._state_prefix}{job_id}"
+         ttl_to_set: Optional[int] = self._state_ttl_seconds
+         try:
+             logger.debug(f"Setting state for {job_id} to {state} with TTL {ttl_to_set}")
+             await asyncio.to_thread(
+                 self._ingest_client.get_client().set,
+                 state_key,
+                 state,
+                 ex=ttl_to_set,
+             )
+             logger.debug(f"Successfully set state for {job_id}.")
+         except (redis.RedisError, ConnectionError) as err:
+             logger.error(f"Failed to set state for {state_key}: {err}")
+         except Exception as err:
+             logger.exception(f"Unexpected error setting state for {state_key}: {err}")
+
+     async def get_job_state(self, job_id: str) -> Optional[str]:
+         """
+         Retrieves the explicit state of a job.
+
+         Parameters
+         ----------
+         job_id : str
+             The unique identifier of the job.
+
+         Returns
+         -------
+         str or None
+             The state of the job, or None if not found or upon error.
+         """
+         state_key: str = f"{self._state_prefix}{job_id}"
+         try:
+             data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, state_key)
+             if data_bytes:
+                 state: str = data_bytes.decode("utf-8")
+                 logger.debug(f"Retrieved state for {job_id}: {state}")
+                 return state
+             else:
+                 logger.debug(f"No state found for {job_id} (key: {state_key})")
+                 return None
+         except (redis.RedisError, ConnectionError) as err:
+             logger.error(f"Redis error getting state for {state_key}: {err}")
+             return None
+         except Exception as err:
+             logger.exception(f"Unexpected error getting state for {state_key}: {err}")
+             return None
+
+     async def set_processing_cache(self, job_id: str, jobs_data: List["ProcessingJob"]) -> None:
+         """
+         Stores processing jobs data in a simple key-value cache.
+
+         Parameters
+         ----------
+         job_id : str
+             The unique identifier of the job.
+         jobs_data : list of ProcessingJob
+             The processing job data to be cached.
+
+         Returns
+         -------
+         None
+         """
+         cache_key: str = f"{self._cache_prefix}{job_id}"
+         try:
+             data_to_store: str = json.dumps([job.model_dump(mode="json") for job in jobs_data])
+             await asyncio.to_thread(
+                 self._ingest_client.get_client().set,
+                 cache_key,
+                 data_to_store,
+                 ex=3600,
+             )
+         except Exception as err:
+             logger.exception(f"Error setting cache for {cache_key}: {err}")
+
+     async def get_processing_cache(self, job_id: str) -> List["ProcessingJob"]:
+         """
+         Retrieves processing jobs data from the simple key-value cache.
+
+         Parameters
+         ----------
+         job_id : str
+             The unique identifier of the job.
+
+         Returns
+         -------
+         list of ProcessingJob
+             A list of processing jobs, or an empty list if not found or upon error.
+         """
+         cache_key: str = f"{self._cache_prefix}{job_id}"
+         try:
+             data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, cache_key)
+             if data_bytes is None:
+                 return []
+             return [ProcessingJob(**job) for job in json.loads(data_bytes)]
+         except Exception as err:
+             logger.exception(f"Error getting cache for {cache_key}: {err}")
+             return []
+
+     async def get_fetch_mode(self) -> "FetchMode":
+         """
+         Returns the configured fetch mode for the service.
+
+         Returns
+         -------
+         FetchMode
+             The current fetch mode.
+         """
+         return self._fetch_mode
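
The service is configured entirely through environment variables read when the singleton is first built. A hedged sketch of wiring it up follows (it assumes a reachable Redis at localhost:6379 and the nv_ingest / nv_ingest_api packages installed; the values shown are the documented defaults, not requirements):

# Hedged sketch: configure RedisIngestService via the environment variables
# it reads, then exercise the state helpers. Assumes Redis is reachable.
import asyncio
import os

os.environ.setdefault("MESSAGE_CLIENT_HOST", "localhost")
os.environ.setdefault("MESSAGE_CLIENT_PORT", "6379")
os.environ.setdefault("FETCH_MODE", "NON_DESTRUCTIVE")    # invalid values fall back to DESTRUCTIVE
os.environ.setdefault("RESULT_DATA_TTL_SECONDS", "3600")  # <= 0 disables the result TTL

from nv_ingest.framework.util.service.impl.ingest.redis_ingest_service import RedisIngestService


async def main() -> None:
    service = RedisIngestService.get_instance()  # singleton, built from the env above
    await service.set_job_state("job-123", "SUBMITTED")
    print(await service.get_job_state("job-123"))  # expected: "SUBMITTED"
    print(await service.get_fetch_mode())          # expected: FetchMode.NON_DESTRUCTIVE


asyncio.run(main())

The state helpers are used here rather than submit_job because submit_job additionally requires a payload that passes validate_ingest_job.
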
nv_ingest/framework/util/service/meta/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/util/service/meta/ingest/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py
@@ -0,0 +1,41 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from abc import ABC
+ from abc import abstractmethod
+ from typing import List, Optional
+
+ from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
+ from nv_ingest.framework.schemas.framework_processing_job_schema import ProcessingJob
+ from nv_ingest_api.util.service_clients.client_base import FetchMode
+
+
+ class IngestServiceMeta(ABC):
+     @abstractmethod
+     async def submit_job(self, job_spec: MessageWrapper, trace_id: str) -> str:
+         """Abstract method for submitting one or more jobs to the ingestion pipeline"""
+
+     @abstractmethod
+     async def fetch_job(self, job_id: str):
+         """Abstract method for fetching a job from the ingestion service based on job_id"""
+
+     @abstractmethod
+     async def set_processing_cache(self, job_id: str, jobs_data: List[ProcessingJob]) -> None:
+         """Abstract method for setting the processing cache"""
+
+     @abstractmethod
+     async def get_processing_cache(self, job_id: str) -> List[ProcessingJob]:
+         """Abstract method for getting the processing cache"""
+
+     @abstractmethod
+     async def set_job_state(self, job_id: str, state: str, ttl: int = 86400):
+         """Abstract method for setting job state"""
+
+     @abstractmethod
+     async def get_job_state(self, job_id: str) -> Optional[str]:
+         """Abstract method for getting job state"""
+
+     @abstractmethod
+     async def get_fetch_mode(self) -> FetchMode:
+         """Abstract method for getting the fetch mode"""