nv-ingest-api 2025.10.22.dev20251022__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +4 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +124 -14
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +10 -1
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +10 -1
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +6 -4
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +4 -2
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +9 -1
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +39 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +9 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
- nv_ingest_api/internal/transform/embed_text.py +82 -0
- nv_ingest_api/util/dataloader/dataloader.py +20 -9
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +1 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +8 -2
- nv_ingest_api/util/service_clients/redis/redis_client.py +160 -0
- {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/METADATA +2 -1
- {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/RECORD +25 -23
- {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0

nv_ingest_api/internal/transform/embed_text.py

@@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from typing import Any, Dict, Tuple, Optional, Iterable, List
 
+import glom
 import pandas as pd
 from openai import OpenAI
 
@@ -282,6 +283,33 @@ def _add_embeddings(row, embeddings, info_msgs):
     return row
 
 
+def _add_custom_embeddings(row, embeddings, result_target_field):
+    """
+    Updates a DataFrame row with embedding data and associated error info
+    based on a user supplied custom content field.
+
+    Parameters
+    ----------
+    row : pandas.Series
+        A row of the DataFrame.
+    embeddings : dict
+        Dictionary mapping row indices to embeddings.
+    result_target_field: str
+        The field in custom_content to output the embeddings to
+
+    Returns
+    -------
+    pandas.Series
+        The updated row
+    """
+    embedding = embeddings.get(row.name, None)
+
+    if embedding is not None:
+        row["metadata"] = glom.assign(row["metadata"], "custom_content." + result_target_field, embedding, missing=dict)
+
+    return row
+
+
 def _format_image_input_string(image_b64: Optional[str]) -> str:
     if not image_b64:
         return
@@ -381,6 +409,20 @@ def _get_pandas_audio_content(row, modality="text"):
     return row.get("audio_metadata", {}).get("audio_transcript")
 
 
+def _get_pandas_custom_content(row, custom_content_field):
+    custom_content = row.get("custom_content", {})
+    content = glom.glom(custom_content, custom_content_field, default=None)
+    if content is None:
+        logger.warning(f"Custom content field: {custom_content_field} not found")
+        return None
+
+    try:
+        return str(content)
+    except (TypeError, ValueError):
+        logger.warning(f"Cannot convert custom content field: {custom_content_field} to string")
+        return None
+
+
 # ------------------------------------------------------------------------------
 # Batch Processing Utilities
 # ------------------------------------------------------------------------------
@@ -519,6 +561,7 @@ def transform_create_text_embeddings_internal(
     api_key = task_config.get("api_key") or transform_config.api_key
     endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
     model_name = task_config.get("model_name") or transform_config.embedding_model
+    custom_content_field = task_config.get("custom_content_field") or transform_config.custom_content_field
 
     if execution_trace_log is None:
         execution_trace_log = {}
@@ -612,4 +655,43 @@ def transform_create_text_embeddings_internal(
         content_masks.append(content_mask)
 
     combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
+
+    # Embed custom content
+    if custom_content_field is not None:
+        result_target_field = task_config.get("result_target_field") or custom_content_field + "_embedding"
+
+        extracted_custom_content = (
+            combined_df["metadata"]
+            .apply(partial(_get_pandas_custom_content, custom_content_field=custom_content_field))
+            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
+        )
+
+        valid_custom_content_mask = extracted_custom_content.notna()
+        if valid_custom_content_mask.any():
+            custom_content_list = extracted_custom_content[valid_custom_content_mask].to_list()
+            custom_content_batches = _generate_batches(custom_content_list, batch_size=transform_config.batch_size)
+
+            custom_content_embeddings = _async_runner(
+                custom_content_batches,
+                api_key,
+                endpoint_url,
+                model_name,
+                transform_config.encoding_format,
+                transform_config.input_type,
+                transform_config.truncate,
+                False,
+            )
+            custom_embeddings_dict = dict(
+                zip(
+                    extracted_custom_content.loc[valid_custom_content_mask].index,
+                    custom_content_embeddings.get("embeddings", []),
+                )
+            )
+        else:
+            custom_embeddings_dict = {}
+
+        combined_df = combined_df.apply(
+            _add_custom_embeddings, embeddings=custom_embeddings_dict, result_target_field=result_target_field, axis=1
+        )
+
     return combined_df, {"trace_info": execution_trace_log}
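
The custom-content path above reads and writes nested metadata through glom dotted paths. A minimal standalone sketch of the two calls involved (the sample data is hypothetical, not from the package):

import glom

metadata = {"custom_content": {"notes": "hello world"}}

# Dotted-path read with a default, as in _get_pandas_custom_content:
text = glom.glom(metadata["custom_content"], "notes", default=None)  # -> "hello world"

# Dotted-path write that creates missing intermediate dicts, as in _add_custom_embeddings:
metadata = glom.assign(metadata, "custom_content.notes_embedding", [0.1, 0.2], missing=dict)
print(metadata["custom_content"]["notes_embedding"])  # -> [0.1, 0.2]

With missing=dict, assign creates the custom_content container when a row lacks it, so rows without prior custom content still receive the embedding field.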

nv_ingest_api/util/dataloader/dataloader.py

@@ -254,22 +254,29 @@ else:
         file = None
         try:
             for file in paths:
+                if thread_stop.is_set():
+                    return
                 if isinstance(file, tuple):
                     video_file, audio_file = file
+                    if thread_stop.is_set():
+                        return
                     with open(video_file, "rb") as f:
                         video = f.read()
+                    if thread_stop.is_set():
+                        return
                     with open(audio_file, "rb") as f:
                         audio = f.read()
                     queue.put((video, audio))
                 else:
-                    if thread_stop:
+                    if thread_stop.is_set():
                         return
                     with open(file, "rb") as f:
                         queue.put(f.read())
         except Exception as e:
             logging.error(f"Error processing file {file}: {e}")
             queue.put(RuntimeError(f"Error processing file {file}: {e}"))
-
+        finally:
+            queue.put(StopIteration)
@@ -290,7 +297,7 @@ else:
         ):
             interface = interface if interface else MediaInterface()
             self.thread = None
-            self.thread_stop =
+            self.thread_stop = threading.Event()
             self.queue = queue.Queue(size)
             self.path = Path(path)
             self.output_dir = output_dir
@@ -323,16 +330,20 @@ else:
             Reset itertor by stopping the thread and clearing the queue.
             """
             if self.thread:
-                self.thread_stop
+                self.thread_stop.set()
                 self.thread.join()
-
-
-
-                self.queue.
+                self.thread = None
+            try:
+                while True:
+                    self.queue.get_nowait()
+            except Exception:
+                pass
+            finally:
+                self.thread_stop.clear()
 
         def __iter__(self):
             self.stop()
-            self.thread_stop
+            self.thread_stop.clear()
             self.thread = threading.Thread(
                 target=load_data,
                 args=(
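
The dataloader changes replace a truncated/boolean stop flag with a threading.Event checked before each blocking read, and push a StopIteration sentinel from a finally block so consumers always see an end marker. A minimal standalone sketch of that producer pattern (names are illustrative, not the package's):

import queue
import threading

def producer(q: queue.Queue, stop: threading.Event) -> None:
    try:
        for item in range(1000):
            if stop.is_set():  # cooperative cancellation point
                return
            q.put(item)
    finally:
        q.put(StopIteration)  # sentinel marking the end of the stream

q: queue.Queue = queue.Queue(maxsize=4)
stop = threading.Event()
t = threading.Thread(target=producer, args=(q, stop))
t.start()

for _ in range(3):
    print(q.get())

stop.set()  # request shutdown, then drain so a blocked put() can complete
while t.is_alive():
    try:
        q.get(timeout=0.1)
    except queue.Empty:
        pass
t.join()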

nv_ingest_api/util/message_brokers/qos_scheduler.py (new file)

@@ -0,0 +1,283 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import Dict, Optional
+import logging
+import time
+import random
+
+
+class _SchedulingStrategy:
+    """
+    Base scheduling strategy interface. Implementations must provide a non-blocking
+    single-sweep attempt over non-immediate queues and return a job or None.
+    """
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        raise NotImplementedError
+
+
+class _LotteryStrategy(_SchedulingStrategy):
+    """
+    Lottery scheduling with fixed weights.
+    Weights: micro=4, small=2, large=1, medium=1, default=1
+    """
+
+    def __init__(self, prioritize_immediate: bool = True) -> None:
+        self._weights: Dict[str, int] = {
+            "micro": 4,
+            "small": 2,
+            "large": 1,
+            "medium": 1,
+            "default": 1,
+        }
+        self._prioritize_immediate: bool = bool(prioritize_immediate)
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Immediate-first if enabled (non-blocking)
+        if self._prioritize_immediate:
+            try:
+                job = client.fetch_message(queues["immediate"], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+        candidates = list(order)
+        weights = [self._weights[q] for q in candidates]
+        while candidates:
+            try:
+                chosen = random.choices(candidates, weights=weights, k=1)[0]
+                job = client.fetch_message(queues[chosen], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+            finally:
+                idx = candidates.index(chosen)
+                del candidates[idx]
+                del weights[idx]
+        return None
+
+
+class _SimpleStrategy(_SchedulingStrategy):
+    """
+    Simple strategy placeholder. Actual simple-mode handling is done in QosScheduler.fetch_next
+    to directly fetch from the base 'default' queue using the provided timeout.
+    """
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Block up to 30s on the base/default queue and return first available job
+        try:
+            return client.fetch_message(queues["default"], 30.0)
+        except TimeoutError:
+            return None
+
+
+class _RoundRobinStrategy(_SchedulingStrategy):
+    """
+    Simple round-robin over non-immediate queues. Maintains rotation across calls.
+    """
+
+    def __init__(self, order: list[str], prioritize_immediate: bool = True) -> None:
+        self._order = list(order)
+        self._len = len(self._order)
+        self._idx = 0
+        self._prioritize_immediate: bool = bool(prioritize_immediate)
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Immediate-first if enabled (non-blocking)
+        if self._prioritize_immediate:
+            try:
+                job = client.fetch_message(queues["immediate"], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+        start_idx = self._idx
+        for step in range(self._len):
+            i = (start_idx + step) % self._len
+            qname = self._order[i]
+            try:
+                job = client.fetch_message(queues[qname], 0)
+                if job is not None:
+                    # advance rotation to the position after the chosen one
+                    self._idx = (i + 1) % self._len
+                    return job
+            except TimeoutError:
+                continue
+        return None
+
+
+class _WeightedRoundRobinStrategy(_SchedulingStrategy):
+    """
+    Smooth Weighted Round Robin (SWRR) using weights micro=4, small=2, large=1, medium=1, default=1.
+    Maintains current weights across calls.
+    """
+
+    def __init__(self, prioritize_immediate: bool = True) -> None:
+        self._weights: Dict[str, int] = {
+            "micro": 4,
+            "small": 2,
+            "large": 1,
+            "medium": 1,
+            "default": 1,
+        }
+        self._current: Dict[str, int] = {k: 0 for k in self._weights.keys()}
+        self._total: int = sum(self._weights.values())
+        self._prioritize_immediate: bool = bool(prioritize_immediate)
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Immediate-first if enabled (non-blocking)
+        if self._prioritize_immediate:
+            try:
+                job = client.fetch_message(queues["immediate"], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+        # Attempt up to len(order) selections per sweep, excluding queues that prove empty
+        active = list(order)
+        for _ in range(len(order)):
+            if not active:
+                break
+            for q in active:
+                self._current[q] += self._weights[q]
+            chosen = max(active, key=lambda q: self._current[q])
+            self._current[chosen] -= self._total
+            try:
+                job = client.fetch_message(queues[chosen], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                job = None
+            # If no job available from chosen, exclude it for the remainder of this sweep
+            if job is None and chosen in active:
+                active.remove(chosen)
+        # Fallback: single non-blocking attempt for each queue in order
+        for q in order:
+            try:
+                job = client.fetch_message(queues[q], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                continue
+        return None
+
+
+class QosScheduler:
+    """
+    Simplified scheduler that fetches jobs from the default queue only.
+    Uses the provided timeout value when polling the broker.
+    """
+
+    def __init__(
+        self,
+        base_queue: str,
+        total_buffer_capacity: int = 1,
+        num_prefetch_threads: int = 0,
+        prefetch_poll_interval: float = 0.0,
+        prefetch_non_immediate: bool = False,
+        strategy: str = "lottery",
+        prioritize_immediate: bool = True,
+    ) -> None:
+        self.base_queue = base_queue
+
+        # Define all derived queues; default behavior still uses only "default"
+        self.queues: Dict[str, str] = {
+            "default": f"{base_queue}",
+            "immediate": f"{base_queue}_immediate",
+            "micro": f"{base_queue}_micro",
+            "small": f"{base_queue}_small",
+            "medium": f"{base_queue}_medium",
+            "large": f"{base_queue}_large",
+        }
+
+        # Priority order for multi-queue fetching; "immediate" always first
+        self._priority_order = [
+            "immediate",
+            "micro",
+            "small",
+            "medium",
+            "large",
+            "default",
+        ]
+
+        # Non-immediate queue order reference
+        self._non_immediate_order = ["micro", "small", "large", "medium", "default"]
+
+        # Logger
+        self._logger = logging.getLogger(__name__)
+
+        # No prefetching - just direct calls
+        self._total_buffer_capacity: int = int(total_buffer_capacity)
+        self._num_prefetch_threads: int = int(num_prefetch_threads)
+        self._prefetch_poll_interval: float = float(prefetch_poll_interval)
+        self._prefetch_non_immediate: bool = bool(prefetch_non_immediate)
+
+        # Strategy selection
+        self._simple_mode: bool = False
+        if strategy == "simple":
+            self._strategy_impl: _SchedulingStrategy = _SimpleStrategy()
+            self._simple_mode = True
+        elif strategy == "round_robin":
+            self._strategy_impl = _RoundRobinStrategy(self._non_immediate_order, prioritize_immediate)
+        elif strategy == "weighted_round_robin":
+            self._strategy_impl = _WeightedRoundRobinStrategy(prioritize_immediate)
+        else:
+            self._strategy_impl = _LotteryStrategy(prioritize_immediate)
+
+    # Context manager helpers for clean shutdown
+    def __enter__(self) -> "QosScheduler":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+    # ---------------------------- Public API ----------------------------
+    def close(self) -> None:
+        """
+        Cleanly close the scheduler. No-op for the current implementation
+        since we do not spin background threads.
+        """
+        return None
+
+    def fetch_next(self, client, timeout: float = 0.0) -> Optional[dict]:
+        """
+        Immediate-first, then strategy-based scheduling among non-immediate queues.
+
+        Behavior:
+        - Always check 'immediate' first (non-blocking). If present, return immediately.
+        - If not, select using the configured strategy (lottery, round_robin, weighted_round_robin).
+        - If no job is found in a full pass:
+          - If timeout <= 0: return None.
+          - Else: sleep in 0.5s increments and retry until accumulated elapsed time >= timeout.
+        """
+        # Simple mode: delegate to the strategy (blocks up to 30s on base queue)
+        if getattr(self, "_simple_mode", False):
+            return self._strategy_impl.try_once(client, self.queues, self._non_immediate_order)
+
+        start = time.monotonic()
+        while True:
+            # Strategy-based attempt (strategy may include immediate priority internally)
+            job = self._strategy_impl.try_once(client, self.queues, self._non_immediate_order)
+            if job is not None:
+                return job
+
+            # No job found in this sweep
+            if timeout <= 0:
+                return None
+
+            elapsed = time.monotonic() - start
+            if elapsed >= timeout:
+                return None
+
+            # Sleep up to 0.5s, but not beyond remaining timeout
+            remaining = timeout - elapsed
+            sleep_time = 0.5 if remaining > 0.5 else remaining
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+            else:
+                return None
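
To make the scheduler's contract concrete: the only interface it needs from a client is fetch_message(queue_name, timeout) returning a job dict or None (or raising TimeoutError). A sketch with a hypothetical in-memory client (queue names and jobs are made up):

from collections import deque

class FakeClient:
    """In-memory stand-in for a broker client (illustrative only)."""

    def __init__(self, jobs: dict) -> None:
        self._jobs = {name: deque(items) for name, items in jobs.items()}

    def fetch_message(self, queue_name: str, timeout: float):
        q = self._jobs.get(queue_name)
        return q.popleft() if q else None

client = FakeClient(
    {
        "ingest_immediate": [{"id": "urgent-1"}],
        "ingest_micro": [{"id": "micro-1"}, {"id": "micro-2"}],
        "ingest_large": [{"id": "large-1"}],
    }
)

scheduler = QosScheduler(base_queue="ingest", strategy="weighted_round_robin")
while (job := scheduler.fetch_next(client, timeout=0)) is not None:
    print(job["id"])  # "urgent-1" drains first; micro jobs are then favored over large

Under smooth weighted round robin with total weight 9, "micro" is chosen roughly 4 of every 9 sweeps and each weight-1 queue once, while the immediate queue bypasses the weights entirely.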

nv_ingest_api/util/multi_processing/mp_pool_singleton.py

@@ -5,8 +5,9 @@
 
 import logging
 import math
-import multiprocessing as mp
 import os
+import sys
+import multiprocessing as mp
 from threading import Lock
 from typing import Any, Callable, Optional
 
@@ -103,7 +104,12 @@ class ProcessWorkerPoolSingleton:
             The total number of worker processes to start.
         """
         self._total_workers = total_max_workers
-
+
+        start_method = "fork"
+        if sys.platform.lower() == "darwin":
+            start_method = "spawn"
+        self._context: mp.context.ForkContext = mp.get_context(start_method)
+
         # Bounded task queue: maximum tasks queued = 2 * total_max_workers.
         self._task_queue: mp.Queue = self._context.Queue(maxsize=2 * total_max_workers)
         self._next_task_id: int = 0
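
The start-method switch exists because the "fork" start method is unreliable on macOS, so "spawn" is selected there (the "fork" branch assumes Linux, as in the diff). A minimal standalone sketch of the same selection, separate from the pool class:

import multiprocessing as mp
import sys

def square(x: int) -> int:
    return x * x

if __name__ == "__main__":  # required so "spawn" children can re-import this module
    start_method = "spawn" if sys.platform.lower() == "darwin" else "fork"
    ctx = mp.get_context(start_method)
    with ctx.Pool(processes=2) as pool:
        print(pool.map(square, range(4)))  # [0, 1, 4, 9]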

nv_ingest_api/util/service_clients/redis/redis_client.py

@@ -650,6 +650,22 @@ class RedisClient(MessageBrokerClientBase):
         except Exception as e:
             logger.exception(f"{log_prefix}: Cache read error: {e}. Trying Redis.")
 
+        # If caller requests non-blocking behavior (timeout <= 0), attempt immediate pop.
+        if timeout is not None and timeout <= 0:
+            try:
+                client = self.get_client()
+                popped = client.lpop(channel_name)
+                if popped is None:
+                    return None
+                try:
+                    return json.loads(popped)
+                except json.JSONDecodeError as e:
+                    logger.error(f"Failed to decode JSON from non-blocking LPOP on '{channel_name}': {e}")
+                    return None
+            except Exception as e:
+                logger.warning(f"Non-blocking LPOP failed for '{channel_name}': {e}")
+                return None
+
         while True:
             try:
                 fetch_result: Union[Dict[str, Any], List[Dict[str, Any]]]
@@ -711,6 +727,150 @@ class RedisClient(MessageBrokerClientBase):
                 logger.exception(f"{log_prefix}: Unexpected error during fetch: {e}")
                 raise ValueError(f"Unexpected error during fetch: {e}") from e
 
+    def fetch_message_from_any(self, channel_names: List[str], timeout: float = 0) -> Optional[Dict[str, Any]]:
+        """
+        Attempt to fetch a message from the first non-empty list among the provided channel names
+        using Redis BLPOP. If the popped item represents a fragmented message, this method will
+        continue popping from the same channel to reconstruct the full message.
+
+        Parameters
+        ----------
+        channel_names : List[str]
+            Ordered list of Redis list keys to attempt in priority order.
+        timeout : float, optional
+            Timeout in seconds to wait for any item across the provided lists. Redis supports
+            integer-second timeouts; sub-second values will be truncated.
+
+        Returns
+        -------
+        dict or None
+            The reconstructed message dictionary if an item was fetched; otherwise None on timeout.
+        """
+        if not channel_names:
+            return None
+
+        client = self.get_client()
+        blpop_timeout = int(max(0, timeout))
+        try:
+            res = client.blpop(channel_names, timeout=blpop_timeout)
+        except (redis.RedisError, ConnectionError) as e:
+            logger.debug(f"BLPOP error on {channel_names}: {e}")
+            return None
+
+        if res is None:
+            return None
+
+        list_key, first_bytes = res
+        if isinstance(list_key, bytes):
+            try:
+                list_key = list_key.decode("utf-8")
+            except Exception:
+                list_key = str(list_key)
+        # Decode first element
+        try:
+            first_msg = json.loads(first_bytes)
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to decode JSON popped from '{list_key}': {e}")
+            return None
+
+        expected_count: int = int(first_msg.get("fragment_count", 1))
+        if expected_count <= 1:
+            return first_msg
+
+        # Collect remaining fragments from the same list key
+        fragments: List[Dict[str, Any]] = [first_msg]
+        accumulated = 0.0
+        start_time = time.monotonic()
+        for i in range(1, expected_count):
+            remaining = max(0, timeout - accumulated)
+            per_frag_timeout = int(max(1, remaining)) if timeout else 1
+            try:
+                frag_res = client.blpop([list_key], timeout=per_frag_timeout)
+            except (redis.RedisError, ConnectionError) as e:
+                logger.error(f"BLPOP error while collecting fragments from '{list_key}': {e}")
+                return None
+            if frag_res is None:
+                logger.error(f"Timeout while collecting fragment {i}/{expected_count-1} from '{list_key}'")
+                return None
+            _, frag_key_bytes_or_val = frag_res
+            # Redis returns (key, value); we don't need the key here
+            frag_bytes = frag_key_bytes_or_val
+            try:
+                frag_msg = json.loads(frag_bytes)
+                fragments.append(frag_msg)
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to decode fragment JSON from '{list_key}': {e}")
+                return None
+            accumulated = time.monotonic() - start_time
+
+        # Combine and return
+        try:
+            return self._combine_fragments(fragments)
+        except Exception as e:
+            logger.error(f"Error combining fragments from '{list_key}': {e}")
+            return None
+
+    def fetch_message_from_any_with_key(
+        self, channel_names: List[str], timeout: float = 0
+    ) -> Optional[Tuple[str, Dict[str, Any]]]:
+        """
+        Like fetch_message_from_any(), but returns the Redis list key together with the message.
+        This is useful for higher-level schedulers that need to apply per-category quotas.
+        """
+        if not channel_names:
+            return None
+
+        client = self.get_client()
+        blpop_timeout = int(max(0, timeout))
+        try:
+            res = client.blpop(channel_names, timeout=blpop_timeout)
+        except (redis.RedisError, ConnectionError) as e:
+            logger.debug(f"BLPOP error on {channel_names}: {e}")
+            return None
+
+        if res is None:
+            return None
+
+        list_key, first_bytes = res
+        try:
+            first_msg = json.loads(first_bytes)
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to decode JSON popped from '{list_key}': {e}")
+            return None
+
+        expected_count: int = int(first_msg.get("fragment_count", 1))
+        if expected_count <= 1:
+            return list_key, first_msg
+
+        fragments: List[Dict[str, Any]] = [first_msg]
+        accumulated = 0.0
+        start_time = time.monotonic()
+        for i in range(1, expected_count):
+            remaining = max(0, timeout - accumulated)
+            per_frag_timeout = int(max(1, remaining)) if timeout else 1
+            try:
+                frag_res = client.blpop([list_key], timeout=per_frag_timeout)
+            except (redis.RedisError, ConnectionError) as e:
+                logger.error(f"BLPOP error while collecting fragments from '{list_key}': {e}")
+                return None
+            if frag_res is None:
+                logger.error(f"Timeout while collecting fragment {i}/{expected_count-1} from '{list_key}'")
+                return None
+            _, frag_bytes = frag_res
+            try:
+                frag_msg = json.loads(frag_bytes)
+                fragments.append(frag_msg)
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to decode fragment JSON from '{list_key}': {e}")
+                return None
+            accumulated = time.monotonic() - start_time
+
+        try:
+            return list_key, self._combine_fragments(fragments)
+        except Exception as e:
+            logger.error(f"Error combining fragments from '{list_key}': {e}")
+            return None
+
     @staticmethod
     def _combine_fragments(fragments: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
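
Both new methods build on redis-py's blpop, which scans the given keys in order and pops from the first non-empty list, so key order encodes priority; lpop is the non-blocking single-key counterpart used by the timeout <= 0 path above. A minimal sketch against a local Redis (the queue names are hypothetical):

import json
import redis

r = redis.Redis(host="localhost", port=6379)
r.rpush("jobs_low", json.dumps({"id": "low-1"}))
r.rpush("jobs_high", json.dumps({"id": "high-1"}))

# Blocking pop across keys: "jobs_high" is listed first, so it wins.
key, raw = r.blpop(["jobs_high", "jobs_low"], timeout=1)
print(key, json.loads(raw))  # b'jobs_high' {'id': 'high-1'}

# Non-blocking pop: returns None immediately when the list is empty.
print(r.lpop("jobs_empty"))  # None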

{nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.10.22.dev20251022
+Version: 2025.11.2.dev20251102
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -222,6 +222,7 @@ Requires-Dist: fsspec>=2025.5.1
 Requires-Dist: universal_pathlib>=0.2.6
 Requires-Dist: ffmpeg-python==0.2.0
 Requires-Dist: tritonclient
+Requires-Dist: glom
 Dynamic: license-file
 
 # nv-ingest-api