nv-ingest 2025.10.22.dev20251022__py3-none-any.whl → 2025.11.19.dev20251119__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/v2/README.md +44 -18
- nv_ingest/api/v2/ingest.py +409 -57
- nv_ingest/framework/orchestration/process/dependent_services.py +17 -10
- nv_ingest/framework/orchestration/process/strategies.py +6 -2
- nv_ingest/framework/orchestration/process/termination.py +49 -9
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +41 -8
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +33 -11
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +2 -2
- nv_ingest/pipeline/default_pipeline_impl.py +46 -21
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/METADATA +1 -2
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/RECORD +17 -16
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/top_level.txt +0 -0
|
@@ -19,20 +19,45 @@ logger = logging.getLogger(__name__)
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def _safe_log(level: int, msg: str) -> None:
|
|
22
|
-
"""Best-effort logging that won't
|
|
22
|
+
"""Best-effort logging that won't emit handler tracebacks on closed streams.
|
|
23
|
+
|
|
24
|
+
Temporarily disables logging.raiseExceptions to prevent the logging module
|
|
25
|
+
from printing "--- Logging error ---" to stderr if a handler's stream is
|
|
26
|
+
already closed (common during process teardown). Falls back to writing to
|
|
27
|
+
sys.__stderr__ if available.
|
|
28
|
+
"""
|
|
23
29
|
try:
|
|
24
|
-
|
|
30
|
+
import logging as _logging
|
|
31
|
+
|
|
32
|
+
prev = getattr(_logging, "raiseExceptions", True)
|
|
33
|
+
# Suppress handler errors being printed to stderr
|
|
34
|
+
_logging.raiseExceptions = False
|
|
35
|
+
|
|
36
|
+
# If there are no handlers, skip and use stderr fallback
|
|
37
|
+
if logger.handlers:
|
|
38
|
+
logger.log(level, msg)
|
|
39
|
+
return
|
|
25
40
|
except Exception:
|
|
41
|
+
# Intentionally ignore and try stderr fallback
|
|
42
|
+
pass
|
|
43
|
+
finally:
|
|
26
44
|
try:
|
|
27
|
-
#
|
|
28
|
-
import sys
|
|
45
|
+
import logging as _logging # re-import safe even if earlier failed
|
|
29
46
|
|
|
30
|
-
|
|
31
|
-
sys.__stderr__.write(msg + "\n")
|
|
32
|
-
sys.__stderr__.flush()
|
|
47
|
+
_logging.raiseExceptions = prev # type: ignore[name-defined]
|
|
33
48
|
except Exception:
|
|
34
49
|
pass
|
|
35
50
|
|
|
51
|
+
# Fallback to stderr if available
|
|
52
|
+
try:
|
|
53
|
+
import sys
|
|
54
|
+
|
|
55
|
+
if hasattr(sys, "__stderr__") and sys.__stderr__:
|
|
56
|
+
sys.__stderr__.write(msg + "\n")
|
|
57
|
+
sys.__stderr__.flush()
|
|
58
|
+
except Exception:
|
|
59
|
+
pass
|
|
60
|
+
|
|
36
61
|
|
|
37
62
|
def kill_pipeline_process_group(process) -> None:
|
|
38
63
|
"""
|
|
@@ -74,7 +99,17 @@ def kill_pipeline_process_group(process) -> None:
|
|
|
74
99
|
|
|
75
100
|
try:
|
|
76
101
|
# Send graceful termination to the entire process group
|
|
77
|
-
|
|
102
|
+
try:
|
|
103
|
+
pgid = os.getpgid(pid)
|
|
104
|
+
except Exception:
|
|
105
|
+
# Process already gone
|
|
106
|
+
_safe_log(logging.DEBUG, f"Process group for PID {pid} not found during SIGTERM phase")
|
|
107
|
+
return
|
|
108
|
+
try:
|
|
109
|
+
os.killpg(pgid, signal.SIGTERM)
|
|
110
|
+
except ProcessLookupError:
|
|
111
|
+
_safe_log(logging.DEBUG, f"Process group for PID {pid} no longer exists (SIGTERM)")
|
|
112
|
+
return
|
|
78
113
|
|
|
79
114
|
# If we have a Process handle, give it a chance to exit cleanly
|
|
80
115
|
if proc is not None and hasattr(proc, "join"):
|
|
@@ -95,7 +130,12 @@ def kill_pipeline_process_group(process) -> None:
|
|
|
95
130
|
if still_alive:
|
|
96
131
|
_safe_log(logging.WARNING, "Process group did not terminate gracefully, using SIGKILL")
|
|
97
132
|
try:
|
|
98
|
-
|
|
133
|
+
try:
|
|
134
|
+
pgid2 = os.getpgid(pid)
|
|
135
|
+
except Exception:
|
|
136
|
+
_safe_log(logging.DEBUG, f"Process group for PID {pid} vanished before SIGKILL")
|
|
137
|
+
return
|
|
138
|
+
os.killpg(pgid2, signal.SIGKILL)
|
|
99
139
|
finally:
|
|
100
140
|
if proc is not None and hasattr(proc, "join"):
|
|
101
141
|
try:
|
|
@@ -152,11 +152,11 @@ if __name__ == "__main__":
|
|
|
152
152
|
os.environ["OCR_MODEL_NAME"] = "paddle"
|
|
153
153
|
os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
154
154
|
os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
155
|
-
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/
|
|
155
|
+
os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
|
|
156
156
|
logger.info("Environment variables set.")
|
|
157
157
|
|
|
158
158
|
image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
159
|
-
model_name = "nvidia/
|
|
159
|
+
model_name = "nvidia/nemotron-nano-12b-v2-vl"
|
|
160
160
|
yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
|
|
161
161
|
(
|
|
162
162
|
yolox_table_structure_grpc,
|
|
@@ -5,7 +5,6 @@
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
from typing import Optional
|
|
8
|
-
|
|
9
8
|
import ray
|
|
10
9
|
|
|
11
10
|
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
|
|
|
67
66
|
# Extract the DataFrame payload.
|
|
68
67
|
df_ledger = control_message.payload()
|
|
69
68
|
self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
|
|
70
|
-
|
|
71
69
|
# Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
|
|
72
70
|
task_config = remove_task_by_type(control_message, "extract")
|
|
73
71
|
self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import ray
|
|
7
|
+
|
|
8
|
+
from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
|
|
9
|
+
from nv_ingest.framework.util.flow_control import filter_by_task
|
|
10
|
+
from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
|
|
11
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
|
|
12
|
+
from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
|
|
13
|
+
from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
|
|
14
|
+
from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@ray.remote
|
|
23
|
+
class OCRExtractorStage(RayActorStage):
|
|
24
|
+
"""
|
|
25
|
+
A Ray actor stage that extracts text data from image content.
|
|
26
|
+
|
|
27
|
+
It expects an IngestControlMessage containing a DataFrame with image data. It then:
|
|
28
|
+
1. Removes the "text_data_extract" task from the message.
|
|
29
|
+
2. Calls the text extraction logic using a validated configuration.
|
|
30
|
+
3. Updates the message payload with the extracted text DataFrame.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
|
|
34
|
+
super().__init__(config, log_to_stdout=False, stage_name=stage_name)
|
|
35
|
+
try:
|
|
36
|
+
self.validated_config = config
|
|
37
|
+
self._logger.info("OCRExtractorStage configuration validated successfully.")
|
|
38
|
+
except Exception as e:
|
|
39
|
+
self._logger.exception(f"Error validating Text extractor config: {e}")
|
|
40
|
+
raise
|
|
41
|
+
|
|
42
|
+
@nv_ingest_node_failure_try_except()
|
|
43
|
+
@traceable()
|
|
44
|
+
@udf_intercept_hook()
|
|
45
|
+
@filter_by_task(required_tasks=["ocr_data_extract"])
|
|
46
|
+
def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
|
|
47
|
+
# Extract DataFrame payload
|
|
48
|
+
df_ledger = control_message.payload()
|
|
49
|
+
if df_ledger.empty:
|
|
50
|
+
return control_message
|
|
51
|
+
|
|
52
|
+
# Remove the "text_data_extract" task from the message
|
|
53
|
+
task_config = remove_task_by_type(control_message, "ocr_data_extract")
|
|
54
|
+
|
|
55
|
+
execution_trace_log = {}
|
|
56
|
+
new_df, extraction_info = extract_text_data_from_image_internal(
|
|
57
|
+
df_extraction_ledger=df_ledger,
|
|
58
|
+
task_config=task_config,
|
|
59
|
+
extraction_config=self.validated_config,
|
|
60
|
+
execution_trace_log=execution_trace_log,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
control_message.payload(new_df)
|
|
64
|
+
control_message.set_metadata("ocr_extraction_info", extraction_info)
|
|
65
|
+
|
|
66
|
+
do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
|
|
67
|
+
if do_trace_tagging and execution_trace_log:
|
|
68
|
+
parent_name = self.stage_name if self.stage_name else "ocr_extractor"
|
|
69
|
+
set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
|
|
70
|
+
|
|
71
|
+
return control_message
|
|
@@ -30,6 +30,7 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
|
|
|
30
30
|
from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
|
|
31
31
|
from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
|
|
32
32
|
from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
|
|
33
|
+
from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler
|
|
33
34
|
|
|
34
35
|
logger = logging.getLogger(__name__)
|
|
35
36
|
|
|
@@ -89,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):
|
|
|
89
90
|
|
|
90
91
|
# Use the discriminated union for broker_client
|
|
91
92
|
broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
|
|
92
|
-
task_queue: str = Field(
|
|
93
|
-
|
|
93
|
+
task_queue: str = Field(
|
|
94
|
+
..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
|
|
95
|
+
)
|
|
96
|
+
poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")
|
|
94
97
|
|
|
95
98
|
|
|
96
99
|
@ray.remote
|
|
@@ -134,7 +137,29 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
134
137
|
self._current_backoff_sleep: float = 0.0
|
|
135
138
|
self._last_backoff_log_time: float = 0.0
|
|
136
139
|
|
|
137
|
-
|
|
140
|
+
# Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
|
|
141
|
+
strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
|
|
142
|
+
self.scheduler = QosScheduler(
|
|
143
|
+
self.task_queue,
|
|
144
|
+
num_prefetch_threads=6, # one per category (no-op for simple strategy)
|
|
145
|
+
total_buffer_capacity=96, # e.g., ~16 per thread
|
|
146
|
+
prefetch_poll_interval=0.002, # faster polling for responsiveness
|
|
147
|
+
prefetch_non_immediate=True, # enable prefetch for non-immediate categories
|
|
148
|
+
strategy=strategy,
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
self._logger.info(
|
|
152
|
+
"MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
|
|
153
|
+
self.task_queue,
|
|
154
|
+
{
|
|
155
|
+
"immediate": f"{self.task_queue}_immediate",
|
|
156
|
+
"micro": f"{self.task_queue}_micro",
|
|
157
|
+
"small": f"{self.task_queue}_small",
|
|
158
|
+
"medium": f"{self.task_queue}_medium",
|
|
159
|
+
"large": f"{self.task_queue}_large",
|
|
160
|
+
"default": f"{self.task_queue}",
|
|
161
|
+
},
|
|
162
|
+
)
|
|
138
163
|
|
|
139
164
|
# --- Private helper methods ---
|
|
140
165
|
def _create_client(self):
|
|
@@ -265,14 +290,21 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
265
290
|
|
|
266
291
|
return control_message
|
|
267
292
|
|
|
268
|
-
def _fetch_message(self, timeout=
|
|
293
|
+
def _fetch_message(self, timeout=0):
|
|
269
294
|
"""
|
|
270
|
-
Fetch a message from the message broker.
|
|
295
|
+
Fetch a message from the message broker using fair scheduling across derived queues.
|
|
296
|
+
This is a non-blocking sweep across all queues for the current scheduling cycle. If no
|
|
297
|
+
message is found across any queue, return None so the caller can sleep briefly.
|
|
271
298
|
"""
|
|
272
299
|
try:
|
|
273
|
-
|
|
300
|
+
# Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
|
|
301
|
+
job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
|
|
274
302
|
if job is None:
|
|
275
|
-
self._logger.debug(
|
|
303
|
+
self._logger.debug(
|
|
304
|
+
"No message received from derived queues for base "
|
|
305
|
+
"'%s' (immediate, micro, small, medium, large, default)",
|
|
306
|
+
self.task_queue,
|
|
307
|
+
)
|
|
276
308
|
# Do not treat normal empty polls as failures
|
|
277
309
|
self._fetch_failure_count = 0
|
|
278
310
|
self._current_backoff_sleep = 0.0
|
|
@@ -336,7 +368,8 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
|
|
|
336
368
|
Instead of reading from an input edge, fetch a message from the broker.
|
|
337
369
|
"""
|
|
338
370
|
self._logger.debug("read_input: calling _fetch_message()")
|
|
339
|
-
|
|
371
|
+
# Perform a non-blocking sweep across all queues for this cycle
|
|
372
|
+
job = self._fetch_message(timeout=0)
|
|
340
373
|
if job is None:
|
|
341
374
|
# Sleep for either the configured poll interval or the current backoff, whichever is larger
|
|
342
375
|
sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
|
|
@@ -218,12 +218,33 @@ class RedisIngestService(IngestServiceMeta):
|
|
|
218
218
|
ttl_for_result: Optional[int] = (
|
|
219
219
|
self._result_data_ttl_seconds if self._fetch_mode == FetchMode.NON_DESTRUCTIVE else None
|
|
220
220
|
)
|
|
221
|
+
# Determine target queue based on optional QoS hint
|
|
222
|
+
queue_hint = None
|
|
223
|
+
try:
|
|
224
|
+
routing_opts = job_spec.get("routing_options") or {}
|
|
225
|
+
tracing_opts = job_spec.get("tracing_options") or {}
|
|
226
|
+
queue_hint = routing_opts.get("queue_hint") or tracing_opts.get("queue_hint")
|
|
227
|
+
except Exception:
|
|
228
|
+
queue_hint = None
|
|
229
|
+
allowed = {"default", "immediate", "micro", "small", "medium", "large"}
|
|
230
|
+
if isinstance(queue_hint, str) and queue_hint in allowed:
|
|
231
|
+
if queue_hint == "default":
|
|
232
|
+
channel_name = self._redis_task_queue
|
|
233
|
+
else:
|
|
234
|
+
channel_name = f"{self._redis_task_queue}_{queue_hint}"
|
|
235
|
+
else:
|
|
236
|
+
channel_name = self._redis_task_queue
|
|
237
|
+
logger.debug(
|
|
238
|
+
f"Submitting job {trace_id} to queue '{channel_name}' (hint={queue_hint}) "
|
|
239
|
+
f"with result TTL: {ttl_for_result}"
|
|
240
|
+
)
|
|
241
|
+
|
|
221
242
|
logger.debug(
|
|
222
243
|
f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
|
|
223
244
|
)
|
|
224
245
|
await self._run_bounded_to_thread(
|
|
225
246
|
self._ingest_client.submit_message,
|
|
226
|
-
channel_name=
|
|
247
|
+
channel_name=channel_name,
|
|
227
248
|
message=job_spec_json,
|
|
228
249
|
ttl_seconds=ttl_for_result,
|
|
229
250
|
)
|
|
@@ -436,12 +457,13 @@ class RedisIngestService(IngestServiceMeta):
|
|
|
436
457
|
metadata_key = f"parent:{parent_job_id}:metadata"
|
|
437
458
|
|
|
438
459
|
try:
|
|
439
|
-
# Store subjob IDs as a set
|
|
440
|
-
|
|
441
|
-
self.
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
460
|
+
# Store subjob IDs as a set (only if there are subjobs)
|
|
461
|
+
if subjob_ids:
|
|
462
|
+
await self._run_bounded_to_thread(
|
|
463
|
+
self._ingest_client.get_client().sadd,
|
|
464
|
+
parent_key,
|
|
465
|
+
*subjob_ids,
|
|
466
|
+
)
|
|
445
467
|
|
|
446
468
|
# Store metadata as hash (including original subjob ordering for deterministic fetches)
|
|
447
469
|
metadata_to_store = dict(metadata)
|
|
@@ -500,21 +522,21 @@ class RedisIngestService(IngestServiceMeta):
|
|
|
500
522
|
metadata_key = f"parent:{parent_job_id}:metadata"
|
|
501
523
|
|
|
502
524
|
try:
|
|
503
|
-
# Check if this is a parent job
|
|
525
|
+
# Check if this is a parent job (check metadata_key since non-split PDFs may not have parent_key)
|
|
504
526
|
exists = await self._run_bounded_to_thread(
|
|
505
527
|
self._ingest_client.get_client().exists,
|
|
506
|
-
parent_key
|
|
528
|
+
metadata_key, # Check metadata instead of parent_key for non-split PDF support
|
|
507
529
|
)
|
|
508
530
|
|
|
509
531
|
if not exists:
|
|
510
532
|
return None
|
|
511
533
|
|
|
512
|
-
# Get subjob IDs
|
|
534
|
+
# Get subjob IDs (may be empty for non-split PDFs)
|
|
513
535
|
subjob_ids_bytes = await self._run_bounded_to_thread(
|
|
514
536
|
self._ingest_client.get_client().smembers,
|
|
515
537
|
parent_key,
|
|
516
538
|
)
|
|
517
|
-
subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes}
|
|
539
|
+
subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes} if subjob_ids_bytes else set()
|
|
518
540
|
|
|
519
541
|
# Get metadata
|
|
520
542
|
metadata_dict = await self._run_bounded_to_thread(
|
|
@@ -318,8 +318,8 @@ stages:
|
|
|
318
318
|
actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
|
|
319
319
|
config:
|
|
320
320
|
api_key: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
321
|
-
endpoint_url: $VLM_CAPTION_ENDPOINT|"
|
|
322
|
-
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/
|
|
321
|
+
endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
|
|
322
|
+
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
|
|
323
323
|
prompt: "Caption the content of this image:"
|
|
324
324
|
replicas:
|
|
325
325
|
min_replicas: 0
|
|
@@ -192,6 +192,27 @@ stages:
|
|
|
192
192
|
strategy: "static"
|
|
193
193
|
value: 1
|
|
194
194
|
|
|
195
|
+
- name: "ocr_extractor"
|
|
196
|
+
type: "stage"
|
|
197
|
+
phase: 1 # EXTRACTION
|
|
198
|
+
actor: "nv_ingest.framework.orchestration.ray.stages.extractors.ocr_extractor:OCRExtractorStage"
|
|
199
|
+
config:
|
|
200
|
+
endpoint_config:
|
|
201
|
+
ocr_endpoints: [
|
|
202
|
+
$OCR_GRPC_ENDPOINT|"ocr:8001",
|
|
203
|
+
$OCR_HTTP_ENDPOINT|"http://ocr:8000/v1/infer",
|
|
204
|
+
]
|
|
205
|
+
ocr_infer_protocol: $OCR_INFER_PROTOCOL|grpc
|
|
206
|
+
auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
207
|
+
replicas:
|
|
208
|
+
min_replicas: 0
|
|
209
|
+
max_replicas:
|
|
210
|
+
strategy: "static"
|
|
211
|
+
value: 4
|
|
212
|
+
static_replicas:
|
|
213
|
+
strategy: "static"
|
|
214
|
+
value: 3
|
|
215
|
+
|
|
195
216
|
- name: "infographic_extractor"
|
|
196
217
|
type: "stage"
|
|
197
218
|
phase: 1 # EXTRACTION
|
|
@@ -317,7 +338,8 @@ stages:
|
|
|
317
338
|
actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
|
|
318
339
|
config:
|
|
319
340
|
api_key: $NGC_API_KEY|$NVIDIA_API_KEY
|
|
320
|
-
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/
|
|
341
|
+
model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
|
|
342
|
+
endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
|
|
321
343
|
prompt: "Caption the content of this image:"
|
|
322
344
|
replicas:
|
|
323
345
|
min_replicas: 0
|
|
@@ -427,76 +449,79 @@ edges:
|
|
|
427
449
|
# Intake
|
|
428
450
|
- from: "source_stage"
|
|
429
451
|
to: "metadata_injector"
|
|
430
|
-
queue_size:
|
|
452
|
+
queue_size: 4
|
|
431
453
|
|
|
432
454
|
# Document Extractors
|
|
433
455
|
- from: "metadata_injector"
|
|
434
456
|
to: "pdf_extractor"
|
|
435
|
-
queue_size:
|
|
457
|
+
queue_size: 8
|
|
436
458
|
- from: "pdf_extractor"
|
|
437
459
|
to: "audio_extractor"
|
|
438
|
-
queue_size:
|
|
460
|
+
queue_size: 4
|
|
439
461
|
- from: "audio_extractor"
|
|
440
462
|
to: "docx_extractor"
|
|
441
|
-
queue_size:
|
|
463
|
+
queue_size: 4
|
|
442
464
|
- from: "docx_extractor"
|
|
443
465
|
to: "pptx_extractor"
|
|
444
|
-
queue_size:
|
|
466
|
+
queue_size: 4
|
|
445
467
|
- from: "pptx_extractor"
|
|
446
468
|
to: "image_extractor"
|
|
447
|
-
queue_size:
|
|
469
|
+
queue_size: 4
|
|
448
470
|
- from: "image_extractor"
|
|
449
471
|
to: "html_extractor"
|
|
450
|
-
queue_size:
|
|
472
|
+
queue_size: 4
|
|
451
473
|
- from: "html_extractor"
|
|
452
474
|
to: "infographic_extractor"
|
|
453
|
-
queue_size:
|
|
475
|
+
queue_size: 4
|
|
454
476
|
|
|
455
477
|
# Primitive Extractors
|
|
456
478
|
- from: "infographic_extractor"
|
|
457
479
|
to: "table_extractor"
|
|
458
|
-
queue_size:
|
|
480
|
+
queue_size: 4
|
|
459
481
|
- from: "table_extractor"
|
|
460
482
|
to: "chart_extractor"
|
|
461
|
-
queue_size:
|
|
483
|
+
queue_size: 4
|
|
462
484
|
- from: "chart_extractor"
|
|
485
|
+
to: "ocr_extractor"
|
|
486
|
+
queue_size: 8
|
|
487
|
+
- from: "ocr_extractor"
|
|
463
488
|
to: "image_filter"
|
|
464
|
-
queue_size:
|
|
489
|
+
queue_size: 4
|
|
465
490
|
|
|
466
491
|
# Primitive Mutators
|
|
467
492
|
- from: "image_filter"
|
|
468
493
|
to: "image_dedup"
|
|
469
|
-
queue_size:
|
|
494
|
+
queue_size: 4
|
|
470
495
|
- from: "image_dedup"
|
|
471
496
|
to: "text_splitter"
|
|
472
|
-
queue_size:
|
|
497
|
+
queue_size: 4
|
|
473
498
|
|
|
474
499
|
# Primitive Transforms
|
|
475
500
|
- from: "text_splitter"
|
|
476
501
|
to: "image_caption"
|
|
477
|
-
queue_size:
|
|
502
|
+
queue_size: 4
|
|
478
503
|
- from: "image_caption"
|
|
479
504
|
to: "text_embedder"
|
|
480
|
-
queue_size:
|
|
505
|
+
queue_size: 4
|
|
481
506
|
- from: "text_embedder"
|
|
482
507
|
to: "image_storage"
|
|
483
|
-
queue_size:
|
|
508
|
+
queue_size: 4
|
|
484
509
|
|
|
485
510
|
# Primitive Storage
|
|
486
511
|
- from: "image_storage"
|
|
487
512
|
to: "embedding_storage"
|
|
488
|
-
queue_size:
|
|
513
|
+
queue_size: 4
|
|
489
514
|
- from: "embedding_storage"
|
|
490
515
|
to: "broker_response"
|
|
491
|
-
queue_size:
|
|
516
|
+
queue_size: 4
|
|
492
517
|
|
|
493
518
|
# Response and Telemetry
|
|
494
519
|
- from: "broker_response"
|
|
495
520
|
to: "otel_tracer"
|
|
496
|
-
queue_size:
|
|
521
|
+
queue_size: 4
|
|
497
522
|
- from: "otel_tracer"
|
|
498
523
|
to: "default_drain"
|
|
499
|
-
queue_size:
|
|
524
|
+
queue_size: 4
|
|
500
525
|
|
|
501
526
|
# Pipeline Runtime Configuration
|
|
502
527
|
pipeline:
|
{nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest
|
|
3
|
-
Version: 2025.
|
|
3
|
+
Version: 2025.11.19.dev20251119
|
|
4
4
|
Summary: Python module for multimodal document ingestion
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -226,7 +226,6 @@ Requires-Dist: isodate>=0.7.2
|
|
|
226
226
|
Requires-Dist: langdetect>=1.0.9
|
|
227
227
|
Requires-Dist: minio>=7.2.12
|
|
228
228
|
Requires-Dist: librosa>=0.10.2
|
|
229
|
-
Requires-Dist: openai>=1.82.0
|
|
230
229
|
Requires-Dist: opentelemetry-api>=1.27.0
|
|
231
230
|
Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
|
|
232
231
|
Requires-Dist: opentelemetry-sdk>=1.27.0
|
{nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/RECORD
RENAMED
|
@@ -7,27 +7,27 @@ nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,
|
|
|
7
7
|
nv_ingest/api/v1/health.py,sha256=pV-RoVq5y0iBPp0qZoLzd1xKpd0JiHAi0UMyMj99LqU,4740
|
|
8
8
|
nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
|
|
9
9
|
nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
|
|
10
|
-
nv_ingest/api/v2/README.md,sha256=
|
|
10
|
+
nv_ingest/api/v2/README.md,sha256=VhpdjEmCyr3qIOhwqISFx9C5WezJFcxYc-NB9S98HMg,7562
|
|
11
11
|
nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
12
|
-
nv_ingest/api/v2/ingest.py,sha256=
|
|
12
|
+
nv_ingest/api/v2/ingest.py,sha256=ikbZE2eAjSnFmt5CcpTduY1t9DsUQBhnBQlsd3HaBww,53103
|
|
13
13
|
nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
14
14
|
nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
15
15
|
nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
16
16
|
nv_ingest/framework/orchestration/execution/helpers.py,sha256=-F8SZh7ISWtzJz6X1O2LQ133t-17Jxi8lL-NHz4rwj0,2818
|
|
17
17
|
nv_ingest/framework/orchestration/execution/options.py,sha256=Ms1t4591EIv4ZrMRdhsCYPgLnMVXJosG3MURCbPXUoA,3983
|
|
18
18
|
nv_ingest/framework/orchestration/process/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
19
|
-
nv_ingest/framework/orchestration/process/dependent_services.py,sha256=
|
|
19
|
+
nv_ingest/framework/orchestration/process/dependent_services.py,sha256=s0j_rsFtCKHFIuvOkBe9NEAkPNPhSYse_ApeHka8gyg,3032
|
|
20
20
|
nv_ingest/framework/orchestration/process/execution.py,sha256=P1kzpYV23e4QYrKw9Td1TCZK3CK1ENVqqnI_axRCqBk,19814
|
|
21
21
|
nv_ingest/framework/orchestration/process/lifecycle.py,sha256=L5NDwnzSMQPGjqJDC8jC75L1YqWey-dtK8N_HgBzb0E,8001
|
|
22
|
-
nv_ingest/framework/orchestration/process/strategies.py,sha256=
|
|
23
|
-
nv_ingest/framework/orchestration/process/termination.py,sha256=
|
|
22
|
+
nv_ingest/framework/orchestration/process/strategies.py,sha256=Q1Q04PPseF775omeS0FoXfK187NiS_bbqTaaJRwzKn8,7972
|
|
23
|
+
nv_ingest/framework/orchestration/process/termination.py,sha256=PAogFeW0FATFS6Mcp_UkZgq_SbWV18RtdZN-0NbComw,5042
|
|
24
24
|
nv_ingest/framework/orchestration/ray/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
25
25
|
nv_ingest/framework/orchestration/ray/edges/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
26
26
|
nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py,sha256=PQliU_kyGbO9o42njpb8FrDMLrbLqwZzmBNXifxyG5Y,2312
|
|
27
27
|
nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py,sha256=VFii2yxJuikimOxie3edKq5JN06g78AF8bdHSHVX8p8,2677
|
|
28
28
|
nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py,sha256=N6NH4KgZJ60e_JkGRcSmfQtX37qtX4TMcavOR-n3heE,2549
|
|
29
29
|
nv_ingest/framework/orchestration/ray/examples/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
30
|
-
nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=
|
|
30
|
+
nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=Bn4rjkO14BwvvUNG_HBCSVXetYk7DKqRRsYHJADWqjc,16455
|
|
31
31
|
nv_ingest/framework/orchestration/ray/examples/task_source_harness.py,sha256=Yt7uxThg7s8WuMiaHLKC8r1XAG7QixegfkT-juE5oNw,1953
|
|
32
32
|
nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py,sha256=XkvsoIzH5ftXvAZ4ox7mxbx7ESVx6D8Xupcwbqgd52w,3277
|
|
33
33
|
nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
@@ -38,12 +38,13 @@ nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=t9lf6zTj
|
|
|
38
38
|
nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=GGY6_i6_g5xTFzdo9Qmsu9i4knMTq6pJfgm-aaPEt_o,17226
|
|
39
39
|
nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
40
40
|
nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
41
|
-
nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=
|
|
41
|
+
nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=UVp_kDmkaBlfO0Mbl_IxKq6imzLvs4-DKHgUHJIh3mo,3629
|
|
42
42
|
nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py,sha256=rfaDx6PqRCguhSYkJI6iVmMMtAlJNxzKfUrLmw_fKqs,4381
|
|
43
43
|
nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py,sha256=R4vshPcAUN2U6BIv8BCZQ862wLx8RJhCGXfpQ3K09Bs,3627
|
|
44
44
|
nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py,sha256=7JrZSVIrK4_wr2s7TOTss7pgTY2F9GPQ7Ze3F_WFlKU,3642
|
|
45
45
|
nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py,sha256=iY9fEfucfgCmO2ixX6qwn418J97nJz_FQGh7B6yziVo,3980
|
|
46
46
|
nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py,sha256=v5J7dnJBEaDfjoTz_N_yC3RAt6lwMLgLT28V-ahquLE,3261
|
|
47
|
+
nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py,sha256=pwVoA5-CF9GVWusoFZOMGBvSyW5udD9bdxVJXA_SghE,3188
|
|
47
48
|
nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=QagIA99AsHLihjRbXm-2BphdoQGHwzOHlqLyz7oDOSk,4992
|
|
48
49
|
nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py,sha256=RMbbl7Cuj4BT-TcgUx_0k8R-DLdw-o3fHxcIBIgrWt4,3776
|
|
49
50
|
nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py,sha256=p71ktv6v5T-9npYpCbgbwW6-fS-65UWS7rCm8OWr2Bc,4170
|
|
@@ -61,7 +62,7 @@ nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py,sha256=wQSlVx3T14
|
|
|
61
62
|
nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py,sha256=_USW1Vq8G2Wn-QFdPfFQCrtKG46hHeJvkEGbBxdpbVM,1488
|
|
62
63
|
nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py,sha256=QcvMQXIJ7EWIxty76Mo5Xv38Oj6X2KuS8qXQlf7E1uA,11676
|
|
63
64
|
nv_ingest/framework/orchestration/ray/stages/sources/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
64
|
-
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=
|
|
65
|
+
nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=LrqaWpWyuiAHlpXWKYSyHZJBFegGXfNlpCXrucbK5NM,24067
|
|
65
66
|
nv_ingest/framework/orchestration/ray/stages/storage/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
66
67
|
nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=WZN_-3Li-izDaPtk8IMrtn2os1ckT3U8Rb2PsfOWrcI,4009
|
|
67
68
|
nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py,sha256=EUtwhSDf-qGLVEhWEInr1VaLsvpcHUSyzCmHQVai-Ps,3547
|
|
@@ -103,22 +104,22 @@ nv_ingest/framework/util/flow_control/udf_intercept.py,sha256=zQ9uuCcHLEd0P52Eiw
|
|
|
103
104
|
nv_ingest/framework/util/service/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
104
105
|
nv_ingest/framework/util/service/impl/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
105
106
|
nv_ingest/framework/util/service/impl/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
106
|
-
nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=
|
|
107
|
+
nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=59P-BMWnFY37GJm5w23-TMxgLhiZGZpJogC0gjDBaTA,23835
|
|
107
108
|
nv_ingest/framework/util/service/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
108
109
|
nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
109
110
|
nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
|
|
110
111
|
nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
111
112
|
nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
|
|
112
113
|
nv_ingest/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
113
|
-
nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=
|
|
114
|
-
nv_ingest/pipeline/default_pipeline_impl.py,sha256=
|
|
114
|
+
nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=yNJtjfHQyxtasGa1hQrvgX7UrPa7BAd0oog8EIN8Y_w,15592
|
|
115
|
+
nv_ingest/pipeline/default_pipeline_impl.py,sha256=DhClC17lWUvtBIi2mCC4WkLWT0lxY-CFY0n6nriAxas,16017
|
|
115
116
|
nv_ingest/pipeline/ingest_pipeline.py,sha256=wHAJhqAM2s8nbY-8itVogmSU-yVN4PZONGWcKnhzgfg,17794
|
|
116
117
|
nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3ZEMkkoBcg,17940
|
|
117
118
|
nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
118
119
|
nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
|
|
119
120
|
nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
|
|
120
|
-
nv_ingest-2025.
|
|
121
|
-
nv_ingest-2025.
|
|
122
|
-
nv_ingest-2025.
|
|
123
|
-
nv_ingest-2025.
|
|
124
|
-
nv_ingest-2025.
|
|
121
|
+
nv_ingest-2025.11.19.dev20251119.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
122
|
+
nv_ingest-2025.11.19.dev20251119.dist-info/METADATA,sha256=arJTf3Axy2qKAFDlP4lsKCftTw4vnJp3EECP6hmylYU,15092
|
|
123
|
+
nv_ingest-2025.11.19.dev20251119.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
124
|
+
nv_ingest-2025.11.19.dev20251119.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
|
|
125
|
+
nv_ingest-2025.11.19.dev20251119.dist-info/RECORD,,
|
{nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|