nv-ingest 2025.10.22.dev20251022__py3-none-any.whl → 2025.11.19.dev20251119__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,20 +19,45 @@ logger = logging.getLogger(__name__)
 
 
 def _safe_log(level: int, msg: str) -> None:
-    """Best-effort logging that won't crash during interpreter shutdown."""
+    """Best-effort logging that won't emit handler tracebacks on closed streams.
+
+    Temporarily disables logging.raiseExceptions to prevent the logging module
+    from printing "--- Logging error ---" to stderr if a handler's stream is
+    already closed (common during process teardown). Falls back to writing to
+    sys.__stderr__ if available.
+    """
     try:
-        logger.log(level, msg)
+        import logging as _logging
+
+        prev = getattr(_logging, "raiseExceptions", True)
+        # Suppress handler errors being printed to stderr
+        _logging.raiseExceptions = False
+
+        # If there are no handlers, skip and use stderr fallback
+        if logger.handlers:
+            logger.log(level, msg)
+            return
     except Exception:
+        # Intentionally ignore and try stderr fallback
+        pass
+    finally:
         try:
-            # Fallback to stderr if available
-            import sys
+            import logging as _logging  # re-import safe even if earlier failed
 
-            if hasattr(sys, "__stderr__") and sys.__stderr__:
-                sys.__stderr__.write(msg + "\n")
-                sys.__stderr__.flush()
+            _logging.raiseExceptions = prev  # type: ignore[name-defined]
         except Exception:
             pass
 
+    # Fallback to stderr if available
+    try:
+        import sys
+
+        if hasattr(sys, "__stderr__") and sys.__stderr__:
+            sys.__stderr__.write(msg + "\n")
+            sys.__stderr__.flush()
+    except Exception:
+        pass
+
 
 def kill_pipeline_process_group(process) -> None:
     """
@@ -74,7 +99,17 @@ def kill_pipeline_process_group(process) -> None:
 
     try:
         # Send graceful termination to the entire process group
-        os.killpg(os.getpgid(pid), signal.SIGTERM)
+        try:
+            pgid = os.getpgid(pid)
+        except Exception:
+            # Process already gone
+            _safe_log(logging.DEBUG, f"Process group for PID {pid} not found during SIGTERM phase")
+            return
+        try:
+            os.killpg(pgid, signal.SIGTERM)
+        except ProcessLookupError:
+            _safe_log(logging.DEBUG, f"Process group for PID {pid} no longer exists (SIGTERM)")
+            return
 
         # If we have a Process handle, give it a chance to exit cleanly
         if proc is not None and hasattr(proc, "join"):
@@ -95,7 +130,12 @@ def kill_pipeline_process_group(process) -> None:
     if still_alive:
         _safe_log(logging.WARNING, "Process group did not terminate gracefully, using SIGKILL")
         try:
-            os.killpg(os.getpgid(pid), signal.SIGKILL)
+            try:
+                pgid2 = os.getpgid(pid)
+            except Exception:
+                _safe_log(logging.DEBUG, f"Process group for PID {pid} vanished before SIGKILL")
+                return
+            os.killpg(pgid2, signal.SIGKILL)
         finally:
             if proc is not None and hasattr(proc, "join"):
                 try:
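
Both phases now look up the process group id first and tolerate ProcessLookupError, since the group can vanish between the lookup and the signal. A small, self-contained POSIX-only sketch of the same guard pattern (the child command and the grace interval are illustrative only, not the package's values):

    import os
    import signal
    import subprocess
    import time

    # Start a child in its own session so killpg() targets only its process group.
    child = subprocess.Popen(["sleep", "60"], start_new_session=True)

    try:
        pgid = os.getpgid(child.pid)
    except ProcessLookupError:
        pgid = None  # child already gone; nothing to signal

    if pgid is not None:
        try:
            os.killpg(pgid, signal.SIGTERM)  # graceful request to the whole group
        except ProcessLookupError:
            pass  # group vanished between lookup and signal
        time.sleep(1.0)
        if child.poll() is None:  # still alive, escalate
            try:
                os.killpg(pgid, signal.SIGKILL)
            except ProcessLookupError:
                pass
    child.wait()
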
@@ -152,11 +152,11 @@ if __name__ == "__main__":
     os.environ["OCR_MODEL_NAME"] = "paddle"
     os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
     os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
-    os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+    os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
     logger.info("Environment variables set.")
 
     image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
-    model_name = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+    model_name = "nvidia/nemotron-nano-12b-v2-vl"
     yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
     (
         yolox_table_structure_grpc,
@@ -5,7 +5,6 @@
 
 import logging
 from typing import Optional
-
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
         self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
-
         # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
         self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import ray
+
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
+from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from typing import Optional
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class OCRExtractorStage(RayActorStage):
+    """
+    A Ray actor stage that extracts text data from image content.
+
+    It expects an IngestControlMessage containing a DataFrame with image data. It then:
+    1. Removes the "text_data_extract" task from the message.
+    2. Calls the text extraction logic using a validated configuration.
+    3. Updates the message payload with the extracted text DataFrame.
+    """
+
+    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        try:
+            self.validated_config = config
+            self._logger.info("OCRExtractorStage configuration validated successfully.")
+        except Exception as e:
+            self._logger.exception(f"Error validating Text extractor config: {e}")
+            raise
+
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
+    @filter_by_task(required_tasks=["ocr_data_extract"])
+    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+        # Extract DataFrame payload
+        df_ledger = control_message.payload()
+        if df_ledger.empty:
+            return control_message
+
+        # Remove the "text_data_extract" task from the message
+        task_config = remove_task_by_type(control_message, "ocr_data_extract")
+
+        execution_trace_log = {}
+        new_df, extraction_info = extract_text_data_from_image_internal(
+            df_extraction_ledger=df_ledger,
+            task_config=task_config,
+            extraction_config=self.validated_config,
+            execution_trace_log=execution_trace_log,
+        )
+
+        control_message.payload(new_df)
+        control_message.set_metadata("ocr_extraction_info", extraction_info)
+
+        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
+        if do_trace_tagging and execution_trace_log:
+            parent_name = self.stage_name if self.stage_name else "ocr_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
+
+        return control_message
@@ -30,6 +30,7 @@ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import validate_inges
 from nv_ingest_api.util.message_brokers.simple_message_broker.simple_client import SimpleClient
 from nv_ingest_api.util.service_clients.redis.redis_client import RedisClient
 from nv_ingest_api.util.logging.sanitize import sanitize_for_logging
+from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler
 
 logger = logging.getLogger(__name__)
 
@@ -89,8 +90,10 @@ class MessageBrokerTaskSourceConfig(BaseModel):
 
     # Use the discriminated union for broker_client
     broker_client: Union[RedisClientConfig, SimpleClientConfig] = Field(..., discriminator="client_type")
-    task_queue: str = Field(..., description="The name of the queue to fetch tasks from.")
-    poll_interval: float = Field(default=0.1, gt=0, description="Polling interval in seconds.")
+    task_queue: str = Field(
+        ..., description="The base name of the queue to fetch tasks from. Derives sub-queues for fair scheduling."
+    )
+    poll_interval: float = Field(default=0.0, gt=0, description="Polling interval in seconds.")
 
 
 @ray.remote
@@ -134,7 +137,29 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         self._current_backoff_sleep: float = 0.0
         self._last_backoff_log_time: float = 0.0
 
-        self._logger.debug("MessageBrokerTaskSourceStage initialized. Task queue: %s", self.task_queue)
+        # Initialize QoS scheduler. Use a simple base-queue strategy for SimpleClient.
+        strategy = "simple" if isinstance(self.client, SimpleClient) else "lottery"
+        self.scheduler = QosScheduler(
+            self.task_queue,
+            num_prefetch_threads=6,  # one per category (no-op for simple strategy)
+            total_buffer_capacity=96,  # e.g., ~16 per thread
+            prefetch_poll_interval=0.002,  # faster polling for responsiveness
+            prefetch_non_immediate=True,  # enable prefetch for non-immediate categories
+            strategy=strategy,
+        )
+
+        self._logger.info(
+            "MessageBrokerTaskSourceStage initialized. Base task queue: %s | Derived queues: %s",
+            self.task_queue,
+            {
+                "immediate": f"{self.task_queue}_immediate",
+                "micro": f"{self.task_queue}_micro",
+                "small": f"{self.task_queue}_small",
+                "medium": f"{self.task_queue}_medium",
+                "large": f"{self.task_queue}_large",
+                "default": f"{self.task_queue}",
+            },
+        )
 
     # --- Private helper methods ---
     def _create_client(self):
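
The derived queue names follow directly from the base task_queue, as the log message above spells out. The scheduler's internals are not part of this diff; the sketch below only illustrates the naming convention plus one plausible weighted-lottery selection over the categories (the weights and helper names are hypothetical, not the QosScheduler implementation):

    import random

    # Hypothetical weights; the real QosScheduler policy is not shown in this diff.
    CATEGORY_WEIGHTS = {"immediate": 8, "micro": 5, "small": 4, "medium": 2, "large": 1, "default": 3}

    def derived_queue(base: str, category: str) -> str:
        # "<base>_<category>" for sized categories; the bare base queue serves "default".
        return base if category == "default" else f"{base}_{category}"

    def lottery_pick(base: str) -> str:
        categories = list(CATEGORY_WEIGHTS)
        weights = [CATEGORY_WEIGHTS[c] for c in categories]
        return derived_queue(base, random.choices(categories, weights=weights, k=1)[0])

    print(lottery_pick("ingest_task_queue"))  # e.g. "ingest_task_queue_small"
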
@@ -265,14 +290,21 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
 
         return control_message
 
-    def _fetch_message(self, timeout=100):
+    def _fetch_message(self, timeout=0):
         """
-        Fetch a message from the message broker.
+        Fetch a message from the message broker using fair scheduling across derived queues.
+        This is a non-blocking sweep across all queues for the current scheduling cycle. If no
+        message is found across any queue, return None so the caller can sleep briefly.
         """
         try:
-            job = self.client.fetch_message(self.task_queue, timeout)
+            # Use scheduler to fetch next. In simple strategy this will block up to poll_interval on base queue.
+            job = self.scheduler.fetch_next(self.client, timeout=self.config.poll_interval)
             if job is None:
-                self._logger.debug("No message received from '%s'", self.task_queue)
+                self._logger.debug(
+                    "No message received from derived queues for base "
+                    "'%s' (immediate, micro, small, medium, large, default)",
+                    self.task_queue,
+                )
                 # Do not treat normal empty polls as failures
                 self._fetch_failure_count = 0
                 self._current_backoff_sleep = 0.0
@@ -336,7 +368,8 @@ class MessageBrokerTaskSourceStage(RayActorSourceStage):
         Instead of reading from an input edge, fetch a message from the broker.
         """
         self._logger.debug("read_input: calling _fetch_message()")
-        job = self._fetch_message(timeout=100)
+        # Perform a non-blocking sweep across all queues for this cycle
+        job = self._fetch_message(timeout=0)
         if job is None:
             # Sleep for either the configured poll interval or the current backoff, whichever is larger
             sleep_time = max(self.config.poll_interval, getattr(self, "_current_backoff_sleep", 0.0))
@@ -218,12 +218,33 @@ class RedisIngestService(IngestServiceMeta):
         ttl_for_result: Optional[int] = (
             self._result_data_ttl_seconds if self._fetch_mode == FetchMode.NON_DESTRUCTIVE else None
         )
+        # Determine target queue based on optional QoS hint
+        queue_hint = None
+        try:
+            routing_opts = job_spec.get("routing_options") or {}
+            tracing_opts = job_spec.get("tracing_options") or {}
+            queue_hint = routing_opts.get("queue_hint") or tracing_opts.get("queue_hint")
+        except Exception:
+            queue_hint = None
+        allowed = {"default", "immediate", "micro", "small", "medium", "large"}
+        if isinstance(queue_hint, str) and queue_hint in allowed:
+            if queue_hint == "default":
+                channel_name = self._redis_task_queue
+            else:
+                channel_name = f"{self._redis_task_queue}_{queue_hint}"
+        else:
+            channel_name = self._redis_task_queue
+        logger.debug(
+            f"Submitting job {trace_id} to queue '{channel_name}' (hint={queue_hint}) "
+            f"with result TTL: {ttl_for_result}"
+        )
+
 
         logger.debug(
             f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
         )
         await self._run_bounded_to_thread(
             self._ingest_client.submit_message,
-            channel_name=self._redis_task_queue,
+            channel_name=channel_name,
             message=job_spec_json,
             ttl_seconds=ttl_for_result,
@@ -436,12 +457,13 @@ class RedisIngestService(IngestServiceMeta):
         metadata_key = f"parent:{parent_job_id}:metadata"
 
         try:
-            # Store subjob IDs as a set
-            await self._run_bounded_to_thread(
-                self._ingest_client.get_client().sadd,
-                parent_key,
-                *subjob_ids,
-            )
+            # Store subjob IDs as a set (only if there are subjobs)
+            if subjob_ids:
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().sadd,
+                    parent_key,
+                    *subjob_ids,
+                )
 
             # Store metadata as hash (including original subjob ordering for deterministic fetches)
             metadata_to_store = dict(metadata)
@@ -500,21 +522,21 @@ class RedisIngestService(IngestServiceMeta):
         metadata_key = f"parent:{parent_job_id}:metadata"
 
         try:
-            # Check if this is a parent job
+            # Check if this is a parent job (check metadata_key since non-split PDFs may not have parent_key)
             exists = await self._run_bounded_to_thread(
                 self._ingest_client.get_client().exists,
-                parent_key,
+                metadata_key,  # Check metadata instead of parent_key for non-split PDF support
             )
 
             if not exists:
                 return None
 
-            # Get subjob IDs
+            # Get subjob IDs (may be empty for non-split PDFs)
             subjob_ids_bytes = await self._run_bounded_to_thread(
                 self._ingest_client.get_client().smembers,
                 parent_key,
             )
-            subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes}
+            subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes} if subjob_ids_bytes else set()
 
             # Get metadata
             metadata_dict = await self._run_bounded_to_thread(
@@ -318,8 +318,8 @@ stages:
     actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
     config:
       api_key: $NGC_API_KEY|$NVIDIA_API_KEY
-      endpoint_url: $VLM_CAPTION_ENDPOINT|"https://integrate.api.nvidia.com/v1/chat/completions"
-      model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+      endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
+      model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
       prompt: "Caption the content of this image:"
     replicas:
       min_replicas: 0
@@ -192,6 +192,27 @@ stages:
         strategy: "static"
         value: 1
 
+  - name: "ocr_extractor"
+    type: "stage"
+    phase: 1 # EXTRACTION
+    actor: "nv_ingest.framework.orchestration.ray.stages.extractors.ocr_extractor:OCRExtractorStage"
+    config:
+      endpoint_config:
+        ocr_endpoints: [
+          $OCR_GRPC_ENDPOINT|"ocr:8001",
+          $OCR_HTTP_ENDPOINT|"http://ocr:8000/v1/infer",
+        ]
+        ocr_infer_protocol: $OCR_INFER_PROTOCOL|grpc
+        auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
+    replicas:
+      min_replicas: 0
+      max_replicas:
+        strategy: "static"
+        value: 4
+      static_replicas:
+        strategy: "static"
+        value: 3
+
   - name: "infographic_extractor"
     type: "stage"
     phase: 1 # EXTRACTION
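
The new ocr_extractor stage reads its endpoints from the environment, falling back to the defaults shown above. One way to point it at a different OCR service before the pipeline loads this config, following the same os.environ pattern used in the test harness earlier in this diff (the values here are placeholders):

    import os

    os.environ["OCR_GRPC_ENDPOINT"] = "my-ocr-host:8001"
    os.environ["OCR_HTTP_ENDPOINT"] = "http://my-ocr-host:8000/v1/infer"
    os.environ["OCR_INFER_PROTOCOL"] = "http"   # or "grpc", the configured default
    os.environ["NGC_API_KEY"] = "<api-key>"     # forwarded to the stage as auth_token
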
@@ -317,7 +338,8 @@ stages:
     actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
     config:
       api_key: $NGC_API_KEY|$NVIDIA_API_KEY
-      model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+      model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
+      endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
       prompt: "Caption the content of this image:"
     replicas:
       min_replicas: 0
@@ -427,76 +449,79 @@ edges:
   # Intake
   - from: "source_stage"
     to: "metadata_injector"
-    queue_size: 32
+    queue_size: 4
 
   # Document Extractors
   - from: "metadata_injector"
     to: "pdf_extractor"
-    queue_size: 32
+    queue_size: 8
   - from: "pdf_extractor"
     to: "audio_extractor"
-    queue_size: 32
+    queue_size: 4
   - from: "audio_extractor"
     to: "docx_extractor"
-    queue_size: 32
+    queue_size: 4
   - from: "docx_extractor"
     to: "pptx_extractor"
-    queue_size: 32
+    queue_size: 4
   - from: "pptx_extractor"
     to: "image_extractor"
-    queue_size: 32
+    queue_size: 4
   - from: "image_extractor"
     to: "html_extractor"
-    queue_size: 32
+    queue_size: 4
   - from: "html_extractor"
     to: "infographic_extractor"
-    queue_size: 32
+    queue_size: 4
 
   # Primitive Extractors
   - from: "infographic_extractor"
     to: "table_extractor"
-    queue_size: 32
+    queue_size: 4
   - from: "table_extractor"
     to: "chart_extractor"
-    queue_size: 32
+    queue_size: 4
   - from: "chart_extractor"
+    to: "ocr_extractor"
+    queue_size: 8
+  - from: "ocr_extractor"
     to: "image_filter"
-    queue_size: 32
+    queue_size: 4
 
   # Primitive Mutators
   - from: "image_filter"
     to: "image_dedup"
-    queue_size: 32
+    queue_size: 4
   - from: "image_dedup"
     to: "text_splitter"
-    queue_size: 32
+    queue_size: 4
 
   # Primitive Transforms
   - from: "text_splitter"
     to: "image_caption"
-    queue_size: 32
+    queue_size: 4
   - from: "image_caption"
     to: "text_embedder"
-    queue_size: 32
+    queue_size: 4
   - from: "text_embedder"
     to: "image_storage"
-    queue_size: 32
+    queue_size: 4
 
   # Primitive Storage
   - from: "image_storage"
     to: "embedding_storage"
-    queue_size: 32
+    queue_size: 4
   - from: "embedding_storage"
     to: "broker_response"
-    queue_size: 32
+    queue_size: 4
 
   # Response and Telemetry
   - from: "broker_response"
     to: "otel_tracer"
-    queue_size: 32
+    queue_size: 4
   - from: "otel_tracer"
     to: "default_drain"
-    queue_size: 32
+    queue_size: 4
 
 # Pipeline Runtime Configuration
 pipeline:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.10.22.dev20251022
+Version: 2025.11.19.dev20251119
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -226,7 +226,6 @@ Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
 Requires-Dist: librosa>=0.10.2
-Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -7,27 +7,27 @@ nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,
 nv_ingest/api/v1/health.py,sha256=pV-RoVq5y0iBPp0qZoLzd1xKpd0JiHAi0UMyMj99LqU,4740
 nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
 nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
-nv_ingest/api/v2/README.md,sha256=tbQOcD_67YWedboAcDRlZJgjvVZZTW1-ZodcqP0iynk,7133
+nv_ingest/api/v2/README.md,sha256=VhpdjEmCyr3qIOhwqISFx9C5WezJFcxYc-NB9S98HMg,7562
 nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/api/v2/ingest.py,sha256=v5l1c1BdmgyPqMzRj8CezI3dR6HpKOuevfomT1v4RGc,37313
+nv_ingest/api/v2/ingest.py,sha256=ikbZE2eAjSnFmt5CcpTduY1t9DsUQBhnBQlsd3HaBww,53103
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/helpers.py,sha256=-F8SZh7ISWtzJz6X1O2LQ133t-17Jxi8lL-NHz4rwj0,2818
 nv_ingest/framework/orchestration/execution/options.py,sha256=Ms1t4591EIv4ZrMRdhsCYPgLnMVXJosG3MURCbPXUoA,3983
 nv_ingest/framework/orchestration/process/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/process/dependent_services.py,sha256=ERf2M4O6pvbLDFrvayBHHL7M-FIwECeDEDTY3bi7MBg,2940
+nv_ingest/framework/orchestration/process/dependent_services.py,sha256=s0j_rsFtCKHFIuvOkBe9NEAkPNPhSYse_ApeHka8gyg,3032
 nv_ingest/framework/orchestration/process/execution.py,sha256=P1kzpYV23e4QYrKw9Td1TCZK3CK1ENVqqnI_axRCqBk,19814
 nv_ingest/framework/orchestration/process/lifecycle.py,sha256=L5NDwnzSMQPGjqJDC8jC75L1YqWey-dtK8N_HgBzb0E,8001
-nv_ingest/framework/orchestration/process/strategies.py,sha256=D7fdTPA7uuteoj6McA6hm1J5ArqoDdSZ7W6_ONDX7N0,7845
-nv_ingest/framework/orchestration/process/termination.py,sha256=_aI2ZzCasGfqwu0fcvufOlr1BGAay_Noxq5pAu67gv4,3593
+nv_ingest/framework/orchestration/process/strategies.py,sha256=Q1Q04PPseF775omeS0FoXfK187NiS_bbqTaaJRwzKn8,7972
+nv_ingest/framework/orchestration/process/termination.py,sha256=PAogFeW0FATFS6Mcp_UkZgq_SbWV18RtdZN-0NbComw,5042
 nv_ingest/framework/orchestration/ray/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/edges/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py,sha256=PQliU_kyGbO9o42njpb8FrDMLrbLqwZzmBNXifxyG5Y,2312
 nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py,sha256=VFii2yxJuikimOxie3edKq5JN06g78AF8bdHSHVX8p8,2677
 nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py,sha256=N6NH4KgZJ60e_JkGRcSmfQtX37qtX4TMcavOR-n3heE,2549
 nv_ingest/framework/orchestration/ray/examples/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=hnRLybIpVTj3mXkLW0ErWVn4vRsInjNZmA80JqDiQuw,16473
+nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=Bn4rjkO14BwvvUNG_HBCSVXetYk7DKqRRsYHJADWqjc,16455
 nv_ingest/framework/orchestration/ray/examples/task_source_harness.py,sha256=Yt7uxThg7s8WuMiaHLKC8r1XAG7QixegfkT-juE5oNw,1953
 nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py,sha256=XkvsoIzH5ftXvAZ4ox7mxbx7ESVx6D8Xupcwbqgd52w,3277
 nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -38,12 +38,13 @@ nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=t9lf6zTj
 nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=GGY6_i6_g5xTFzdo9Qmsu9i4knMTq6pJfgm-aaPEt_o,17226
 nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=4SdgvzI9oJ_OK5oWGir9wXVIPV4Pont2EKv9mwcWMC0,3631
+nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=UVp_kDmkaBlfO0Mbl_IxKq6imzLvs4-DKHgUHJIh3mo,3629
 nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py,sha256=rfaDx6PqRCguhSYkJI6iVmMMtAlJNxzKfUrLmw_fKqs,4381
 nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py,sha256=R4vshPcAUN2U6BIv8BCZQ862wLx8RJhCGXfpQ3K09Bs,3627
 nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py,sha256=7JrZSVIrK4_wr2s7TOTss7pgTY2F9GPQ7Ze3F_WFlKU,3642
 nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py,sha256=iY9fEfucfgCmO2ixX6qwn418J97nJz_FQGh7B6yziVo,3980
 nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py,sha256=v5J7dnJBEaDfjoTz_N_yC3RAt6lwMLgLT28V-ahquLE,3261
+nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py,sha256=pwVoA5-CF9GVWusoFZOMGBvSyW5udD9bdxVJXA_SghE,3188
 nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=QagIA99AsHLihjRbXm-2BphdoQGHwzOHlqLyz7oDOSk,4992
 nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py,sha256=RMbbl7Cuj4BT-TcgUx_0k8R-DLdw-o3fHxcIBIgrWt4,3776
 nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py,sha256=p71ktv6v5T-9npYpCbgbwW6-fS-65UWS7rCm8OWr2Bc,4170
@@ -61,7 +62,7 @@ nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py,sha256=wQSlVx3T14
 nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py,sha256=_USW1Vq8G2Wn-QFdPfFQCrtKG46hHeJvkEGbBxdpbVM,1488
 nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py,sha256=QcvMQXIJ7EWIxty76Mo5Xv38Oj6X2KuS8qXQlf7E1uA,11676
 nv_ingest/framework/orchestration/ray/stages/sources/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=Qm9XtTNX2CcUAlZRw33BS3Ql0djcsMGp52FPA2zHu3Q,22340
+nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=LrqaWpWyuiAHlpXWKYSyHZJBFegGXfNlpCXrucbK5NM,24067
 nv_ingest/framework/orchestration/ray/stages/storage/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=WZN_-3Li-izDaPtk8IMrtn2os1ckT3U8Rb2PsfOWrcI,4009
 nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py,sha256=EUtwhSDf-qGLVEhWEInr1VaLsvpcHUSyzCmHQVai-Ps,3547
@@ -103,22 +104,22 @@ nv_ingest/framework/util/flow_control/udf_intercept.py,sha256=zQ9uuCcHLEd0P52Eiw
 nv_ingest/framework/util/service/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=OuGC3FFhkLQLR3x4s-tyxGguYYn8ORKr2xkzMy2br0g,22552
+nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=59P-BMWnFY37GJm5w23-TMxgLhiZGZpJogC0gjDBaTA,23835
 nv_ingest/framework/util/service/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
 nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
 nv_ingest/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=MiyKe8RS18PNYwEVvrASiHFpynR_BavOe0hhVnUdbEc,15618
-nv_ingest/pipeline/default_pipeline_impl.py,sha256=irVm_wmJW5a7a3xTJd18AFZfwLheERkhCty-0XZrIMY,15288
+nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=yNJtjfHQyxtasGa1hQrvgX7UrPa7BAd0oog8EIN8Y_w,15592
+nv_ingest/pipeline/default_pipeline_impl.py,sha256=DhClC17lWUvtBIi2mCC4WkLWT0lxY-CFY0n6nriAxas,16017
 nv_ingest/pipeline/ingest_pipeline.py,sha256=wHAJhqAM2s8nbY-8itVogmSU-yVN4PZONGWcKnhzgfg,17794
 nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3ZEMkkoBcg,17940
 nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
 nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
-nv_ingest-2025.10.22.dev20251022.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.10.22.dev20251022.dist-info/METADATA,sha256=fBAiUkJijOoKO-QsdNYEpDF9X1ovQ2BBSBBhLP-Yykw,15122
-nv_ingest-2025.10.22.dev20251022.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest-2025.10.22.dev20251022.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.10.22.dev20251022.dist-info/RECORD,,
+nv_ingest-2025.11.19.dev20251119.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.11.19.dev20251119.dist-info/METADATA,sha256=arJTf3Axy2qKAFDlP4lsKCftTw4vnJp3EECP6hmylYU,15092
+nv_ingest-2025.11.19.dev20251119.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.11.19.dev20251119.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.11.19.dev20251119.dist-info/RECORD,,