nv-ingest 25.6.2__tar.gz → 25.6.26.dev20250626__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (108):
  1. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/PKG-INFO +1 -2
  2. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/api/main.py +1 -1
  3. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +2 -2
  4. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +1 -1
  5. nv_ingest-25.6.26.dev20250626/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +161 -0
  6. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/version.py +0 -8
  7. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest.egg-info/PKG-INFO +1 -2
  8. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest.egg-info/requires.txt +0 -1
  9. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/pyproject.toml +0 -1
  10. nv_ingest-25.6.2/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -97
  11. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/LICENSE +0 -0
  12. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/MANIFEST.in +0 -0
  13. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/__init__.py +0 -0
  14. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/api/__init__.py +0 -0
  15. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/api/v1/__init__.py +0 -0
  16. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/api/v1/health.py +0 -0
  17. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/api/v1/ingest.py +0 -0
  18. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/api/v1/metrics.py +0 -0
  19. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/__init__.py +0 -0
  20. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/__init__.py +0 -0
  21. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  22. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  23. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  24. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  25. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  26. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  27. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
  28. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  29. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  30. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  31. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  32. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  33. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  34. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  35. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  36. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  37. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  38. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  39. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  40. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  41. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  42. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  43. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  44. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  45. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  46. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  47. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  48. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  49. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  50. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  51. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  52. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  53. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  54. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  55. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  56. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  57. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  58. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  59. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  60. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
  61. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  62. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  63. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  64. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  65. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  66. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  67. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  68. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  69. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  70. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  71. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  72. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  73. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  74. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  75. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -0
  76. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
  77. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -0
  78. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  79. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  80. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  81. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/__init__.py +0 -0
  82. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  83. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  84. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  85. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  86. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  87. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  88. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  89. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  90. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  91. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  92. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  93. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/__init__.py +0 -0
  94. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  95. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  96. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/service/__init__.py +0 -0
  97. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  98. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  99. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  100. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  101. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  102. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  103. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  104. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  105. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest.egg-info/SOURCES.txt +0 -0
  106. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest.egg-info/dependency_links.txt +0 -0
  107. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/nv_ingest.egg-info/top_level.txt +0 -0
  108. {nv_ingest-25.6.2 → nv_ingest-25.6.26.dev20250626}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 25.6.2
3
+ Version: 25.6.26.dev20250626
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -229,7 +229,6 @@ Requires-Dist: openai>=1.82.0
229
229
  Requires-Dist: opentelemetry-api>=1.27.0
230
230
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
231
  Requires-Dist: opentelemetry-sdk>=1.27.0
232
- Requires-Dist: nv-ingest-api==25.6.2
233
232
  Requires-Dist: pydantic>2.0.0
234
233
  Requires-Dist: pydantic-settings>2.0.0
235
234
  Requires-Dist: pypdfium2==4.30.1
@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
22
22
  app = FastAPI(
23
23
  title="NV-Ingest Microservice",
24
24
  description="Service for ingesting heterogenous datatypes",
25
- version="25.6.2",
25
+ version="25.4.2",
26
26
  contact={
27
27
  "name": "NVIDIA Corporation",
28
28
  "url": "https://nvidia.com",
@@ -555,7 +555,7 @@ class PipelineTopology:
555
555
  return None
556
556
 
557
557
  def get_connections(self) -> Dict[str, List[Tuple[str, int]]]:
558
- """Returns a shallow copy of the connections dictionary."""
558
+ """Returns a shallow copy of the connection dictionary."""
559
559
  with self._lock:
560
560
  # Shallow copy is usually sufficient here as tuples are immutable
561
561
  return self._connections.copy()
@@ -571,7 +571,7 @@ class PipelineTopology:
571
571
  return len(self._stage_actors.get(stage_name, []))
572
572
 
573
573
  def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
574
- """Returns a shallow copy of the edge queues dictionary."""
574
+ """Returns a shallow copy of the edge queues' dictionary."""
575
575
  with self._lock:
576
576
  return self._edge_queues.copy()
577
577
 
@@ -40,7 +40,7 @@ class RayStatsCollector:
40
40
  - `get_edge_queues() -> Dict[str, Tuple[Any, int]]`
41
41
  These methods should return snapshots suitable for iteration.
42
42
  interval : float, optional
43
- The interval in seconds between stats collection attempts, by default 5.0.
43
+ The interval in seconds between stat collection attempts, by default 5.0.
44
44
  actor_timeout : float, optional
45
45
  Timeout in seconds for waiting for stats from a single actor, by default 5.0.
46
46
  queue_timeout : float, optional
@@ -0,0 +1,161 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from datetime import datetime
6
+ import logging
7
+ import pandas as pd
8
+ from typing import Any
9
+ from pydantic import BaseModel
10
+ import ray
11
+
12
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
13
+ from nv_ingest_api.internal.enums.common import (
14
+ DocumentTypeEnum,
15
+ ContentTypeEnum,
16
+ AccessLevelEnum,
17
+ TextTypeEnum,
18
+ LanguageEnum,
19
+ )
20
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
21
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
22
+ from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
23
+ from nv_ingest_api.util.exception_handlers.decorators import (
24
+ nv_ingest_node_failure_try_except,
25
+ )
26
+
27
+ # logging.basicConfig(level=logging.DEBUG)
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @ray.remote
32
+ class MetadataInjectionStage(RayActorStage):
33
+ """
34
+ A Ray actor stage that performs metadata injection on IngestControlMessages.
35
+
36
+ This stage iterates over the rows of the DataFrame payload, checks if metadata
37
+ injection is required, and if so, injects the appropriate metadata.
38
+ """
39
+
40
+ def __init__(self, config: BaseModel) -> None:
41
+ # Call the base initializer to set attributes like self._running.
42
+ super().__init__(config)
43
+ # Additional initialization can be added here if necessary.
44
+ logger.info("MetadataInjectionStage initialized with config: %s", config)
45
+
46
+ @traceable("metadata_injector")
47
+ @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
48
+ def on_data(self, message: Any) -> Any:
49
+ """
50
+ Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
51
+
52
+ Parameters
53
+ ----------
54
+ message : IngestControlMessage
55
+ The incoming message containing the payload DataFrame.
56
+
57
+ Returns
58
+ -------
59
+ IngestControlMessage
60
+ The message with updated metadata if injection was required.
61
+ """
62
+ df = message.payload()
63
+ update_required = False
64
+ rows = []
65
+ logger.info("Starting metadata injection on DataFrame with %d rows", len(df))
66
+
67
+ for _, row in df.iterrows():
68
+ try:
69
+ # Convert document type to content type using enums.
70
+ content_type = doc_type_to_content_type(DocumentTypeEnum(row["document_type"]))
71
+ # Check if metadata is missing or doesn't contain 'content'
72
+ if (
73
+ "metadata" not in row
74
+ or not isinstance(row["metadata"], dict)
75
+ or "content" not in row["metadata"].keys()
76
+ ):
77
+ update_required = True
78
+
79
+ # Initialize default structures based on MetaDataSchema
80
+ default_source_metadata = {
81
+ "source_id": row.get("source_id"),
82
+ "source_name": row.get("source_name"),
83
+ "source_type": row["document_type"],
84
+ "source_location": "",
85
+ "collection_id": "",
86
+ "date_created": datetime.now().isoformat(),
87
+ "last_modified": datetime.now().isoformat(),
88
+ "summary": "",
89
+ "partition_id": -1,
90
+ "access_level": AccessLevelEnum.UNKNOWN.value,
91
+ }
92
+
93
+ default_content_metadata = {
94
+ "type": content_type.name.lower(),
95
+ "page_number": -1,
96
+ "description": "",
97
+ "hierarchy": ContentHierarchySchema().model_dump(),
98
+ "subtype": "",
99
+ "start_time": -1,
100
+ "end_time": -1,
101
+ }
102
+
103
+ default_audio_metadata = None
104
+ if content_type == ContentTypeEnum.AUDIO:
105
+ default_audio_metadata = {
106
+ "audio_type": row["document_type"],
107
+ "audio_transcript": "",
108
+ }
109
+
110
+ default_image_metadata = None
111
+ if content_type == ContentTypeEnum.IMAGE:
112
+ default_image_metadata = {
113
+ "image_type": row["document_type"],
114
+ "structured_image_type": ContentTypeEnum.NONE.value,
115
+ "caption": "",
116
+ "text": "",
117
+ "image_location": (0, 0, 0, 0),
118
+ "image_location_max_dimensions": (0, 0),
119
+ "uploaded_image_url": "",
120
+ "width": 0,
121
+ "height": 0,
122
+ }
123
+
124
+ default_text_metadata = None
125
+ if content_type == ContentTypeEnum.TEXT:
126
+ default_text_metadata = {
127
+ "text_type": TextTypeEnum.DOCUMENT.value,
128
+ "summary": "",
129
+ "keywords": "",
130
+ "language": LanguageEnum.UNKNOWN.value,
131
+ "text_location": (0, 0, 0, 0),
132
+ "text_location_max_dimensions": (0, 0, 0, 0),
133
+ }
134
+
135
+ row["metadata"] = {
136
+ "content": row["content"],
137
+ "content_metadata": default_content_metadata,
138
+ "error_metadata": None,
139
+ "audio_metadata": default_audio_metadata,
140
+ "image_metadata": default_image_metadata,
141
+ "source_metadata": default_source_metadata,
142
+ "text_metadata": default_text_metadata,
143
+ }
144
+ logger.info(
145
+ f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
146
+ f"Metadata keys: {list(row['metadata'].keys())}."
147
+ f"'content' present: {'content' in row['metadata']}"
148
+ )
149
+ except Exception as inner_e:
150
+ logger.exception("Failed to process row during metadata injection")
151
+ raise inner_e
152
+ rows.append(row)
153
+
154
+ if update_required:
155
+ docs = pd.DataFrame(rows)
156
+ message.payload(docs)
157
+ logger.info("Metadata injection updated payload with %d rows", len(docs))
158
+ else:
159
+ logger.info("No metadata update was necessary during metadata injection")
160
+
161
+ return message
@@ -5,7 +5,6 @@
5
5
 
6
6
  import datetime
7
7
  import os
8
- import re
9
8
 
10
9
 
11
10
  def get_version():
@@ -16,13 +15,6 @@ def get_version():
16
15
  if not version:
17
16
  version = f"{datetime.datetime.now().strftime('%Y.%m.%d')}"
18
17
 
19
- # We only check this for dev, we assume for release the user knows what they are doing
20
- if release_type != "release":
21
- # Ensure the version is PEP 440 compatible
22
- pep440_regex = r"^\d{4}\.\d{1,2}\.\d{1,2}$"
23
- if not re.match(pep440_regex, version):
24
- raise ValueError(f"Version '{version}' is not PEP 440 compatible")
25
-
26
18
  # Construct the final version string
27
19
  if release_type == "dev":
28
20
  # If rev is not specified and defaults to 0 lets create a more meaningful development
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 25.6.2
3
+ Version: 25.6.26.dev20250626
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -229,7 +229,6 @@ Requires-Dist: openai>=1.82.0
229
229
  Requires-Dist: opentelemetry-api>=1.27.0
230
230
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
231
  Requires-Dist: opentelemetry-sdk>=1.27.0
232
- Requires-Dist: nv-ingest-api==25.6.2
233
232
  Requires-Dist: pydantic>2.0.0
234
233
  Requires-Dist: pydantic-settings>2.0.0
235
234
  Requires-Dist: pypdfium2==4.30.1
@@ -14,7 +14,6 @@ openai>=1.82.0
14
14
  opentelemetry-api>=1.27.0
15
15
  opentelemetry-exporter-otlp>=1.27.0
16
16
  opentelemetry-sdk>=1.27.0
17
- nv-ingest-api==25.6.2
18
17
  pydantic>2.0.0
19
18
  pydantic-settings>2.0.0
20
19
  pypdfium2==4.30.1
@@ -33,7 +33,6 @@ dependencies = [
33
33
  "opentelemetry-api>=1.27.0",
34
34
  "opentelemetry-exporter-otlp>=1.27.0",
35
35
  "opentelemetry-sdk>=1.27.0",
36
- "nv-ingest-api==25.6.2",
37
36
  "pydantic>2.0.0",
38
37
  "pydantic-settings>2.0.0",
39
38
  "pypdfium2==4.30.1",
@@ -1,97 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- import pandas as pd
7
- from typing import Any
8
- from pydantic import BaseModel
9
- import ray
10
-
11
- from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
- from nv_ingest_api.internal.enums.common import DocumentTypeEnum, ContentTypeEnum
13
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
- from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
15
- from nv_ingest_api.util.exception_handlers.decorators import (
16
- nv_ingest_node_failure_try_except,
17
- )
18
-
19
- # logging.basicConfig(level=logging.DEBUG)
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- @ray.remote
24
- class MetadataInjectionStage(RayActorStage):
25
- """
26
- A Ray actor stage that performs metadata injection on IngestControlMessages.
27
-
28
- This stage iterates over the rows of the DataFrame payload, checks if metadata
29
- injection is required, and if so, injects the appropriate metadata.
30
- """
31
-
32
- def __init__(self, config: BaseModel) -> None:
33
- # Call the base initializer to set attributes like self._running.
34
- super().__init__(config)
35
- # Additional initialization can be added here if necessary.
36
- logger.info("MetadataInjectionStage initialized with config: %s", config)
37
-
38
- @traceable("metadata_injector")
39
- @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
40
- def on_data(self, message: Any) -> Any:
41
- """
42
- Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
43
-
44
- Parameters
45
- ----------
46
- message : IngestControlMessage
47
- The incoming message containing the payload DataFrame.
48
-
49
- Returns
50
- -------
51
- IngestControlMessage
52
- The message with updated metadata if injection was required.
53
- """
54
- df = message.payload()
55
- update_required = False
56
- rows = []
57
- logger.info("Starting metadata injection on DataFrame with %d rows", len(df))
58
-
59
- for _, row in df.iterrows():
60
- try:
61
- # Convert document type to content type using enums.
62
- content_type = doc_type_to_content_type(DocumentTypeEnum(row["document_type"]))
63
- # Check if metadata is missing or doesn't contain 'content'
64
- if "metadata" not in row or not isinstance(row["metadata"], dict) or "content" not in row["metadata"]:
65
- update_required = True
66
- row["metadata"] = {
67
- "content": row.get("content"),
68
- "content_metadata": {
69
- "type": content_type.name.lower(),
70
- },
71
- "error_metadata": None,
72
- "audio_metadata": (
73
- None if content_type != ContentTypeEnum.AUDIO else {"audio_type": row["document_type"]}
74
- ),
75
- "image_metadata": (
76
- None if content_type != ContentTypeEnum.IMAGE else {"image_type": row["document_type"]}
77
- ),
78
- "source_metadata": {
79
- "source_id": row.get("source_id"),
80
- "source_name": row.get("source_name"),
81
- "source_type": row["document_type"],
82
- },
83
- "text_metadata": (None if content_type != ContentTypeEnum.TEXT else {"text_type": "document"}),
84
- }
85
- except Exception as inner_e:
86
- logger.exception("Failed to process row during metadata injection")
87
- raise inner_e
88
- rows.append(row)
89
-
90
- if update_required:
91
- docs = pd.DataFrame(rows)
92
- message.payload(docs)
93
- logger.info("Metadata injection updated payload with %d rows", len(docs))
94
- else:
95
- logger.info("No metadata update was necessary during metadata injection")
96
-
97
- return message