nv-ingest 2025.6.2.dev20250602__py3-none-any.whl → 2025.6.24.dev20250625__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

@@ -155,7 +155,7 @@ if __name__ == "__main__":
155
155
  logger.info("Environment variables set.")
156
156
 
157
157
  image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
158
- image_caption_model_name = "meta/llama-3.2-11b-vision-instruct"
158
+ model_name = "meta/llama-3.2-11b-vision-instruct"
159
159
  yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
160
160
  (
161
161
  yolox_table_structure_grpc,
@@ -228,7 +228,7 @@ if __name__ == "__main__":
228
228
  image_caption_config = {
229
229
  "api_key": yolox_auth,
230
230
  "endpoint_url": image_caption_endpoint_url,
231
- "image_caption_model_name": image_caption_model_name,
231
+ "model_name": model_name,
232
232
  "prompt": "Caption the content of this image:",
233
233
  }
234
234
  logger.info("Service configuration retrieved from get_nim_service and environment variables.")
@@ -555,7 +555,7 @@ class PipelineTopology:
555
555
  return None
556
556
 
557
557
  def get_connections(self) -> Dict[str, List[Tuple[str, int]]]:
558
- """Returns a shallow copy of the connections dictionary."""
558
+ """Returns a shallow copy of the connection dictionary."""
559
559
  with self._lock:
560
560
  # Shallow copy is usually sufficient here as tuples are immutable
561
561
  return self._connections.copy()
@@ -571,7 +571,7 @@ class PipelineTopology:
571
571
  return len(self._stage_actors.get(stage_name, []))
572
572
 
573
573
  def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
574
- """Returns a shallow copy of the edge queues dictionary."""
574
+ """Returns a shallow copy of the edge queues' dictionary."""
575
575
  with self._lock:
576
576
  return self._edge_queues.copy()
577
577
 
@@ -40,7 +40,7 @@ class RayStatsCollector:
40
40
  - `get_edge_queues() -> Dict[str, Tuple[Any, int]]`
41
41
  These methods should return snapshots suitable for iteration.
42
42
  interval : float, optional
43
- The interval in seconds between stats collection attempts, by default 5.0.
43
+ The interval in seconds between stat collection attempts, by default 5.0.
44
44
  actor_timeout : float, optional
45
45
  Timeout in seconds for waiting for stats from a single actor, by default 5.0.
46
46
  queue_timeout : float, optional
@@ -2,6 +2,7 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ from datetime import datetime
5
6
  import logging
6
7
  import pandas as pd
7
8
  from typing import Any
@@ -9,8 +10,15 @@ from pydantic import BaseModel
9
10
  import ray
10
11
 
11
12
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
- from nv_ingest_api.internal.enums.common import DocumentTypeEnum, ContentTypeEnum
13
+ from nv_ingest_api.internal.enums.common import (
14
+ DocumentTypeEnum,
15
+ ContentTypeEnum,
16
+ AccessLevelEnum,
17
+ TextTypeEnum,
18
+ LanguageEnum,
19
+ )
13
20
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
21
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
14
22
  from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
15
23
  from nv_ingest_api.util.exception_handlers.decorators import (
16
24
  nv_ingest_node_failure_try_except,
@@ -61,27 +69,83 @@ class MetadataInjectionStage(RayActorStage):
61
69
  # Convert document type to content type using enums.
62
70
  content_type = doc_type_to_content_type(DocumentTypeEnum(row["document_type"]))
63
71
  # Check if metadata is missing or doesn't contain 'content'
64
- if "metadata" not in row or not isinstance(row["metadata"], dict) or "content" not in row["metadata"]:
72
+ if (
73
+ "metadata" not in row
74
+ or not isinstance(row["metadata"], dict)
75
+ or "content" not in row["metadata"].keys()
76
+ ):
65
77
  update_required = True
78
+
79
+ # Initialize default structures based on MetaDataSchema
80
+ default_source_metadata = {
81
+ "source_id": row.get("source_id"),
82
+ "source_name": row.get("source_name"),
83
+ "source_type": row["document_type"],
84
+ "source_location": "",
85
+ "collection_id": "",
86
+ "date_created": datetime.now().isoformat(),
87
+ "last_modified": datetime.now().isoformat(),
88
+ "summary": "",
89
+ "partition_id": -1,
90
+ "access_level": AccessLevelEnum.UNKNOWN.value,
91
+ }
92
+
93
+ default_content_metadata = {
94
+ "type": content_type.name.lower(),
95
+ "page_number": -1,
96
+ "description": "",
97
+ "hierarchy": ContentHierarchySchema().model_dump(),
98
+ "subtype": "",
99
+ "start_time": -1,
100
+ "end_time": -1,
101
+ }
102
+
103
+ default_audio_metadata = None
104
+ if content_type == ContentTypeEnum.AUDIO:
105
+ default_audio_metadata = {
106
+ "audio_type": row["document_type"],
107
+ "audio_transcript": "",
108
+ }
109
+
110
+ default_image_metadata = None
111
+ if content_type == ContentTypeEnum.IMAGE:
112
+ default_image_metadata = {
113
+ "image_type": row["document_type"],
114
+ "structured_image_type": ContentTypeEnum.NONE.value,
115
+ "caption": "",
116
+ "text": "",
117
+ "image_location": (0, 0, 0, 0),
118
+ "image_location_max_dimensions": (0, 0),
119
+ "uploaded_image_url": "",
120
+ "width": 0,
121
+ "height": 0,
122
+ }
123
+
124
+ default_text_metadata = None
125
+ if content_type == ContentTypeEnum.TEXT:
126
+ default_text_metadata = {
127
+ "text_type": TextTypeEnum.DOCUMENT.value,
128
+ "summary": "",
129
+ "keywords": "",
130
+ "language": LanguageEnum.UNKNOWN.value,
131
+ "text_location": (0, 0, 0, 0),
132
+ "text_location_max_dimensions": (0, 0, 0, 0),
133
+ }
134
+
66
135
  row["metadata"] = {
67
- "content": row.get("content"),
68
- "content_metadata": {
69
- "type": content_type.name.lower(),
70
- },
136
+ "content": row["content"],
137
+ "content_metadata": default_content_metadata,
71
138
  "error_metadata": None,
72
- "audio_metadata": (
73
- None if content_type != ContentTypeEnum.AUDIO else {"audio_type": row["document_type"]}
74
- ),
75
- "image_metadata": (
76
- None if content_type != ContentTypeEnum.IMAGE else {"image_type": row["document_type"]}
77
- ),
78
- "source_metadata": {
79
- "source_id": row.get("source_id"),
80
- "source_name": row.get("source_name"),
81
- "source_type": row["document_type"],
82
- },
83
- "text_metadata": (None if content_type != ContentTypeEnum.TEXT else {"text_type": "document"}),
139
+ "audio_metadata": default_audio_metadata,
140
+ "image_metadata": default_image_metadata,
141
+ "source_metadata": default_source_metadata,
142
+ "text_metadata": default_text_metadata,
84
143
  }
144
+ logger.info(
145
+ f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
146
+ f"Metadata keys: {list(row['metadata'].keys())}."
147
+ f"'content' present: {'content' in row['metadata']}"
148
+ )
85
149
  except Exception as inner_e:
86
150
  logger.exception("Failed to process row during metadata injection")
87
151
  raise inner_e
@@ -331,6 +331,10 @@ def run_pipeline(
331
331
  """
332
332
  if run_in_subprocess:
333
333
  logger.info("Launching pipeline in Python subprocess using multiprocessing.")
334
+ if (ingest_config.ngc_api_key is None or ingest_config.ngc_api_key == "") and (
335
+ ingest_config.nvidia_build_api_key is None or ingest_config.nvidia_build_api_key == ""
336
+ ):
337
+ logger.warning("NGC_API_KEY or NVIDIA_BUILD_API_KEY are not set. NIM Related functions will not work.")
334
338
 
335
339
  ctx = multiprocessing.get_context("fork")
336
340
  process = ctx.Process(
@@ -479,7 +479,7 @@ def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_capti
479
479
  **{
480
480
  "api_key": auth_token,
481
481
  "endpoint_url": endpoint_url,
482
- "image_caption_model_name": model_name,
482
+ "model_name": model_name,
483
483
  "prompt": "Caption the content of this image:",
484
484
  }
485
485
  )
nv_ingest/version.py CHANGED
@@ -5,7 +5,6 @@
5
5
 
6
6
  import datetime
7
7
  import os
8
- import re
9
8
 
10
9
 
11
10
  def get_version():
@@ -16,13 +15,6 @@ def get_version():
16
15
  if not version:
17
16
  version = f"{datetime.datetime.now().strftime('%Y.%m.%d')}"
18
17
 
19
- # We only check this for dev, we assume for release the user knows what they are doing
20
- if release_type != "release":
21
- # Ensure the version is PEP 440 compatible
22
- pep440_regex = r"^\d{4}\.\d{1,2}\.\d{1,2}$"
23
- if not re.match(pep440_regex, version):
24
- raise ValueError(f"Version '{version}' is not PEP 440 compatible")
25
-
26
18
  # Construct the final version string
27
19
  if release_type == "dev":
28
20
  # If rev is not specified and defaults to 0 lets create a more meaningful development
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.6.2.dev20250602
3
+ Version: 2025.6.24.dev20250625
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -240,13 +240,13 @@ Requires-Dist: python-docx>=1.1.2
240
240
  Requires-Dist: python-dotenv>=1.0.1
241
241
  Requires-Dist: python-pptx>=1.0.2
242
242
  Requires-Dist: prometheus-client
243
- Requires-Dist: torch==2.4.1
243
+ Requires-Dist: torch>=2.4.1
244
244
  Requires-Dist: ray[all]>=2.37.0
245
245
  Requires-Dist: redis>=5.2.1
246
246
  Requires-Dist: requests>=2.28.2
247
247
  Requires-Dist: scikit-learn>=1.6.0
248
248
  Requires-Dist: scipy>=1.15.1
249
- Requires-Dist: setuptools>=58.2.0
249
+ Requires-Dist: setuptools>=78.1.1
250
250
  Requires-Dist: tabulate>=0.9.0
251
251
  Requires-Dist: torchvision
252
252
  Requires-Dist: torchaudio
@@ -259,7 +259,7 @@ Requires-Dist: opencv-python
259
259
  Requires-Dist: pymilvus>=2.5.10
260
260
  Requires-Dist: pymilvus[bulk_writer,model]
261
261
  Requires-Dist: tritonclient
262
- Requires-Dist: nvidia-riva-client>=2.18.0
262
+ Requires-Dist: nvidia-riva-client==2.20.0
263
263
  Requires-Dist: unstructured-client
264
264
  Requires-Dist: markitdown
265
265
  Dynamic: license-file
@@ -1,5 +1,5 @@
1
1
  nv_ingest/__init__.py,sha256=vJLPeuxiIHqbxXPJSu9qe3MS-GPavbOUExyRq83DxxM,895
2
- nv_ingest/version.py,sha256=Y9gMjlV_tnRSE3JbmS1rWIfVppM974_g0k30MRF3IQM,1352
2
+ nv_ingest/version.py,sha256=MG7DxlzpnoJI56vqxwzs9WeMAEI3uPhfDiNLs6GN6wI,986
3
3
  nv_ingest/api/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
4
4
  nv_ingest/api/main.py,sha256=XE-p4lJp1E7CCDOB8ENtYFrf63Dtq2bzQiGxpRfL2LA,1603
5
5
  nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -14,15 +14,15 @@ nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py,sha256=PQliU_kyG
14
14
  nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py,sha256=VFii2yxJuikimOxie3edKq5JN06g78AF8bdHSHVX8p8,2677
15
15
  nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py,sha256=N6NH4KgZJ60e_JkGRcSmfQtX37qtX4TMcavOR-n3heE,2549
16
16
  nv_ingest/framework/orchestration/ray/examples/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
17
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=QHhoHjELAjs1SNGUOcUBU7f9JXxq3xTwM6fHyzcQytg,16450
17
+ nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py,sha256=QOwnxSCuhmkk-Ak-k5ILkSIqDdk5umJJy4rQloIHNMM,16408
18
18
  nv_ingest/framework/orchestration/ray/examples/task_source_harness.py,sha256=Yt7uxThg7s8WuMiaHLKC8r1XAG7QixegfkT-juE5oNw,1953
19
19
  nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py,sha256=XkvsoIzH5ftXvAZ4ox7mxbx7ESVx6D8Xupcwbqgd52w,3277
20
20
  nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
21
  nv_ingest/framework/orchestration/ray/primitives/dataclasses.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py,sha256=L8ENPiF-lxqhIXVEQwQD5CCqQMb710ynj5D_Y4ixGhs,11077
23
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=gc9gZNqPmnP76M-u8sQXyJd5aTSlyY_0CjLYNa-zvzk,29106
23
+ nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=2Xg7QoKKPPFUWkLck7NtEtb1xLnK3b5uUw8LRxPhLyw,29106
24
24
  nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=BEBLjkYFXIH396EUQcfuxhrWlIMs9i6z7YfeeqJ5cZg,59579
25
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=yPIvOhxY42P-gf5dLkcPkfvfwL_I-ay0C8k5eNaU-VA,15811
25
+ nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=AJ79OTh_NxxoTcyBNiopq3K_nLumsB9UU_axqQS3Gus,15810
26
26
  nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
27
27
  nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
28
28
  nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=KV4hvY0NTGG8CjZviTgcFLQzaH8WJJGkkb9PFYbROww,3417
@@ -35,7 +35,7 @@ nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=
35
35
  nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py,sha256=ywPGA-3GNsbp3FWFsu04foumM6ZCccRrm73ijS7oY0g,3581
36
36
  nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py,sha256=EOcjyJYAB3TuXewZFld4shnGQUQ9VysjPrIWnmb8zuI,3893
37
37
  nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
38
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py,sha256=cdGbLBH0x-4uCdhr6JH_dMVFyBqPODQ5_WYO1otk8tI,4147
38
+ nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py,sha256=K8jase7PD9kd8AuntzjdS1IO4ae8Oo_6byZsFG777D0,6838
39
39
  nv_ingest/framework/orchestration/ray/stages/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
40
40
  nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py,sha256=LnVqBJmpfCmcI-eJLbkwK-7SS-hpEp98P4iCRv_Zhb0,1726
41
41
  nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py,sha256=AhlZUbDK2Jckqnu8hVbJrckW8MsSixfmWc1bst9gRYk,3447
@@ -66,8 +66,8 @@ nv_ingest/framework/orchestration/ray/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EP
66
66
  nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
67
67
  nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py,sha256=AWyCFPP41vp1NOkO2urqm7vh-sTGKypJxwhdq8HxK6Q,50681
68
68
  nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py,sha256=jMYnVe_0rb1OIO9mlB4LH3uXtgaXBbUG-rDPx6fe6J8,10456
69
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=3aSYSxyunm-eKUYErDArQTHXSoNKlNJMUr9o5Ui6VTk,14037
70
- nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=_MPUbOVTo9CjkBdDA--mcpu2plQ9qFY_TCBXbfpbB_A,21477
69
+ nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=IKQHlEwe0xsjr4MgQJVL0UtnKha1qaoPFc08DF5QzMM,14351
70
+ nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=ZFJkeJNbDM_GsedUlfk2B8kI93L_MNK6gxPgeryZM6I,21463
71
71
  nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
72
72
  nv_ingest/framework/orchestration/ray/util/system_tools/memory.py,sha256=ICqY0LLB3hFTZk03iX5yffMSKFH2q_aQomtDVzS_mKw,2228
73
73
  nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py,sha256=2oHZdO_3L1LGuzpyNmZBDh19n0E-APAaHk4MEwBwSHs,12895
@@ -95,8 +95,8 @@ nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-
95
95
  nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
96
96
  nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
97
97
  nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
98
- nv_ingest-2025.6.2.dev20250602.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
99
- nv_ingest-2025.6.2.dev20250602.dist-info/METADATA,sha256=PiLNsp7gjUH7ll310hQOEQCNk6i1Bf-7D7kg38MnLps,15141
100
- nv_ingest-2025.6.2.dev20250602.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
- nv_ingest-2025.6.2.dev20250602.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
102
- nv_ingest-2025.6.2.dev20250602.dist-info/RECORD,,
98
+ nv_ingest-2025.6.24.dev20250625.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
99
+ nv_ingest-2025.6.24.dev20250625.dist-info/METADATA,sha256=jTy6k3k5HVq3vTjd8mr9e6xDYpTKH7Skd-SmXrCN9ws,15142
100
+ nv_ingest-2025.6.24.dev20250625.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
+ nv_ingest-2025.6.24.dev20250625.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
102
+ nv_ingest-2025.6.24.dev20250625.dist-info/RECORD,,