nv-ingest 25.6.1__py3-none-any.whl → 25.6.25.dev20250625__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

nv_ingest/api/main.py CHANGED
@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
22
22
  app = FastAPI(
23
23
  title="NV-Ingest Microservice",
24
24
  description="Service for ingesting heterogenous datatypes",
25
- version="25.6.1",
25
+ version="25.4.2",
26
26
  contact={
27
27
  "name": "NVIDIA Corporation",
28
28
  "url": "https://nvidia.com",
@@ -555,7 +555,7 @@ class PipelineTopology:
555
555
  return None
556
556
 
557
557
  def get_connections(self) -> Dict[str, List[Tuple[str, int]]]:
558
- """Returns a shallow copy of the connections dictionary."""
558
+ """Returns a shallow copy of the connection dictionary."""
559
559
  with self._lock:
560
560
  # Shallow copy is usually sufficient here as tuples are immutable
561
561
  return self._connections.copy()
@@ -571,7 +571,7 @@ class PipelineTopology:
571
571
  return len(self._stage_actors.get(stage_name, []))
572
572
 
573
573
  def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
574
- """Returns a shallow copy of the edge queues dictionary."""
574
+ """Returns a shallow copy of the edge queues' dictionary."""
575
575
  with self._lock:
576
576
  return self._edge_queues.copy()
577
577
 
@@ -40,7 +40,7 @@ class RayStatsCollector:
40
40
  - `get_edge_queues() -> Dict[str, Tuple[Any, int]]`
41
41
  These methods should return snapshots suitable for iteration.
42
42
  interval : float, optional
43
- The interval in seconds between stats collection attempts, by default 5.0.
43
+ The interval in seconds between stat collection attempts, by default 5.0.
44
44
  actor_timeout : float, optional
45
45
  Timeout in seconds for waiting for stats from a single actor, by default 5.0.
46
46
  queue_timeout : float, optional
@@ -2,6 +2,7 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ from datetime import datetime
5
6
  import logging
6
7
  import pandas as pd
7
8
  from typing import Any
@@ -9,8 +10,15 @@ from pydantic import BaseModel
9
10
  import ray
10
11
 
11
12
  from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
- from nv_ingest_api.internal.enums.common import DocumentTypeEnum, ContentTypeEnum
13
+ from nv_ingest_api.internal.enums.common import (
14
+ DocumentTypeEnum,
15
+ ContentTypeEnum,
16
+ AccessLevelEnum,
17
+ TextTypeEnum,
18
+ LanguageEnum,
19
+ )
13
20
  from nv_ingest_api.internal.primitives.tracing.tagging import traceable
21
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
14
22
  from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
15
23
  from nv_ingest_api.util.exception_handlers.decorators import (
16
24
  nv_ingest_node_failure_try_except,
@@ -61,27 +69,83 @@ class MetadataInjectionStage(RayActorStage):
61
69
  # Convert document type to content type using enums.
62
70
  content_type = doc_type_to_content_type(DocumentTypeEnum(row["document_type"]))
63
71
  # Check if metadata is missing or doesn't contain 'content'
64
- if "metadata" not in row or not isinstance(row["metadata"], dict) or "content" not in row["metadata"]:
72
+ if (
73
+ "metadata" not in row
74
+ or not isinstance(row["metadata"], dict)
75
+ or "content" not in row["metadata"].keys()
76
+ ):
65
77
  update_required = True
78
+
79
+ # Initialize default structures based on MetaDataSchema
80
+ default_source_metadata = {
81
+ "source_id": row.get("source_id"),
82
+ "source_name": row.get("source_name"),
83
+ "source_type": row["document_type"],
84
+ "source_location": "",
85
+ "collection_id": "",
86
+ "date_created": datetime.now().isoformat(),
87
+ "last_modified": datetime.now().isoformat(),
88
+ "summary": "",
89
+ "partition_id": -1,
90
+ "access_level": AccessLevelEnum.UNKNOWN.value,
91
+ }
92
+
93
+ default_content_metadata = {
94
+ "type": content_type.name.lower(),
95
+ "page_number": -1,
96
+ "description": "",
97
+ "hierarchy": ContentHierarchySchema().model_dump(),
98
+ "subtype": "",
99
+ "start_time": -1,
100
+ "end_time": -1,
101
+ }
102
+
103
+ default_audio_metadata = None
104
+ if content_type == ContentTypeEnum.AUDIO:
105
+ default_audio_metadata = {
106
+ "audio_type": row["document_type"],
107
+ "audio_transcript": "",
108
+ }
109
+
110
+ default_image_metadata = None
111
+ if content_type == ContentTypeEnum.IMAGE:
112
+ default_image_metadata = {
113
+ "image_type": row["document_type"],
114
+ "structured_image_type": ContentTypeEnum.NONE.value,
115
+ "caption": "",
116
+ "text": "",
117
+ "image_location": (0, 0, 0, 0),
118
+ "image_location_max_dimensions": (0, 0),
119
+ "uploaded_image_url": "",
120
+ "width": 0,
121
+ "height": 0,
122
+ }
123
+
124
+ default_text_metadata = None
125
+ if content_type == ContentTypeEnum.TEXT:
126
+ default_text_metadata = {
127
+ "text_type": TextTypeEnum.DOCUMENT.value,
128
+ "summary": "",
129
+ "keywords": "",
130
+ "language": LanguageEnum.UNKNOWN.value,
131
+ "text_location": (0, 0, 0, 0),
132
+ "text_location_max_dimensions": (0, 0, 0, 0),
133
+ }
134
+
66
135
  row["metadata"] = {
67
- "content": row.get("content"),
68
- "content_metadata": {
69
- "type": content_type.name.lower(),
70
- },
136
+ "content": row["content"],
137
+ "content_metadata": default_content_metadata,
71
138
  "error_metadata": None,
72
- "audio_metadata": (
73
- None if content_type != ContentTypeEnum.AUDIO else {"audio_type": row["document_type"]}
74
- ),
75
- "image_metadata": (
76
- None if content_type != ContentTypeEnum.IMAGE else {"image_type": row["document_type"]}
77
- ),
78
- "source_metadata": {
79
- "source_id": row.get("source_id"),
80
- "source_name": row.get("source_name"),
81
- "source_type": row["document_type"],
82
- },
83
- "text_metadata": (None if content_type != ContentTypeEnum.TEXT else {"text_type": "document"}),
139
+ "audio_metadata": default_audio_metadata,
140
+ "image_metadata": default_image_metadata,
141
+ "source_metadata": default_source_metadata,
142
+ "text_metadata": default_text_metadata,
84
143
  }
144
+ logger.info(
145
+ f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
146
+ f"Metadata keys: {list(row['metadata'].keys())}."
147
+ f"'content' present: {'content' in row['metadata']}"
148
+ )
85
149
  except Exception as inner_e:
86
150
  logger.exception("Failed to process row during metadata injection")
87
151
  raise inner_e
@@ -331,6 +331,10 @@ def run_pipeline(
331
331
  """
332
332
  if run_in_subprocess:
333
333
  logger.info("Launching pipeline in Python subprocess using multiprocessing.")
334
+ if (ingest_config.ngc_api_key is None or ingest_config.ngc_api_key == "") and (
335
+ ingest_config.nvidia_build_api_key is None or ingest_config.nvidia_build_api_key == ""
336
+ ):
337
+ logger.warning("NGC_API_KEY or NVIDIA_BUILD_API_KEY are not set. NIM Related functions will not work.")
334
338
 
335
339
  ctx = multiprocessing.get_context("fork")
336
340
  process = ctx.Process(
nv_ingest/version.py CHANGED
@@ -5,7 +5,6 @@
5
5
 
6
6
  import datetime
7
7
  import os
8
- import re
9
8
 
10
9
 
11
10
  def get_version():
@@ -16,13 +15,6 @@ def get_version():
16
15
  if not version:
17
16
  version = f"{datetime.datetime.now().strftime('%Y.%m.%d')}"
18
17
 
19
- # We only check this for dev, we assume for release the user knows what they are doing
20
- if release_type != "release":
21
- # Ensure the version is PEP 440 compatible
22
- pep440_regex = r"^\d{4}\.\d{1,2}\.\d{1,2}$"
23
- if not re.match(pep440_regex, version):
24
- raise ValueError(f"Version '{version}' is not PEP 440 compatible")
25
-
26
18
  # Construct the final version string
27
19
  if release_type == "dev":
28
20
  # If rev is not specified and defaults to 0 lets create a more meaningful development
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 25.6.1
3
+ Version: 25.6.25.dev20250625
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -229,7 +229,6 @@ Requires-Dist: openai>=1.82.0
229
229
  Requires-Dist: opentelemetry-api>=1.27.0
230
230
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
231
  Requires-Dist: opentelemetry-sdk>=1.27.0
232
- Requires-Dist: nv-ingest-api==25.6.1
233
232
  Requires-Dist: pydantic>2.0.0
234
233
  Requires-Dist: pydantic-settings>2.0.0
235
234
  Requires-Dist: pypdfium2==4.30.1
@@ -1,7 +1,7 @@
1
1
  nv_ingest/__init__.py,sha256=vJLPeuxiIHqbxXPJSu9qe3MS-GPavbOUExyRq83DxxM,895
2
- nv_ingest/version.py,sha256=Y9gMjlV_tnRSE3JbmS1rWIfVppM974_g0k30MRF3IQM,1352
2
+ nv_ingest/version.py,sha256=MG7DxlzpnoJI56vqxwzs9WeMAEI3uPhfDiNLs6GN6wI,986
3
3
  nv_ingest/api/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
4
- nv_ingest/api/main.py,sha256=P9DKJbSNWso8qrlc_QsUNapFrqU0P-mo2Z2wvZeEC6E,1603
4
+ nv_ingest/api/main.py,sha256=XE-p4lJp1E7CCDOB8ENtYFrf63Dtq2bzQiGxpRfL2LA,1603
5
5
  nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
6
6
  nv_ingest/api/v1/health.py,sha256=zqu-isMRjh4NveS4XWh5FaAZGPIlBVxpCOg3Uu8nUHQ,4746
7
7
  nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
@@ -20,9 +20,9 @@ nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py,sha25
20
20
  nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
21
21
  nv_ingest/framework/orchestration/ray/primitives/dataclasses.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py,sha256=L8ENPiF-lxqhIXVEQwQD5CCqQMb710ynj5D_Y4ixGhs,11077
23
- nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=gc9gZNqPmnP76M-u8sQXyJd5aTSlyY_0CjLYNa-zvzk,29106
23
+ nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=2Xg7QoKKPPFUWkLck7NtEtb1xLnK3b5uUw8LRxPhLyw,29106
24
24
  nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=BEBLjkYFXIH396EUQcfuxhrWlIMs9i6z7YfeeqJ5cZg,59579
25
- nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=yPIvOhxY42P-gf5dLkcPkfvfwL_I-ay0C8k5eNaU-VA,15811
25
+ nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=AJ79OTh_NxxoTcyBNiopq3K_nLumsB9UU_axqQS3Gus,15810
26
26
  nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
27
27
  nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
28
28
  nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=KV4hvY0NTGG8CjZviTgcFLQzaH8WJJGkkb9PFYbROww,3417
@@ -35,7 +35,7 @@ nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=
35
35
  nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py,sha256=ywPGA-3GNsbp3FWFsu04foumM6ZCccRrm73ijS7oY0g,3581
36
36
  nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py,sha256=EOcjyJYAB3TuXewZFld4shnGQUQ9VysjPrIWnmb8zuI,3893
37
37
  nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
38
- nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py,sha256=cdGbLBH0x-4uCdhr6JH_dMVFyBqPODQ5_WYO1otk8tI,4147
38
+ nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py,sha256=K8jase7PD9kd8AuntzjdS1IO4ae8Oo_6byZsFG777D0,6838
39
39
  nv_ingest/framework/orchestration/ray/stages/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
40
40
  nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py,sha256=LnVqBJmpfCmcI-eJLbkwK-7SS-hpEp98P4iCRv_Zhb0,1726
41
41
  nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py,sha256=AhlZUbDK2Jckqnu8hVbJrckW8MsSixfmWc1bst9gRYk,3447
@@ -66,7 +66,7 @@ nv_ingest/framework/orchestration/ray/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EP
66
66
  nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
67
67
  nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py,sha256=AWyCFPP41vp1NOkO2urqm7vh-sTGKypJxwhdq8HxK6Q,50681
68
68
  nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py,sha256=jMYnVe_0rb1OIO9mlB4LH3uXtgaXBbUG-rDPx6fe6J8,10456
69
- nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=3aSYSxyunm-eKUYErDArQTHXSoNKlNJMUr9o5Ui6VTk,14037
69
+ nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=IKQHlEwe0xsjr4MgQJVL0UtnKha1qaoPFc08DF5QzMM,14351
70
70
  nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=ZFJkeJNbDM_GsedUlfk2B8kI93L_MNK6gxPgeryZM6I,21463
71
71
  nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
72
72
  nv_ingest/framework/orchestration/ray/util/system_tools/memory.py,sha256=ICqY0LLB3hFTZk03iX5yffMSKFH2q_aQomtDVzS_mKw,2228
@@ -95,8 +95,8 @@ nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-
95
95
  nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
96
96
  nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
97
97
  nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
98
- nv_ingest-25.6.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
99
- nv_ingest-25.6.1.dist-info/METADATA,sha256=xbfL7int1fiLHvrtcuAUuuXGrDg7FkW8Bt2hdvVyhEQ,15164
100
- nv_ingest-25.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
- nv_ingest-25.6.1.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
102
- nv_ingest-25.6.1.dist-info/RECORD,,
98
+ nv_ingest-25.6.25.dev20250625.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
99
+ nv_ingest-25.6.25.dev20250625.dist-info/METADATA,sha256=uYNf7IuKHG8WrBE5U18jPcdxxF0t6prLbhma8Q8uKvI,15140
100
+ nv_ingest-25.6.25.dev20250625.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
+ nv_ingest-25.6.25.dev20250625.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
102
+ nv_ingest-25.6.25.dev20250625.dist-info/RECORD,,