nv-ingest-api 2025.6.16.dev20250616__py3-none-any.whl → 2025.6.24.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api has been flagged as potentially problematic; see the source registry page for details.

nv_ingest_api/internal/transform/split_text.py

@@ -31,9 +31,16 @@ def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]:
         metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {}
         metadata = copy.deepcopy(metadata)
 
-        metadata["content"] = text
-
-        documents.append({"document_type": ContentTypeEnum.TEXT.value, "metadata": metadata, "uuid": str(uuid.uuid4())})
+        if row.document_type == ContentTypeEnum.AUDIO:
+            metadata["audio_metadata"]["audio_transcript"] = text
+            documents.append(
+                {"document_type": ContentTypeEnum.AUDIO.value, "metadata": metadata, "uuid": str(uuid.uuid4())}
+            )
+        else:
+            metadata["content"] = text
+            documents.append(
+                {"document_type": ContentTypeEnum.TEXT.value, "metadata": metadata, "uuid": str(uuid.uuid4())}
+            )
 
     return documents
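For context, this first hunk makes the chunk rebuilder type-aware: when the source row is audio, each chunk is written back into metadata["audio_metadata"]["audio_transcript"] and the emitted record keeps the AUDIO document type; every other row keeps the previous behavior of writing the chunk into metadata["content"] as TEXT. Below is a minimal, self-contained sketch of that branch; the ContentTypeEnum stand-in, the SimpleNamespace row, and the sample values are assumptions for illustration and not the package's actual objects.

# Hypothetical, self-contained sketch of the new per-type branch in
# _build_split_documents; the real ContentTypeEnum and ledger rows live in
# nv_ingest_api and may differ in detail.
import copy
import uuid
from enum import Enum
from types import SimpleNamespace


class ContentTypeEnum(str, Enum):  # stand-in for the package's enum
    TEXT = "text"
    AUDIO = "audio"


def build_split_documents(row, chunks):
    documents = []
    for text in chunks:
        metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {}
        metadata = copy.deepcopy(metadata)
        if row.document_type == ContentTypeEnum.AUDIO:
            # Audio rows: the chunk becomes the transcript of the emitted record.
            metadata["audio_metadata"]["audio_transcript"] = text
            doc_type = ContentTypeEnum.AUDIO.value
        else:
            # Everything else keeps the pre-existing behavior.
            metadata["content"] = text
            doc_type = ContentTypeEnum.TEXT.value
        documents.append({"document_type": doc_type, "metadata": metadata, "uuid": str(uuid.uuid4())})
    return documents


audio_row = SimpleNamespace(
    document_type=ContentTypeEnum.AUDIO,
    metadata={"audio_metadata": {"audio_transcript": "full transcript ..."}},
)
print(build_split_documents(audio_row, ["chunk one", "chunk two"]))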
 
@@ -118,7 +125,7 @@ def transform_text_split_and_tokenize_internal(
     )
 
     # Filter to documents with text content.
-    text_type_condition = df_transform_ledger["document_type"] == ContentTypeEnum.TEXT
+    text_type_condition = df_transform_ledger["document_type"].isin([ContentTypeEnum.TEXT, ContentTypeEnum.AUDIO])
 
     normalized_meta_df = pd.json_normalize(df_transform_ledger["metadata"], errors="ignore")
     if "source_metadata.source_type" in normalized_meta_df.columns:
@@ -147,7 +154,14 @@ def transform_text_split_and_tokenize_internal(
 
     split_docs: List[Dict[str, Any]] = []
     for _, row in df_filtered.iterrows():
-        content: str = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
+        if row["document_type"] == ContentTypeEnum.AUDIO:
+            content: str = (
+                row["metadata"]["audio_metadata"]["audio_transcript"]
+                if row["metadata"]["audio_metadata"]["audio_transcript"] is not None
+                else ""
+            )
+        else:
+            content: str = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
         chunks: List[str] = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap)
         split_docs.extend(_build_split_documents(row, chunks))
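The remaining two hunks widen the splitter's input selection and tell it where to find splittable text for audio rows: the ledger filter now admits both TEXT and AUDIO rows via pandas .isin, and the per-row lookup reads the transcript (falling back to an empty string when it is None) instead of metadata["content"]. The toy walkthrough below illustrates those two steps on a three-row frame; the IMAGE enum member and the sample data are assumptions added purely to show a row being filtered out.

# Toy illustration of the widened filter and the per-type content lookup;
# column names mirror the diff, everything else is made up for the example.
import pandas as pd
from enum import Enum


class ContentTypeEnum(str, Enum):  # stand-in; IMAGE exists only to show a filtered-out row
    TEXT = "text"
    AUDIO = "audio"
    IMAGE = "image"


df_transform_ledger = pd.DataFrame(
    {
        "document_type": [ContentTypeEnum.TEXT, ContentTypeEnum.AUDIO, ContentTypeEnum.IMAGE],
        "metadata": [
            {"content": "plain text body"},
            {"audio_metadata": {"audio_transcript": "spoken words ..."}},
            {"content": None},
        ],
    }
)

# The filter now keeps audio rows alongside text rows.
text_type_condition = df_transform_ledger["document_type"].isin([ContentTypeEnum.TEXT, ContentTypeEnum.AUDIO])
df_filtered = df_transform_ledger[text_type_condition]

# Audio rows read their splittable text from the transcript field, with an
# empty-string fallback when the transcript is missing.
for _, row in df_filtered.iterrows():
    if row["document_type"] == ContentTypeEnum.AUDIO:
        transcript = row["metadata"]["audio_metadata"]["audio_transcript"]
        content = transcript if transcript is not None else ""
    else:
        content = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
    print(row["document_type"].value, "->", content)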
 
nv_ingest_api-2025.6.24.dev20250625.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.6.16.dev20250616
+Version: 2025.6.24.dev20250625
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
nv_ingest_api-2025.6.24.dev20250625.dist-info/RECORD

@@ -100,7 +100,7 @@ nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9
 nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
 nv_ingest_api/internal/transform/embed_text.py,sha256=F8kg-WXihtuUMwDQUUYjnfGDCdQp1Mkd-jeThOiJT0s,16507
-nv_ingest_api/internal/transform/split_text.py,sha256=DlVoyHLqZ-6_FiWwZmofPcq7TX8Ta23hIE0St9tw1IY,6822
+nv_ingest_api/internal/transform/split_text.py,sha256=-kwpRWSVZrPldm1hn3-tVz_TkzuKM-kPvNU3HTp9zOY,7476
 nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/control_message/validators.py,sha256=KvvbyheJ5rbzvJbH9JKpMR9VfoI0b0uM6eTAZte1p44,1315
@@ -150,8 +150,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
 nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
 nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
-nv_ingest_api-2025.6.16.dev20250616.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest_api-2025.6.16.dev20250616.dist-info/METADATA,sha256=nrAshcKmKQTsbPtJTNzrdu37KttrTjLkAhUybCMLrV8,13919
-nv_ingest_api-2025.6.16.dev20250616.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest_api-2025.6.16.dev20250616.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
-nv_ingest_api-2025.6.16.dev20250616.dist-info/RECORD,,
+nv_ingest_api-2025.6.24.dev20250625.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_api-2025.6.24.dev20250625.dist-info/METADATA,sha256=jnzs5A1QthJjwPAHRU2Z4PmVEAgddnZMiXNX5W-sMjI,13919
+nv_ingest_api-2025.6.24.dev20250625.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_api-2025.6.24.dev20250625.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
+nv_ingest_api-2025.6.24.dev20250625.dist-info/RECORD,,