nv-ingest-api 2025.6.15.dev20250615__py3-none-any.whl → 2025.6.17.dev20250617__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/internal/transform/split_text.py +19 -5
- {nv_ingest_api-2025.6.15.dev20250615.dist-info → nv_ingest_api-2025.6.17.dev20250617.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.6.15.dev20250615.dist-info → nv_ingest_api-2025.6.17.dev20250617.dist-info}/RECORD +6 -6
- {nv_ingest_api-2025.6.15.dev20250615.dist-info → nv_ingest_api-2025.6.17.dev20250617.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.6.15.dev20250615.dist-info → nv_ingest_api-2025.6.17.dev20250617.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.6.15.dev20250615.dist-info → nv_ingest_api-2025.6.17.dev20250617.dist-info}/top_level.txt +0 -0
|
@@ -31,9 +31,16 @@ def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]:
|
|
|
31
31
|
metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {}
|
|
32
32
|
metadata = copy.deepcopy(metadata)
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
34
|
+
if row.document_type == ContentTypeEnum.AUDIO:
|
|
35
|
+
metadata["audio_metadata"]["audio_transcript"] = text
|
|
36
|
+
documents.append(
|
|
37
|
+
{"document_type": ContentTypeEnum.AUDIO.value, "metadata": metadata, "uuid": str(uuid.uuid4())}
|
|
38
|
+
)
|
|
39
|
+
else:
|
|
40
|
+
metadata["content"] = text
|
|
41
|
+
documents.append(
|
|
42
|
+
{"document_type": ContentTypeEnum.TEXT.value, "metadata": metadata, "uuid": str(uuid.uuid4())}
|
|
43
|
+
)
|
|
37
44
|
|
|
38
45
|
return documents
|
|
39
46
|
|
|
@@ -118,7 +125,7 @@ def transform_text_split_and_tokenize_internal(
|
|
|
118
125
|
)
|
|
119
126
|
|
|
120
127
|
# Filter to documents with text content.
|
|
121
|
-
text_type_condition = df_transform_ledger["document_type"] == ContentTypeEnum.TEXT
|
|
128
|
+
text_type_condition = df_transform_ledger["document_type"].isin([ContentTypeEnum.TEXT, ContentTypeEnum.AUDIO])
|
|
122
129
|
|
|
123
130
|
normalized_meta_df = pd.json_normalize(df_transform_ledger["metadata"], errors="ignore")
|
|
124
131
|
if "source_metadata.source_type" in normalized_meta_df.columns:
|
|
@@ -147,7 +154,14 @@ def transform_text_split_and_tokenize_internal(
|
|
|
147
154
|
|
|
148
155
|
split_docs: List[Dict[str, Any]] = []
|
|
149
156
|
for _, row in df_filtered.iterrows():
|
|
150
|
-
|
|
157
|
+
if row["document_type"] == ContentTypeEnum.AUDIO:
|
|
158
|
+
content: str = (
|
|
159
|
+
row["metadata"]["audio_metadata"]["audio_transcript"]
|
|
160
|
+
if row["metadata"]["audio_metadata"]["audio_transcript"] is not None
|
|
161
|
+
else ""
|
|
162
|
+
)
|
|
163
|
+
else:
|
|
164
|
+
content: str = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
|
|
151
165
|
chunks: List[str] = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap)
|
|
152
166
|
split_docs.extend(_build_split_documents(row, chunks))
|
|
153
167
|
|
|
@@ -100,7 +100,7 @@ nv_ingest_api/internal/store/image_upload.py,sha256=GNlY4k3pfcHv3lzXxkbmGLeHFsf9
|
|
|
100
100
|
nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
101
101
|
nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
|
|
102
102
|
nv_ingest_api/internal/transform/embed_text.py,sha256=F8kg-WXihtuUMwDQUUYjnfGDCdQp1Mkd-jeThOiJT0s,16507
|
|
103
|
-
nv_ingest_api/internal/transform/split_text.py,sha256
|
|
103
|
+
nv_ingest_api/internal/transform/split_text.py,sha256=-kwpRWSVZrPldm1hn3-tVz_TkzuKM-kPvNU3HTp9zOY,7476
|
|
104
104
|
nv_ingest_api/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
|
|
105
105
|
nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
106
106
|
nv_ingest_api/util/control_message/validators.py,sha256=KvvbyheJ5rbzvJbH9JKpMR9VfoI0b0uM6eTAZte1p44,1315
|
|
@@ -150,8 +150,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
|
|
|
150
150
|
nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
|
|
151
151
|
nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
152
152
|
nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
|
|
153
|
-
nv_ingest_api-2025.6.
|
|
154
|
-
nv_ingest_api-2025.6.
|
|
155
|
-
nv_ingest_api-2025.6.
|
|
156
|
-
nv_ingest_api-2025.6.
|
|
157
|
-
nv_ingest_api-2025.6.
|
|
153
|
+
nv_ingest_api-2025.6.17.dev20250617.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
154
|
+
nv_ingest_api-2025.6.17.dev20250617.dist-info/METADATA,sha256=xF0jbzU0P1ZUHkkC0zAMq6sImG6D-oGUHh0gWQ89cFk,13919
|
|
155
|
+
nv_ingest_api-2025.6.17.dev20250617.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
156
|
+
nv_ingest_api-2025.6.17.dev20250617.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
|
|
157
|
+
nv_ingest_api-2025.6.17.dev20250617.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|