nv-ingest-api 2025.4.24.dev20250424__py3-none-any.whl → 2025.4.25.dev20250425__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -453,13 +453,16 @@ def transform_create_text_embeddings_internal(
453
453
 
454
454
  logger.debug("Generating text embeddings for supported content types: TEXT, STRUCTURED, IMAGE.")
455
455
 
456
+ def _content_type_getter(row):
457
+ return row["content_metadata"]["type"]
458
+
456
459
  # Process each supported content type.
457
460
  for content_type, content_getter in pandas_content_extractor.items():
458
461
  if not content_getter:
459
462
  logger.debug(f"Skipping unsupported content type: {content_type}")
460
463
  continue
461
464
 
462
- content_mask = df_transform_ledger["document_type"] == content_type.value
465
+ content_mask = df_transform_ledger["metadata"].apply(_content_type_getter) == content_type.value
463
466
  if not content_mask.any():
464
467
  continue
465
468
 
@@ -7,7 +7,7 @@ from nv_ingest_api.internal.enums.common import ContentTypeEnum
7
7
  DOC_TO_CONTENT_MAP = {
8
8
  DocumentTypeEnum.BMP: ContentTypeEnum.IMAGE,
9
9
  DocumentTypeEnum.DOCX: ContentTypeEnum.STRUCTURED,
10
- DocumentTypeEnum.HTML: ContentTypeEnum.STRUCTURED,
10
+ DocumentTypeEnum.HTML: ContentTypeEnum.TEXT,
11
11
  DocumentTypeEnum.JPEG: ContentTypeEnum.IMAGE,
12
12
  DocumentTypeEnum.MP3: ContentTypeEnum.AUDIO,
13
13
  DocumentTypeEnum.PDF: ContentTypeEnum.STRUCTURED,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.4.24.dev20250424
3
+ Version: 2025.4.25.dev20250425
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -96,7 +96,7 @@ nv_ingest_api/internal/store/embed_text_upload.py,sha256=maxb4FPsBvWgvlrjAPEBlRZ
96
96
  nv_ingest_api/internal/store/image_upload.py,sha256=J5EHNng7Z5I6M4f3UcbniKQB29Scr3Qe05wsBpaVXds,9653
97
97
  nv_ingest_api/internal/transform/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
98
98
  nv_ingest_api/internal/transform/caption_image.py,sha256=0ILCG2F8ESqKtZiPUM-6F1BHUflFZ76Dzi2GNzkE-lU,8517
99
- nv_ingest_api/internal/transform/embed_text.py,sha256=MACFgVHUxK3aVlEmymF7F4pT_aKoCcOKxrmmHalk1f0,15622
99
+ nv_ingest_api/internal/transform/embed_text.py,sha256=W-RKM9D9yQy8WAKiiqpkHESpLsL70SacvHwU9EdMeqs,15728
100
100
  nv_ingest_api/internal/transform/split_text.py,sha256=y6NYRkCEVpVsDu-AqrKx2D6JPp1vwxclw9obNZNJIIs,6561
101
101
  nv_ingest_api/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
102
  nv_ingest_api/util/control_message/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -107,7 +107,7 @@ nv_ingest_api/util/converters/containers.py,sha256=RV1ooujhq6dVujAzC0MIdZvpOZyQq
107
107
  nv_ingest_api/util/converters/datetools.py,sha256=9tskk4BkdLOMLI9ejXvRmri-otE1_Ast3oyX3HQoJZc,2579
108
108
  nv_ingest_api/util/converters/dftools.py,sha256=FjHjazIeiUd1LdFwWuummJmraqZe1a90YrWzSjZKzB4,3284
109
109
  nv_ingest_api/util/converters/formats.py,sha256=L11FtormO2SeHSebbwsGE_uuCv6Jk0D3VvVW2avU0vI,2258
110
- nv_ingest_api/util/converters/type_mappings.py,sha256=VFVK5IXfnXJjG4ijDTzKSEZQvJ7xK6iO8snsLgG2vv8,1108
110
+ nv_ingest_api/util/converters/type_mappings.py,sha256=5TVXRyU6BlQvFOdqknEuQw3ss4PXeCvSUynJnjvgQpA,1102
111
111
  nv_ingest_api/util/detectors/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
112
112
  nv_ingest_api/util/detectors/language.py,sha256=TvzcESYY0bn0U4aLY6GjB4VaCWA6XrXxAGZbVzHTMuE,965
113
113
  nv_ingest_api/util/exception_handlers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -145,8 +145,8 @@ nv_ingest_api/util/service_clients/redis/redis_client.py,sha256=Xa9eeI3kfDBDlLsG
145
145
  nv_ingest_api/util/service_clients/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=tjh9BYPscMQ1-IubbI7O7MWwvga5ZYb7OJu65JtPgGw,21732
147
147
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
148
- nv_ingest_api-2025.4.24.dev20250424.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
149
- nv_ingest_api-2025.4.24.dev20250424.dist-info/METADATA,sha256=MG9QcsfeRIGsKly5b9yFcwRO7VZf21vQwUfKgDQVBoc,13889
150
- nv_ingest_api-2025.4.24.dev20250424.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
151
- nv_ingest_api-2025.4.24.dev20250424.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
152
- nv_ingest_api-2025.4.24.dev20250424.dist-info/RECORD,,
148
+ nv_ingest_api-2025.4.25.dev20250425.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
149
+ nv_ingest_api-2025.4.25.dev20250425.dist-info/METADATA,sha256=d314O-lCnJ7Hc1ZoIGT9AeQBDvAlYhFhUEVGFU0EzLQ,13889
150
+ nv_ingest_api-2025.4.25.dev20250425.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
151
+ nv_ingest_api-2025.4.25.dev20250425.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
152
+ nv_ingest_api-2025.4.25.dev20250425.dist-info/RECORD,,