nv-ingest-api 2025.6.3.dev20250603__py3-none-any.whl → 2025.6.4.dev20250604__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -5,6 +5,8 @@
5
5
  import logging
6
6
 
7
7
  import pandas as pd
8
+ import functools
9
+ import uuid
8
10
  from typing import Any
9
11
  from typing import Dict
10
12
  from typing import Optional
@@ -21,7 +23,7 @@ logger = logging.getLogger(__name__)
21
23
 
22
24
 
23
25
  @unified_exception_handler
24
- def _update_audio_metadata(row: pd.Series, audio_client: Any, trace_info: Dict) -> Dict:
26
+ def _extract_from_audio(row: pd.Series, audio_client: Any, trace_info: Dict, segment_audio: bool = False) -> Dict:
25
27
  """
26
28
  Modifies the metadata of a row if the conditions for table extraction are met.
27
29
 
@@ -56,24 +58,42 @@ def _update_audio_metadata(row: pd.Series, audio_client: Any, trace_info: Dict)
56
58
  base64_audio = metadata.pop("content")
57
59
  content_metadata = metadata.get("content_metadata", {})
58
60
 
59
- # Only modify if content type is audio
61
+ # Only extract transcript if content type is audio
60
62
  if (content_metadata.get("type") != ContentTypeEnum.AUDIO) or (base64_audio in (None, "")):
61
- return metadata
63
+ return [row.to_list()]
62
64
 
63
- # Modify audio metadata with the result from the inference model
64
- audio_result = audio_client.infer(
65
+ # Get the result from the inference model
66
+ segments, transcript = audio_client.infer(
65
67
  base64_audio,
66
68
  model_name="parakeet",
67
69
  trace_info=trace_info, # traceable_func arg
68
70
  stage_name="audio_extraction",
69
71
  )
70
72
 
71
- row["document_type"] = ContentTypeEnum.AUDIO
72
- audio_metadata = {"audio_transcript": audio_result}
73
- metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
74
- row["metadata"] = validate_schema(metadata, MetadataSchema).model_dump()
73
+ extracted_data = []
74
+ if segment_audio:
75
+ for segment in segments:
76
+ segment_metadata = metadata.copy()
77
+ audio_metadata = {"audio_transcript": segment["text"]}
78
+ segment_metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
79
+ segment_metadata["content_metadata"]["start_time"] = segment["start"]
80
+ segment_metadata["content_metadata"]["end_time"] = segment["end"]
81
+
82
+ extracted_data.append(
83
+ [
84
+ ContentTypeEnum.AUDIO,
85
+ validate_schema(segment_metadata, MetadataSchema).model_dump(),
86
+ str(uuid.uuid4()),
87
+ ]
88
+ )
89
+ else:
90
+ audio_metadata = {"audio_transcript": transcript}
91
+ metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
92
+ extracted_data.append(
93
+ [ContentTypeEnum.AUDIO, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]
94
+ )
75
95
 
76
- return metadata
96
+ return extracted_data
77
97
 
78
98
 
79
99
  def extract_text_from_audio_internal(
@@ -121,6 +141,7 @@ def extract_text_from_audio_internal(
121
141
  function_id = extract_params.get("function_id") or audio_extraction_config.function_id
122
142
  use_ssl = extract_params.get("use_ssl") or audio_extraction_config.use_ssl
123
143
  ssl_cert = extract_params.get("ssl_cert") or audio_extraction_config.ssl_cert
144
+ segment_audio = extract_params.get("segment_audio") or audio_extraction_config.segment_audio
124
145
 
125
146
  parakeet_client = create_audio_inference_client(
126
147
  (grpc_endpoint, http_endpoint),
@@ -136,12 +157,27 @@ def extract_text_from_audio_internal(
136
157
  logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
137
158
 
138
159
  try:
139
- # Apply the _update_metadata function to each row in the DataFrame
140
- df_extraction_ledger["metadata"] = df_extraction_ledger.apply(
141
- _update_audio_metadata, axis=1, args=(parakeet_client, execution_trace_log)
160
+ # Create a partial function to extract using the provided configurations.
161
+ _extract_from_audio_partial = functools.partial(
162
+ _extract_from_audio,
163
+ audio_client=parakeet_client,
164
+ trace_info=execution_trace_log,
165
+ segment_audio=segment_audio,
142
166
  )
143
167
 
144
- return df_extraction_ledger, execution_trace_log
168
+ # Apply the _extract_from_audio_partial function to each row in the DataFrame
169
+ extraction_series = df_extraction_ledger.apply(_extract_from_audio_partial, axis=1)
170
+
171
+ # Explode the results if the extraction returns lists.
172
+ extraction_series = extraction_series.explode().dropna()
173
+
174
+ # Convert the extracted results into a DataFrame.
175
+ if not extraction_series.empty:
176
+ extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
177
+ else:
178
+ extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
179
+
180
+ return extracted_df, execution_trace_log
145
181
 
146
182
  except Exception as e:
147
183
  logger.exception(f"Error occurred while extracting audio data: {e}", exc_info=True)
@@ -101,7 +101,7 @@ class ParakeetClient:
101
101
  segments, transcript = process_transcription_response(response)
102
102
  logger.debug("Processing Parakeet inference results (pass-through).")
103
103
 
104
- return transcript
104
+ return segments, transcript
105
105
 
106
106
  def transcribe(
107
107
  self,
@@ -48,6 +48,7 @@ class AudioConfigSchema(BaseModel):
48
48
  function_id: Optional[str] = None
49
49
  use_ssl: Optional[bool] = None
50
50
  ssl_cert: Optional[str] = None
51
+ segment_audio: Optional[bool] = None
51
52
 
52
53
  @root_validator(pre=True)
53
54
  def validate_endpoints(cls, values):
@@ -124,6 +124,7 @@ class IngestTaskAudioExtraction(BaseModelNoExt):
124
124
  function_id: Optional[str] = None
125
125
  use_ssl: Optional[bool] = None
126
126
  ssl_cert: Optional[str] = None
127
+ segment_audio: Optional[bool] = None
127
128
 
128
129
 
129
130
  class IngestTaskTableExtraction(BaseModelNoExt):
@@ -97,6 +97,8 @@ class ContentMetadataSchema(BaseModelNoExt):
97
97
  page_number: int = -1
98
98
  hierarchy: ContentHierarchySchema = ContentHierarchySchema()
99
99
  subtype: Union[ContentTypeEnum, str] = ""
100
+ start_time: int = -1
101
+ end_time: int = -1
100
102
 
101
103
 
102
104
  class TextMetadataSchema(BaseModelNoExt):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.6.3.dev20250603
3
+ Version: 2025.6.4.dev20250604
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -10,7 +10,7 @@ nv_ingest_api/internal/enums/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8y
10
10
  nv_ingest_api/internal/enums/common.py,sha256=HSj7qqNr6KXu_FIyK_Wvel24R-r8lV7dLA173z5XFBc,12321
11
11
  nv_ingest_api/internal/extract/__init__.py,sha256=uLsBITo_XfgbwpzqXUm1IYX6XlZrTfx6T1cIhdILwG8,140
12
12
  nv_ingest_api/internal/extract/audio/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
13
- nv_ingest_api/internal/extract/audio/audio_extraction.py,sha256=L8cK7xB6QTaSx8gsrdyaYHYh0HpW6lycGfduCk7XSMg,5364
13
+ nv_ingest_api/internal/extract/audio/audio_extraction.py,sha256=_jf_UC_FTqZr-xEpwG8edwBzdDjM01gGhqm9ulOsDcY,6973
14
14
  nv_ingest_api/internal/extract/docx/__init__.py,sha256=HIHfzSig66GT0Uk8qsGBm_f13fKYcPtItBicRUWOOVA,183
15
15
  nv_ingest_api/internal/extract/docx/docx_extractor.py,sha256=jjbL12F5dtpbqHRbhL0uomSiQ90bcQq3N7M43XYsq34,8356
16
16
  nv_ingest_api/internal/extract/docx/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -57,7 +57,7 @@ nv_ingest_api/internal/primitives/nim/model_interface/deplot.py,sha256=TvKdk6PTu
57
57
  nv_ingest_api/internal/primitives/nim/model_interface/helpers.py,sha256=x35a9AyTYxpESQflLo_YnhVOKblQKVen6vGGFaXmNiE,9927
58
58
  nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py,sha256=MFWPqMTXs_MZG3ripRR21o7f_mVeoE46Q10yvJ8KNr0,7023
59
59
  nv_ingest_api/internal/primitives/nim/model_interface/paddle.py,sha256=rSUPwl5XOrqneoS6aKhatVjrNBg_LhP3nwUWS_aTwz0,17950
60
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=OYg4AGki_wm--Np9VlSm0eZC-r54GbDOISbe9v0B9fw,12967
60
+ nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py,sha256=5PqD2JuHY2rwd-6SSB4axr2Dd79vm95sAEkcmI3U7ME,12977
61
61
  nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py,sha256=lFhppNqrq5X_fzbCWKphvZQMzaJd3gHrkWsyJORzFrU,5010
62
62
  nv_ingest_api/internal/primitives/nim/model_interface/vlm.py,sha256=qJ382PU1ZrIM-SR3cqIhtY_W2rmHec2HIa2aUB2SvaU,6031
63
63
  nv_ingest_api/internal/primitives/nim/model_interface/yolox.py,sha256=uYXqdvqgkyS4Yfr9ZoikRDX4e94OV3ch3Xhv3JVg-3s,49581
@@ -67,7 +67,7 @@ nv_ingest_api/internal/primitives/tracing/logging.py,sha256=SSzIgS7afLH-e1C7VagY
67
67
  nv_ingest_api/internal/primitives/tracing/tagging.py,sha256=O5dD7Z7j43nrjqn0AxhxOPm5zIyMFo0akxaWU_FguAM,7866
68
68
  nv_ingest_api/internal/schemas/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
69
69
  nv_ingest_api/internal/schemas/extract/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
70
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=VVppZgV1lnyJCTfADexzoj3V0lOSq3t6Dw_6VhIxZ7k,3771
70
+ nv_ingest_api/internal/schemas/extract/extract_audio_schema.py,sha256=W-nEBriqiNkjpaQ5AT_8LhtVXlW8AhlcftmoeQQtKAs,3812
71
71
  nv_ingest_api/internal/schemas/extract/extract_chart_schema.py,sha256=iu8lHQC0zbBB9VRK7PZisAVzpeSpFqjcXRAnwZ9OzoM,4301
72
72
  nv_ingest_api/internal/schemas/extract/extract_docx_schema.py,sha256=M2N7WjMNvSemHcJHWeNUD_kFG0wC5VE2W3K6SVrJqvA,3761
73
73
  nv_ingest_api/internal/schemas/extract/extract_html_schema.py,sha256=lazpONTGZ6Fl420BGBAr6rogFGtlzBiZTc1uA694OIs,841
@@ -82,8 +82,8 @@ nv_ingest_api/internal/schemas/message_brokers/request_schema.py,sha256=LZX_wXDx
82
82
  nv_ingest_api/internal/schemas/message_brokers/response_schema.py,sha256=4b275HlzBSzpmuE2wdoeaGKPCdKki3wuWldtRIfrj8w,727
83
83
  nv_ingest_api/internal/schemas/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
84
84
  nv_ingest_api/internal/schemas/meta/base_model_noext.py,sha256=8hXU1uuiqZ6t8EsoZ8vlC5EFf2zSZrKEX133FcfZMwI,316
85
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=pkMOKIyfAmVcimqZ-zDXngW_lzYxdXYHv8C8cdduUvA,8083
86
- nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=_FAE-yeb01hxq05SXrV3NLM4DPUPSfnIbH6ZMliWsEg,6625
85
+ nv_ingest_api/internal/schemas/meta/ingest_job_schema.py,sha256=JqlK66fg_eRQE9kwzAX4b15FqOSbpgrzLWkcWrWbKdE,8124
86
+ nv_ingest_api/internal/schemas/meta/metadata_schema.py,sha256=KqCUMeBNNgPtpBWzzNhsMtlfr_XvUxnALBbYkI-kfT4,6673
87
87
  nv_ingest_api/internal/schemas/mutate/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
88
88
  nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py,sha256=k1JOdlPPpsipc0XhHf-9YxJ_-W0HvpVE1ZhYmr7fzj0,395
89
89
  nv_ingest_api/internal/schemas/store/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -150,8 +150,8 @@ nv_ingest_api/util/service_clients/rest/rest_client.py,sha256=dZ-jrk7IK7oNtHoXFS
150
150
  nv_ingest_api/util/string_processing/__init__.py,sha256=mkwHthyS-IILcLcL1tJYeF6mpqX3pxEw5aUzDGjTSeU,1411
151
151
  nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
152
152
  nv_ingest_api/util/system/hardware_info.py,sha256=ORZeKpH9kSGU_vuPhyBwkIiMyCViKUX2CP__MCjrfbU,19463
153
- nv_ingest_api-2025.6.3.dev20250603.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
154
- nv_ingest_api-2025.6.3.dev20250603.dist-info/METADATA,sha256=A_3Cb69Ptv-NHBK1-GtK-W1txbWwJcQ0c1-11_gknDc,13918
155
- nv_ingest_api-2025.6.3.dev20250603.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
156
- nv_ingest_api-2025.6.3.dev20250603.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
157
- nv_ingest_api-2025.6.3.dev20250603.dist-info/RECORD,,
153
+ nv_ingest_api-2025.6.4.dev20250604.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
154
+ nv_ingest_api-2025.6.4.dev20250604.dist-info/METADATA,sha256=kDYJOI53puRP8V4DtVLQ1knOiOeqXsnzArjuqywQhNs,13918
155
+ nv_ingest_api-2025.6.4.dev20250604.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
156
+ nv_ingest_api-2025.6.4.dev20250604.dist-info/top_level.txt,sha256=abjYMlTJGoG5tOdfIB-IWvLyKclw6HLaRSc8MxX4X6I,14
157
+ nv_ingest_api-2025.6.4.dev20250604.dist-info/RECORD,,