nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,9 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2025, NVIDIA CORPORATION.
6
+
7
+ from nv_ingest_api.util.dataloader.dataloader import DataLoader, MediaInterface
8
+
9
+ __all__ = ["DataLoader", "MediaInterface"]
@@ -0,0 +1,409 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2025, NVIDIA CORPORATION.
6
+ from pathlib import Path
7
+ from abc import ABC, abstractmethod
8
+ import queue
9
+ import threading
10
+ import subprocess
11
+ import json
12
+ import logging
13
+ import math
14
+ import importlib.util
15
+ from enum import Enum
16
+ from concurrent.futures import ThreadPoolExecutor
17
+ from tqdm import tqdm
18
+ import os
19
+ import glob
20
+
21
+ from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
22
+
23
logger = logging.getLogger(__name__)

# Optional dependency guard: the loader needs BOTH the `ffmpeg-python` bindings and the `ffmpeg`
# executable. On any failure `ffmpeg` is bound to None so the rest of the module can degrade gracefully.
try:
    # find_spec() returns None (it does not raise) when the package is missing, so check explicitly.
    if importlib.util.find_spec("ffmpeg") is None:
        raise ImportError("python package `ffmpeg` (ffmpeg-python) is not installed")
    # check=True turns a broken executable (non-zero exit status) into an exception as well.
    subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    import ffmpeg

    # The unrelated PyPI package named `ffmpeg` installs the same module name but has no `input()`.
    if not hasattr(ffmpeg, "input"):
        raise ImportError("the installed `ffmpeg` module is not ffmpeg-python")
except Exception:
    logger.error(
        "Unable to load the Dataloader, ffmpeg was not installed, "
        "please install it using `pip install ffmpeg-python` and `apt-get install ffmpeg`"
    )
    ffmpeg = None
36
+
37
+ if not ffmpeg:
38
+ DataLoader = None
39
+ MediaInterface = None
40
+ else:
41
+
42
class SplitType(Enum):
    """Unit in which a split interval is expressed when a media file is cut into chunks."""

    # Interval is divided by the stream's sample/frame rate to obtain the seconds per chunk.
    FRAME = "frame"
    # Interval is compared against the media duration.
    TIME = "time"
    # Interval is compared against the file size in bytes.
    SIZE = "size"
46
+
47
class LoaderInterface(ABC):
    """Abstract base class for the splitter objects accepted by `DataLoader(interface=...)`."""

    @abstractmethod
    def split(self, input_path: str, output_dir: str, split_interval: int = 0):
        """
        Split `input_path` into chunks under `output_dir`.

        NOTE(review): `DataLoader._process` additionally passes `split_type`, `video_audio_separate`
        and `audio_only` keywords and calls `probe_media`, none of which are declared here — confirm
        whether implementations are expected to provide them.
        """
        pass

    @abstractmethod
    def _get_path_metadata(self, path: str = None):
        """Presumably returns the metadata recorded for `path` (all paths when None) — verify per implementation."""
        pass
56
+
57
def _probe(filename, format=None, file_handle=None, timeout=None, **kwargs):
    """
    Run `ffprobe -show_format -show_streams -of json` and return the decoded JSON report.

    filename: str, path handed to ffprobe; ignored when `file_handle` is given
    format: unused, accepted only so existing callers keep working
    file_handle: payload written to ffprobe's stdin (the process is opened in binary mode, so this
        must be bytes-like despite the name); when given, ffprobe reads from `pipe:`
    timeout: seconds to wait for ffprobe, None waits indefinitely
    kwargs: extra ffprobe options, converted to command line flags by ffmpeg-python

    Raises `ffmpeg.Error` when ffprobe exits with a non-zero status and
    `subprocess.TimeoutExpired` when `timeout` elapses.
    """
    args = ["ffprobe", "-show_format", "-show_streams", "-of", "json"]
    args += ffmpeg._utils.convert_kwargs_to_cmd_line_args(kwargs)
    args.append("pipe:" if file_handle else filename)

    process = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        out, err = process.communicate(input=file_handle if file_handle else None, timeout=timeout)
    except subprocess.TimeoutExpired:
        # communicate() does not terminate the child on timeout; kill and reap it so neither the
        # process nor its pipes are leaked, then let the caller see the timeout.
        process.kill()
        process.communicate()
        raise
    if process.returncode != 0:
        raise ffmpeg._run.Error("ffprobe", out, err)
    return json.loads(out.decode("utf-8"))
74
+
75
def _get_audio_from_video(input_path: str, output_file: str, cache_path: str = None):
    """
    Extract the audio track of a video file into an mp3 file. If extraction fails, return None.

    input_path: str, path to the video file
    output_file: str, path of the mp3 file to write (parent directories are created, an existing
        file is overwritten)
    cache_path: str, currently unused

    Returns the output file as a `Path`, or None when ffmpeg reports an error.
    """
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        (
            ffmpeg.input(str(input_path))
            .output(str(output_path), acodec="libmp3lame", map="0:a")
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # ffmpeg output is not guaranteed to be valid UTF-8; never let the error report itself raise.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
        logger.error(f"FFmpeg error for file {input_path}: {stderr_text}")
        return None
    return output_path
96
+
97
def strip_audio_from_video_files(
    input_path: str,
    output_dir: str,
    cache_path: str = None,
    file_type=".mp4",
    max_workers: int = 15,
):
    """
    Extract the audio track of one video file, or of every `*{file_type}` file in a directory,
    and return the paths of the mp3 files that were written.

    input_path: str, path to a video file or to a directory of video files
    output_dir: str, path to the output directory (created when missing)
    cache_path: str, currently unused
    file_type: str, suffix of the files to pick up when `input_path` is a directory
    max_workers: int, number of extractions run concurrently

    Returns a list of str paths, one per successful extraction. Files whose extraction failed are
    logged and left out instead of being reported as the string "None".
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    path = Path(input_path)
    if path.is_file():
        files = [path]
    else:
        # Escape the directory part so names containing glob metacharacters ("[", "*", "?") still match.
        pattern = os.path.join(glob.escape(str(path)), f"*{file_type}")
        files = sorted(Path(match) for match in glob.glob(pattern))
    if not files:
        return []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(_get_audio_from_video, file, output_path / f"{file.stem}.mp3") for file in files]
        extracted = [future.result() for future in tqdm(futures)]

    results = []
    for file, audio_path in zip(files, extracted):
        if audio_path is None:
            logger.error(f"Failed to extract audio from {file}")
        else:
            results.append(str(audio_path))
    return results
115
+
116
class MediaInterface(LoaderInterface):
    """
    ffmpeg/ffprobe backed splitter for audio and video files.

    The ffprobe report of every successfully split input is kept in `path_metadata`, keyed by the
    path that was handed to ffmpeg.
    """

    # Suffixes treated as video containers when deciding whether audio has to be extracted.
    _VIDEO_SUFFIXES = (".mp4", ".mov", ".avi", ".mkv")

    def __init__(self):
        self.path_metadata = {}

    @staticmethod
    def _parse_frame_rate(rate) -> float:
        """Convert an ffprobe rational such as "30000/1001" to frames per second ("0/0" gives 0.0)."""
        numerator, _, denominator = str(rate).partition("/")
        if not denominator:
            return float(numerator)
        divisor = float(denominator)
        return float(numerator) / divisor if divisor else 0.0

    @staticmethod
    def _remove_files(paths):
        """Delete the intermediate files in `paths` that exist."""
        for to_remove in paths:
            to_remove = Path(to_remove)
            if to_remove.is_file():
                logger.debug(f"Removing file {to_remove}")
                to_remove.unlink()

    def probe_media(self, path_file: Path, split_interval: int, split_type: SplitType, file_handle=None):
        """
        Probe a media file and work out into how many chunks it has to be split.

        path_file: Path, file to probe (its size on disk is always read from this path)
        split_interval: int, chunk size expressed in the unit selected by `split_type`
        split_type: SplitType, unit of `split_interval`
        file_handle: optional bytes payload piped to ffprobe instead of letting it read `path_file`

        Returns `(probe, num_splits, duration)`. Probing problems are logged, not raised; the values
        that could not be determined are returned as None.
        """
        num_splits = None
        duration = None
        probe = None
        sample_rate = None
        try:
            file_size = path_file.stat().st_size  # in bytes
            if file_handle:
                probe = _probe("pipe:", format=path_file.suffix, file_handle=file_handle)
            else:
                probe = _probe(str(path_file), format=path_file.suffix)
            first_stream = probe["streams"][0]
            codec_type = first_stream["codec_type"]
            if codec_type == "video":
                # avg_frame_rate is a rational ("30000/1001"); using only the numerator would
                # report 30000 fps and wreck FRAME based splitting.
                sample_rate = self._parse_frame_rate(first_stream["avg_frame_rate"])
                duration = float(probe["format"]["duration"])
            elif codec_type == "audio":
                sample_rate = float(first_stream["sample_rate"])
                duration = (file_size * 8) / float(probe["format"]["bit_rate"])
            num_splits = self.find_num_splits(file_size, sample_rate, duration, split_interval, split_type)
        except ffmpeg.Error as e:
            stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
            logger.error(f"FFmpeg error for file {path_file}: {stderr_text}")
        except (KeyError, IndexError) as e:
            logger.error(f"Unexpected ffprobe output for file {path_file}: missing {e!r}")
        except (ValueError, ZeroDivisionError) as e:
            logger.error(f"Error finding number of splits for file {path_file}: {e}")
        return probe, num_splits, duration

    def get_audio_from_video(self, input_path: str, output_file: str, cache_path: str = None):
        """Extract the audio track of `input_path` into `output_file`; returns its Path or None on failure."""
        return _get_audio_from_video(input_path, output_file, cache_path)

    def split(
        self,
        input_path: str,
        output_dir: str,
        split_interval: int = 0,
        split_type: SplitType = SplitType.SIZE,
        cache_path: str = None,
        video_audio_separate: bool = False,
        audio_only: bool = False,
    ):
        """
        Split a media file into smaller chunks of `split_interval` size. If
        video_audio_separate is True and the file is a video, the audio of every chunk is
        additionally extracted into an mp3 file next to it.

        input_path: str or Path, path to the media file
        output_dir: str, path to the output directory (created when missing)
        split_interval: the size of the chunk to split the media file into depending on the split type
        split_type: SplitType, type of split to perform, either size, time, or frame
        cache_path: str, forwarded to the audio extraction; defaults to `output_dir`
        video_audio_separate: bool, whether to also write the audio of each video chunk
        audio_only: bool, whether to extract the audio of a video first and split only that

        Returns a list of str paths: the chunk files, followed by the extracted audio files when
        `video_audio_separate` applies. An empty list is returned (and the reason logged) when the
        file cannot be probed, the split count cannot be determined, or ffmpeg fails.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        original_input_path = input_path
        files_to_remove = []
        if audio_only and Path(input_path).suffix in self._VIDEO_SUFFIXES:
            # `input_path` may be a plain str, so go through Path to obtain the stem.
            extracted = self.get_audio_from_video(input_path, output_dir / f"{Path(input_path).stem}.mp3")
            if extracted is None:
                logger.error(f"Failed to extract audio from {original_input_path}")
                return []
            input_path = extracted
            files_to_remove.append(extracted)
        try:
            return self._split_media(
                input_path,
                original_input_path,
                output_dir,
                split_interval,
                split_type,
                cache_path if cache_path else output_dir,
                video_audio_separate,
            )
        finally:
            # The intermediate mp3 is removed on every exit path, including failures.
            self._remove_files(files_to_remove)

    def _split_media(
        self,
        input_path,
        original_input_path,
        output_dir: Path,
        split_interval: int,
        split_type: SplitType,
        cache_path,
        video_audio_separate: bool,
    ):
        """Run the ffmpeg segment muxer on `input_path` and return the expected chunk paths."""
        path_file = Path(input_path)
        file_name = path_file.stem
        suffix = path_file.suffix
        output_pattern = output_dir / f"{file_name}_chunk_%04d{suffix}"

        probe, num_splits, duration = self.probe_media(path_file, split_interval, split_type)
        if not num_splits or duration is None or duration <= 0:
            # probe_media already logged the root cause; without these values no segment
            # length can be computed.
            logger.error(f"Unable to determine how to split {original_input_path}")
            return []

        segment_time = math.ceil(duration / num_splits)
        output_kwargs = {
            "f": "segment",
            "segment_time": segment_time,
            "c": "copy",
            "map": "0",
            # 20% of the effective core count (10% of the hardware threads when each core
            # has 2 threads), but at least 4 threads
            "threads": int(max(SystemResourceProbe().get_effective_cores() * 0.2, 4)),
        }
        if suffix == ".mp4":
            output_kwargs.update(
                {
                    "force_key_frames": f"expr:gte(t,n_forced*{segment_time})",
                    "crf": 22,
                    "g": 50,
                    "sc_threshold": 0,
                }
            )
        try:
            capture_output, capture_error = (
                ffmpeg.input(str(input_path))
                .output(str(output_pattern), **output_kwargs)
                .run(capture_stdout=True, capture_stderr=True)
            )
        except ffmpeg.Error as e:
            # Only the exception carries ffmpeg's output here: nothing was returned by run().
            stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
            logger.error(f"FFmpeg error for file {original_input_path}: {stderr_text}")
            return []

        logger.debug(f"Split {input_path} into {num_splits} chunks")
        self.path_metadata[input_path] = probe
        logger.debug(capture_output)
        logger.debug(f"{original_input_path} - {capture_error}")

        files = [str(output_dir / f"{file_name}_chunk_{i:04d}{suffix}") for i in range(int(num_splits))]
        if video_audio_separate and suffix in self._VIDEO_SUFFIXES:
            audio_files = []
            for chunk in files:
                chunk_path = Path(chunk)
                audio_path = self.get_audio_from_video(chunk_path, chunk_path.with_suffix(".mp3"), cache_path)
                if audio_path is not None:
                    # Keep the returned list homogeneous: every entry is a str path.
                    audio_files.append(str(audio_path))
                else:
                    logger.error(f"Failed to extract audio from {chunk_path}")
            return files + audio_files
        return files

    def find_num_splits(
        self,
        file_size: int,
        sample_rate: float,
        duration: float,
        split_interval: int,
        split_type: SplitType,
    ):
        """
        Find the number of splits for a media file based on the split type and interval.

        file_size: int, size of the media file in bytes
        sample_rate: float, sample rate of the media file in samples per second
        duration: float, duration of the media file in seconds
        split_interval: int, size of the chunk to split the media file into depending on the split type
        split_type: SplitType, type of split to perform, either size, time, or frame

        Raises ValueError for an unknown split type, a non-positive `split_interval`, or when the
        value required by the selected split type (duration, sample rate) is not available.
        """
        if split_interval is None or split_interval <= 0:
            raise ValueError(f"split_interval must be a positive number, got {split_interval!r}")
        if split_type == SplitType.SIZE:
            return math.ceil(file_size / split_interval)
        if split_type == SplitType.TIME:
            if duration is None:
                raise ValueError("media duration is unknown, cannot split by time")
            return math.ceil(duration / split_interval)
        if split_type == SplitType.FRAME:
            if duration is None or not sample_rate:
                raise ValueError("media duration or sample rate is unknown, cannot split by frame")
            seconds_cap = split_interval / sample_rate
            return math.ceil(duration / seconds_cap)
        raise ValueError(f"Invalid split type: {split_type}")

    def _get_path_metadata(self, path: str = None):
        """
        Get the ffprobe metadata recorded by `split`.

        path: str, path to get the metadata for; if None, get the metadata for all paths

        Returns the whole `{path: probe}` mapping when `path` is None, otherwise the probe stored
        for that path (str and Path spellings of the same path match) or None when it is unknown.
        """
        if path is None:
            return self.path_metadata
        wanted = str(path)
        for known_path, metadata in self.path_metadata.items():
            if str(known_path) == wanted:
                return metadata
        return None
273
+
274
def load_data(queue: queue.Queue, paths: list[str], thread_stop: threading.Event):
    """
    Producer loop run in a background thread: read every file in `paths` and push its bytes to `queue`.

    queue: queue.Queue, destination of the payloads
    paths: list[str], files to read, in order
    thread_stop: threading.Event, set by the consumer to make the producer give up

    After the last file (or after an error) the `StopIteration` class is enqueued as end marker.
    A read error is reported by enqueueing a `RuntimeError` instance before that marker. Once
    `thread_stop` is set nothing more is enqueued, so the thread can never stay blocked on a
    full queue whose consumer has gone away.
    """
    # The `queue` parameter shadows the module inside this function, hence the local import.
    from queue import Full

    def _put(item) -> bool:
        """Enqueue `item` unless a stop is requested first; returns True when it was delivered."""
        while not thread_stop.is_set():
            try:
                queue.put(item, timeout=0.1)
                return True
            except Full:
                continue
        return False

    file = None
    logger.info(f"Loading data for {len(paths)} files")
    try:
        for file in paths:
            if thread_stop.is_set():
                return
            with open(file, "rb") as f:
                payload = f.read()
            if not _put(payload):
                return
    except Exception as e:
        logger.error(f"Error processing file {file} type: {type(file)} {e}")
        _put(RuntimeError(f"Error processing file {file}: {e}"))
    finally:
        _put(StopIteration)
288
+
289
class DataLoader:
    """
    Split a media file into chunks on construction and serve the chunk bytes, either by index
    or by iterating (a background thread then pre-loads chunks into a bounded queue).

    path: str, path of the media file to split
    output_dir: str, directory the chunks are written to
    split_type: SplitType, unit of `split_interval`
    split_interval: int, chunk size in the unit selected by `split_type`
    interface: LoaderInterface, splitter to use; defaults to a new `MediaInterface`
    size: int, size of the queue
    video_audio_separate: bool, forwarded to the interface's `split`
    audio_only: bool, forwarded to the interface's `split`
    """

    def __init__(
        self,
        path: str,
        output_dir: str,
        split_type: SplitType = SplitType.SIZE,
        split_interval: int = 450,
        interface: LoaderInterface = None,
        size: int = 2,
        video_audio_separate: bool = False,
        audio_only: bool = False,
    ):
        interface = interface if interface else MediaInterface()
        self.thread = None
        self.thread_stop = threading.Event()
        self.queue = queue.Queue(size)
        self.path = Path(path)
        self.output_dir = output_dir
        self.split_interval = split_interval
        self.interface = interface
        self.files_completed = []
        self.split_type = split_type
        self.video_audio_separate = video_audio_separate
        self.audio_only = audio_only
        # process the file immediately on instantiation
        self._process()

    def _process(self):
        """Split the input and record `(chunk_path, duration)` pairs in `files_completed`."""
        files_completed = self.interface.split(
            self.path,
            self.output_dir,
            split_interval=self.split_interval,
            split_type=self.split_type,
            video_audio_separate=self.video_audio_separate,
            audio_only=self.audio_only,
        )
        # get durations for the produced files
        durations = []
        for file in files_completed:
            _, _, duration = self.interface.probe_media(
                Path(file), split_interval=self.split_interval, split_type=self.split_type
            )
            durations.append(duration)

        self.files_completed = list(zip(files_completed, durations))

    def __next__(self):
        """Return the next chunk; re-raise an error reported by the loader thread."""
        payload = self.queue.get()
        if payload is StopIteration:
            raise StopIteration
        if isinstance(payload, Exception):
            # The loader thread reports failures by enqueueing the exception; surface it here
            # instead of handing it to the caller as if it were chunk data.
            raise payload
        return payload

    def _drain_queue(self):
        """Discard everything currently waiting in the queue."""
        try:
            while True:
                self.queue.get_nowait()
        except queue.Empty:
            pass

    def stop(self):
        """
        Reset the iterator by stopping the thread and clearing the queue.
        """
        thread = self.thread
        if thread is not None:
            self.thread_stop.set()
            # Drain while waiting: the producer may be blocked on a full queue, in which case a
            # plain join() before draining would never return.
            while thread.is_alive():
                self._drain_queue()
                thread.join(timeout=0.05)
            self.thread = None
        try:
            self._drain_queue()
        finally:
            self.thread_stop.clear()

    def __iter__(self):
        """Restart loading from the first chunk and return self as the iterator."""
        self.stop()
        self.thread_stop.clear()
        self.thread = threading.Thread(
            target=load_data,
            args=(
                self.queue,
                [file for file, _ in self.files_completed],
                self.thread_stop,
            ),
            daemon=True,
        )
        self.thread.start()
        return self

    def __len__(self):
        return len(self.files_completed)

    def __getitem__(self, index):
        """Read and return the bytes of the chunk at `index`."""
        file_path = self.files_completed[index]
        if isinstance(file_path, tuple):
            file_path = file_path[0]
        try:
            with open(file_path, "rb") as f:
                return f.read()
        except Exception as e:
            logger.error(f"Error getting item {index}: {e}")
            raise

    def __del__(self):
        # `queue` is the last of the attributes stop() needs; it is missing when __init__ failed early.
        if getattr(self, "queue", None) is not None:
            self.stop()

    def __enter__(self):
        """Allow `with DataLoader(...) as loader:`; the loader thread is stopped on exit."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()

    def get_metadata(self):
        """
        Get the metadata recorded by the interface for the paths it has split.
        """

        return self.interface._get_path_metadata()
@@ -0,0 +1,5 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2024, NVIDIA CORPORATION.
@@ -0,0 +1,38 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import langdetect
7
+
8
+ from nv_ingest_api.internal.enums.common import LanguageEnum
9
+ from nv_ingest_api.util.exception_handlers.detectors import langdetect_exception_handler
10
+
11
+
12
@langdetect_exception_handler
def detect_language(text):
    """
    Identify the language a piece of text is written in.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    LanguageEnum
        The `LanguageEnum` member for the detected language code, or `LanguageEnum.UNKNOWN`
        when detection fails or the code is not a known `LanguageEnum` value.
    """

    try:
        detected_code = langdetect.detect(text)

        if not LanguageEnum.has_value(detected_code):
            return LanguageEnum.UNKNOWN

        # Member names are the upper-cased code with "-" replaced by "_".
        member_name = detected_code.upper().replace("-", "_")
        return LanguageEnum[member_name]
    except langdetect.lang_detect_exception.LangDetectException:
        return LanguageEnum.UNKNOWN
File without changes
@@ -0,0 +1,72 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+ from datetime import datetime
8
+ from datetime import timezone
9
+ from typing import Any
10
+ from typing import Callable
11
+ from typing import Dict
12
+
13
+ from nv_ingest_api.util.converters import datetools
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def datetools_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Callable:
    """
    A decorator that handles exceptions for date-related functions.

    This decorator wraps a function that processes dates and catches any exception raised during its execution.
    When that happens the error is logged at debug level and the current UTC time is returned as an ISO 8601
    formatted string instead.

    Parameters
    ----------
    func : Callable
        The function to be decorated. This function is expected to handle date operations.

    kwargs : dict
        Accepted for backward compatibility; these values are not used by the decorator.

    Returns
    -------
    Callable
        The wrapped function that executes `func` with exception handling. The wrapper keeps the name,
        docstring and other metadata of `func`.

    Notes
    -----
    The wrapper itself does not raise: if the wrapped function fails, the current UTC time (with timezone
    information removed) is returned as an ISO 8601 formatted string.

    Examples
    --------
    >>> @datetools_exception_handler
    ... def parse_date(date_str):
    ...     return datetime.strptime(date_str, '%Y-%m-%d')
    ...
    >>> parse_date('2024-08-22')
    datetime.datetime(2024, 8, 22, 0, 0)

    If the input is invalid, the current UTC time without timezone information is returned:

    >>> parse_date('invalid-date')
    '2024-08-22T12:34:56'
    """
    import functools

    @functools.wraps(func)
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            log_error_message = f"Invalid date format: {e}"
            logger.debug(log_error_message)
            return datetools.remove_tz(datetime.now(timezone.utc)).isoformat()

    return inner_function