nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
6
|
+
|
|
7
|
+
from nv_ingest_api.util.dataloader.dataloader import DataLoader, MediaInterface
|
|
8
|
+
|
|
9
|
+
__all__ = ["DataLoader", "MediaInterface"]
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
import queue
|
|
9
|
+
import threading
|
|
10
|
+
import subprocess
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import math
|
|
14
|
+
import importlib.util
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
import os
|
|
19
|
+
import glob
|
|
20
|
+
|
|
21
|
+
from nv_ingest_api.util.system.hardware_info import SystemResourceProbe
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Verify that both the ffmpeg-python package and the ffmpeg binary are usable
# before enabling the loader classes below; on failure `ffmpeg` is set to None
# and DataLoader/MediaInterface are disabled.
try:
    # find_spec returns None (it does NOT raise) when the package is missing,
    # so the result must be checked explicitly; otherwise the `import ffmpeg`
    # in the else branch would raise an uncaught ImportError.
    if importlib.util.find_spec("ffmpeg") is None:
        raise ImportError("ffmpeg-python package is not installed")
    # check=True: a missing or broken ffmpeg binary raises instead of
    # silently returning a non-zero exit code.
    subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
except Exception:
    logger.error(
        "Unable to load the Dataloader, ffmpeg was not installed, "
        "please install it using `pip install ffmpeg-python` and `apt-get install ffmpeg`"
    )
    ffmpeg = None
else:
    import ffmpeg
|
|
36
|
+
|
|
37
|
+
if not ffmpeg:
|
|
38
|
+
DataLoader = None
|
|
39
|
+
MediaInterface = None
|
|
40
|
+
else:
|
|
41
|
+
|
|
42
|
+
class SplitType(Enum):
    """How `split_interval` is interpreted when chunking a media file."""

    # Interval counts frames; converted to seconds via the stream frame rate.
    FRAME = "frame"
    # Interval is a duration in seconds.
    TIME = "time"
    # Interval is a chunk size in bytes.
    SIZE = "size"
|
|
46
|
+
|
|
47
|
+
class LoaderInterface(ABC):
    """Abstract interface for media loaders consumed by DataLoader."""

    @abstractmethod
    def split(self, input_path: str, output_dir: str, split_interval: int = 0):
        """Split the file at input_path into chunk files under output_dir."""
        pass

    @abstractmethod
    def _get_path_metadata(self, path: str = None):
        """Return probe metadata collected for previously processed paths."""
        pass
|
|
56
|
+
|
|
57
|
+
def _probe(filename, format=None, file_handle=None, timeout=None, **kwargs):
    """
    Run ffprobe and return its JSON output as a dict.

    filename: path probed when no file_handle is given ("pipe:" otherwise)
    format: accepted for interface compatibility; NOTE(review): it is not
        forwarded to ffprobe -- confirm whether it should map to "-f"
    file_handle: raw media bytes fed to ffprobe via stdin when provided
    timeout: optional seconds forwarded to Popen.communicate
    kwargs: extra ffprobe flags, converted to command-line arguments
    Raises ffmpeg._run.Error when ffprobe exits with a non-zero status.
    """
    args = ["ffprobe", "-show_format", "-show_streams", "-of", "json"]
    args += ffmpeg._utils.convert_kwargs_to_cmd_line_args(kwargs)
    # Probe stdin when a handle is supplied, otherwise the named file.
    args += ["pipe:"] if file_handle else [filename]
    # Context manager closes the pipes even if communicate raises
    # (e.g. subprocess.TimeoutExpired); the original leaked them.
    with subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
        communicate_kwargs = {}
        if timeout is not None:
            communicate_kwargs["timeout"] = timeout
        if file_handle:
            # Original used `file_handle if file_handle else filename`,
            # which is always file_handle inside this branch.
            communicate_kwargs["input"] = file_handle
        out, err = p.communicate(**communicate_kwargs)
        if p.returncode != 0:
            raise ffmpeg._run.Error("ffprobe", out, err)
    return json.loads(out.decode("utf-8"))
|
|
74
|
+
|
|
75
|
+
def _get_audio_from_video(input_path: str, output_file: str, cache_path: str = None):
    """
    Extract the audio track of a video file into an mp3.

    Returns the output Path on success, or None if extraction fails.
    input_path: str, path to the video file
    output_file: str, path of the mp3 file to write (the original docstring
        named this output_dir; the parameter is a file path)
    cache_path: str, accepted for interface compatibility; currently unused
    """
    output_path = Path(output_file)
    # Ensure the destination directory exists before ffmpeg writes into it.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        # map="0:a" selects only the audio streams of the first input.
        # stdout/stderr are captured so failure diagnostics end up on the
        # raised ffmpeg.Error instead of the console.
        (
            ffmpeg.input(str(input_path))
            .output(str(output_path), acodec="libmp3lame", map="0:a")
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        return output_path
    except ffmpeg.Error as e:
        logging.error(f"FFmpeg error for file {input_path}: {e.stderr.decode()}")
        return None
|
|
96
|
+
|
|
97
|
+
def strip_audio_from_video_files(input_path: str, output_dir: str, cache_path: str = None, file_type=".mp4"):
    """
    Strip the audio from a series of video files and return paths to the new mp3 files.

    input_path: str, a single video file or a directory of them
    output_dir: str, path to the output directory
    cache_path: str, path to the cache directory (now actually forwarded to
        the extraction helper; it was previously accepted and ignored)
    file_type: str, extension used to glob when input_path is a directory
    Files whose extraction fails are logged by the worker and omitted from
    the result (previously they appeared as the literal string "None").
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    path = Path(input_path)
    # A single file is processed as-is; a directory is globbed by extension.
    files = [path] if path.is_file() else [Path(f) for f in glob.glob(os.path.join(path, f"*{file_type}"))]
    with ThreadPoolExecutor(max_workers=15) as executor:
        futures = [
            executor.submit(_get_audio_from_video, file, output_path / f"{file.stem}.mp3", cache_path)
            for file in files
        ]
        results = [future.result() for future in tqdm(futures)]
    # _get_audio_from_video returns None on failure; drop those entries.
    return [str(result) for result in results if result is not None]
|
|
115
|
+
|
|
116
|
+
class MediaInterface(LoaderInterface):
    """ffmpeg-backed LoaderInterface that splits audio/video files into chunks."""

    def __init__(self):
        # Maps input path -> ffprobe metadata dict for every file split so far.
        self.path_metadata = {}

    def probe_media(self, path_file: Path, split_interval: int, split_type: SplitType, file_handle=None):
        """
        Probe a media file with ffprobe and estimate its chunk count.

        path_file: Path, file to probe (its size/suffix are read from disk)
        split_interval: int, chunk size; meaning depends on split_type
        split_type: SplitType, how split_interval is interpreted
        file_handle: optional raw bytes; when given, ffprobe reads from stdin
        Returns (probe_json, num_splits, duration); entries are None on failure.
        """
        num_splits = None
        duration = None
        probe = None
        sample_rate = None
        try:
            file_size = path_file.stat().st_size  # in bytes
            if file_handle:
                probe = _probe("pipe:", format=path_file.suffix, file_handle=file_handle)
            else:
                probe = _probe(str(path_file), format=path_file.suffix)
            if probe["streams"][0]["codec_type"] == "video":
                # avg_frame_rate is a fraction like "30000/1001"; only the
                # numerator was used originally -- behavior preserved.
                sample_rate = float(probe["streams"][0]["avg_frame_rate"].split("/")[0])
                duration = float(probe["format"]["duration"])
            elif probe["streams"][0]["codec_type"] == "audio":
                sample_rate = float(probe["streams"][0]["sample_rate"])
                # Estimate duration from the on-disk size and container bitrate.
                bitrate = probe["format"]["bit_rate"]
                duration = (file_size * 8) / float(bitrate)
            num_splits = self.find_num_splits(file_size, sample_rate, duration, split_interval, split_type)
        except ffmpeg.Error as e:
            logging.error(f"FFmpeg error for file {path_file}: {e.stderr.decode()}")
        except ValueError as e:
            logging.error(f"Error finding number of splits for file {path_file}: {e}")
        return probe, num_splits, duration

    def get_audio_from_video(self, input_path: str, output_file: str, cache_path: str = None):
        """Extract a video's audio track into an mp3; returns the Path or None."""
        return _get_audio_from_video(input_path, output_file, cache_path)

    def split(
        self,
        input_path: str,
        output_dir: str,
        split_interval: int = 0,
        split_type: SplitType = SplitType.SIZE,
        cache_path: str = None,
        video_audio_separate: bool = False,
        audio_only: bool = False,
    ):
        """
        Split a media file into smaller chunks of `split_interval` size. If
        video_audio_separate is True and the file is a video, the audio is
        also extracted from each chunk into separate mp3 files, and the
        result is video chunks + audio chunks. If audio_only is True, the
        audio is extracted first and only audio chunks are returned.

        input_path: str, path to the media file
        output_dir: str, path to the output directory
        split_interval: chunk size; meaning depends on split_type
        split_type: SplitType, either SIZE, TIME, or FRAME
        cache_path: str, optional cache directory (defaults to output_dir)
        video_audio_separate: bool, emit per-chunk mp3 audio alongside video
        audio_only: bool, extract audio before splitting video inputs
        Returns a list of chunk file paths ([] on ffmpeg failure).
        """
        files_to_remove = []
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        original_input_path = input_path
        if audio_only and Path(input_path).suffix in [".mp4", ".mov", ".avi", ".mkv"]:
            # BUG FIX: input_path is a str here, so the original
            # `input_path.stem` raised AttributeError; wrap in Path first.
            input_path = self.get_audio_from_video(input_path, output_dir / f"{Path(input_path).stem}.mp3")
            if input_path is None:
                # Extraction failed; the original would crash on Path(None).
                logging.error(f"Failed to extract audio from {original_input_path}")
                return []
            files_to_remove.append(input_path)
        path_file = Path(input_path)
        file_name = path_file.stem
        suffix = path_file.suffix
        output_pattern = output_dir / f"{file_name}_chunk_%04d{suffix}"

        num_splits = 0
        cache_path = cache_path if cache_path else output_dir
        # BUG FIX: these were referenced in the except handler while possibly
        # unbound (NameError masked the real ffmpeg error).
        capture_output = None
        capture_error = None
        try:
            probe, num_splits, duration = self.probe_media(path_file, split_interval, split_type)
            if not num_splits or duration is None:
                # Probe failed (it already logged why); avoid TypeError below.
                logging.error(f"Unable to probe {original_input_path}; skipping split")
                return []
            segment_time = math.ceil(duration / num_splits)
            output_kwargs = {
                "f": "segment",
                "segment_time": segment_time,
                "c": "copy",
                "map": "0",
                # use 20% of the effective cores, but at least 4 threads
                "threads": int(max(SystemResourceProbe().get_effective_cores() * 0.2, 4)),
            }
            if suffix == ".mp4":
                # Force keyframes on segment boundaries so stream-copied
                # chunks start cleanly.
                output_kwargs.update(
                    {
                        "force_key_frames": f"expr:gte(t,n_forced*{segment_time})",
                        "crf": 22,
                        "g": 50,
                        "sc_threshold": 0,
                    }
                )
            capture_output, capture_error = (
                ffmpeg.input(str(input_path))
                .output(str(output_pattern), **output_kwargs)
                .run(capture_stdout=True, capture_stderr=True)
            )
            logging.debug(f"Split {input_path} into {num_splits} chunks")
            self.path_metadata[input_path] = probe
            logging.debug(capture_output)
            logging.debug(f"{original_input_path} - {capture_error}")
        except ffmpeg.Error as e:
            logging.error(
                f"FFmpeg error for file {original_input_path}: {e.stderr.decode()} {capture_output} {capture_error}"
            )
            return []
        files = [str(output_dir / f"{file_name}_chunk_{i:04d}{suffix}") for i in range(int(num_splits))]
        if video_audio_separate and suffix in [".mp4", ".mov", ".avi", ".mkv"]:
            video_audio_files = []
            for file in files:
                file = Path(file)
                audio_path = self.get_audio_from_video(file, file.with_suffix(".mp3"), cache_path)
                if audio_path is not None:
                    video_audio_files.append(audio_path)
                else:
                    logging.error(f"Failed to extract audio from {file}")
            return files + video_audio_files
        # Clean up intermediate files (e.g. the audio_only extraction source).
        for to_remove in files_to_remove:
            to_remove = Path(to_remove)
            if to_remove.is_file():
                logger.debug(f"Removing file {to_remove}")
                to_remove.unlink()
        return files

    def find_num_splits(
        self,
        file_size: int,
        sample_rate: float,
        duration: float,
        split_interval: int,
        split_type: SplitType,
    ):
        """
        Find the number of splits for a media file based on split type/interval.

        file_size: int, size of the media file in bytes
        sample_rate: float, frame rate (video) or sample rate (audio)
        duration: float, duration of the media file in seconds
        split_interval: int, chunk size; meaning depends on split_type
        split_type: SplitType, either SIZE, TIME, or FRAME
        Raises ValueError for a non-positive interval or unknown split type
        (probe_media catches ValueError and logs it).
        """
        if split_interval <= 0:
            # BUG FIX: the default interval of 0 previously caused an
            # uncaught ZeroDivisionError instead of the handled ValueError.
            raise ValueError(f"split_interval must be positive, got {split_interval}")
        if split_type == SplitType.SIZE:
            return math.ceil(file_size / split_interval)
        elif split_type == SplitType.TIME:
            return math.ceil(duration / split_interval)
        elif split_type == SplitType.FRAME:
            # Convert a frame count into seconds via the frame rate.
            seconds_cap = split_interval / sample_rate
            return math.ceil(duration / seconds_cap)
        else:
            raise ValueError(f"Invalid split type: {split_type}")

    def _get_path_metadata(self):
        """
        Return the probe metadata collected so far, keyed by input path.
        """
        return self.path_metadata
|
|
273
|
+
|
|
274
|
+
def load_data(queue: queue.Queue, paths: list[str], thread_stop: threading.Event):
    """
    Worker target: read each file in paths and push its raw bytes onto queue.

    A StopIteration sentinel is always enqueued last (the finally clause runs
    even on early stop or error). On a read failure a RuntimeError instance is
    enqueued before the sentinel so the consumer can surface it.

    queue: queue.Queue, destination for file payloads
    paths: list[str], files to read, in order
    thread_stop: threading.Event, set by the owner to request early exit
    """
    # Bind a logger locally; the original mixed `logger.info` with bare
    # `logging.error`, splitting output across two loggers.
    log = logging.getLogger(__name__)
    file = None
    log.info(f"Loading data for {len(paths)} files")
    try:
        for file in paths:
            # Checked once per file so stop() takes effect between reads.
            if thread_stop.is_set():
                return
            with open(file, "rb") as f:
                queue.put(f.read())
    except Exception as e:
        log.error(f"Error processing file {file} type: {type(file)} {e}")
        queue.put(RuntimeError(f"Error processing file {file}: {e}"))
    finally:
        # End-of-stream sentinel: always delivered.
        queue.put(StopIteration)
|
|
288
|
+
|
|
289
|
+
class DataLoader:
    """
    DataLoader splits a media file and streams each chunk's bytes via a queue.

    On construction the input is immediately split through the provided
    LoaderInterface; iterating the instance then starts a background thread
    that reads each chunk file and pushes its bytes through a bounded queue.

    path: str, media file to split
    output_dir: str, where chunk files are written
    split_type: SplitType, how split_interval is interpreted
    split_interval: int, chunk size per split_type
    interface: LoaderInterface, defaults to a new MediaInterface
    size: int, queue depth (number of chunk payloads buffered in memory)
    video_audio_separate: bool, also emit per-chunk mp3 audio for videos
    audio_only: bool, extract audio before splitting video inputs
    """

    def __init__(
        self,
        path: str,
        output_dir: str,
        split_type: SplitType = SplitType.SIZE,
        split_interval: int = 450,
        interface: LoaderInterface = None,
        size: int = 2,
        video_audio_separate: bool = False,
        audio_only: bool = False,
    ):
        interface = interface if interface else MediaInterface()
        self.thread = None
        self.thread_stop = threading.Event()
        self.queue = queue.Queue(size)
        self.path = Path(path)
        self.output_dir = output_dir
        self.split_interval = split_interval
        self.interface = interface
        self.files_completed = []
        self.split_type = split_type
        self.video_audio_separate = video_audio_separate
        self.audio_only = audio_only
        # process the file immediately on instantiation
        self._process()

    def _process(self):
        """Split the input and record (chunk_path, duration) pairs."""
        files_completed = self.interface.split(
            self.path,
            self.output_dir,
            split_interval=self.split_interval,
            split_type=self.split_type,
            video_audio_separate=self.video_audio_separate,
            audio_only=self.audio_only,
        )
        # Probe each produced chunk for its duration.
        durations = []
        for file in files_completed:
            _, _, duration = self.interface.probe_media(
                Path(file), split_interval=self.split_interval, split_type=self.split_type
            )
            durations.append(duration)
        self.files_completed = list(zip(files_completed, durations))

    def __next__(self):
        payload = self.queue.get()
        # Identity check: the worker enqueues the StopIteration class itself
        # as the end-of-stream sentinel (`is`, not `==`, avoids odd __eq__).
        if payload is StopIteration:
            raise payload
        if isinstance(payload, Exception):
            # The worker enqueues read failures as exception instances;
            # raise them instead of handing them to the caller as data.
            raise payload
        return payload

    def stop(self):
        """
        Reset the iterator by stopping the worker thread and clearing the queue.
        """
        if self.thread:
            self.thread_stop.set()
            # DEADLOCK FIX: drain while the worker is still alive. The
            # original joined first, which hangs forever if the worker is
            # blocked on queue.put against the bounded queue.
            while self.thread.is_alive():
                try:
                    self.queue.get_nowait()
                except queue.Empty:
                    pass
                self.thread.join(timeout=0.01)
            self.thread = None
        # Discard anything left over, including the sentinel.
        try:
            while True:
                self.queue.get_nowait()
        except queue.Empty:
            pass
        finally:
            self.thread_stop.clear()

    def __iter__(self):
        # Restart cleanly: stop any previous worker, then launch a fresh one.
        self.stop()
        self.thread_stop.clear()
        self.thread = threading.Thread(
            target=load_data,
            args=(
                self.queue,
                [file for file, _ in self.files_completed],
                self.thread_stop,
            ),
            daemon=True,
        )
        self.thread.start()
        return self

    def __len__(self):
        return len(self.files_completed)

    def __getitem__(self, index):
        """Read and return the raw bytes of the chunk at `index`."""
        entry = self.files_completed[index]
        # Entries are (path, duration) tuples; the original performed this
        # same tuple check twice.
        file_path = entry[0] if isinstance(entry, tuple) else entry
        try:
            with open(file_path, "rb") as f:
                return f.read()
        except Exception as e:
            logging.error(f"Error getting item {index}: {e}")
            raise

    def __enter__(self):
        # Added so the pre-existing __exit__ forms a usable context manager.
        return self

    def __del__(self):
        # Best-effort cleanup; __init__ may have failed before attributes
        # existed, and raising from __del__ is never useful.
        try:
            self.stop()
        except Exception:
            pass

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()

    def get_metadata(self):
        """
        Return the probe metadata collected by the interface, keyed by
        input path.
        """
        return self.interface._get_path_metadata()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import langdetect
|
|
7
|
+
|
|
8
|
+
from nv_ingest_api.internal.enums.common import LanguageEnum
|
|
9
|
+
from nv_ingest_api.util.exception_handlers.detectors import langdetect_exception_handler
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@langdetect_exception_handler
def detect_language(text):
    """
    Detect the spoken language of a string of text.

    Parameters
    ----------
    text : str
        The text whose language should be detected.

    Returns
    -------
    LanguageEnum
        The detected language code, or ``LanguageEnum.UNKNOWN`` when
        detection fails or the code is not a member of the enum.
    """
    try:
        detected = langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return LanguageEnum.UNKNOWN

    # Guard clause: unrecognized codes map to UNKNOWN.
    if not LanguageEnum.has_value(detected):
        return LanguageEnum.UNKNOWN

    # Enum member names are upper-case with "_" in place of "-" (e.g. ZH_CN).
    return LanguageEnum[detected.upper().replace("-", "_")]
|
|
File without changes
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from datetime import timezone
|
|
9
|
+
from typing import Any
|
|
10
|
+
from typing import Callable
|
|
11
|
+
from typing import Dict
|
|
12
|
+
|
|
13
|
+
from nv_ingest_api.util.converters import datetools
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def datetools_exception_handler(func: Callable, **kwargs: Dict[str, Any]) -> Callable:
    """
    A decorator that handles exceptions for date-related functions.

    This decorator wraps a function that processes dates and catches any
    exception that occurs during its execution. If an exception is raised,
    it logs a debug message and returns the current UTC time (with timezone
    information removed) as an ISO 8601 formatted string.

    Parameters
    ----------
    func : Callable
        The function to be decorated. This function is expected to handle
        date operations.
    kwargs : dict
        Accepted for interface compatibility; not used by the wrapper.

    Returns
    -------
    Callable
        The wrapped function that executes `func` with exception handling.

    Examples
    --------
    >>> @datetools_exception_handler
    ... def parse_date(date_str):
    ...     return datetime.strptime(date_str, '%Y-%m-%d')
    ...
    >>> parse_date('2024-08-22')
    datetime.datetime(2024, 8, 22, 0, 0)

    If the input is invalid, the current UTC time without timezone
    information is returned:

    >>> parse_date('invalid-date')
    '2024-08-22T12:34:56'

    Raises
    ------
    Exception
        Any exception raised by the wrapped function is caught, logged, and
        handled by returning the current UTC time.
    """
    # Local import keeps this fix self-contained within the block.
    from functools import wraps

    # BUG FIX: without functools.wraps the wrapper clobbered the wrapped
    # function's __name__/__doc__, breaking introspection and log output.
    @wraps(func)
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            logger.debug(f"Invalid date format: {e}")
            return datetools.remove_tz(datetime.now(timezone.utc)).isoformat()

    return inner_function
|