torchaudio 2.8.0__cp310-cp310-win_amd64.whl → 2.9.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchaudio might be problematic. (The original page links to more details.)

Files changed (92):
  1. torchaudio/__init__.py +179 -39
  2. torchaudio/_extension/__init__.py +1 -14
  3. torchaudio/_extension/utils.py +0 -47
  4. torchaudio/_internal/module_utils.py +12 -3
  5. torchaudio/_torchcodec.py +73 -85
  6. torchaudio/datasets/cmuarctic.py +1 -1
  7. torchaudio/datasets/utils.py +1 -1
  8. torchaudio/functional/__init__.py +0 -2
  9. torchaudio/functional/_alignment.py +1 -1
  10. torchaudio/functional/filtering.py +70 -55
  11. torchaudio/functional/functional.py +26 -60
  12. torchaudio/lib/_torchaudio.pyd +0 -0
  13. torchaudio/lib/libtorchaudio.pyd +0 -0
  14. torchaudio/models/decoder/__init__.py +14 -2
  15. torchaudio/models/decoder/_ctc_decoder.py +6 -6
  16. torchaudio/models/decoder/_cuda_ctc_decoder.py +1 -1
  17. torchaudio/models/squim/objective.py +2 -2
  18. torchaudio/pipelines/_source_separation_pipeline.py +1 -1
  19. torchaudio/pipelines/_squim_pipeline.py +2 -2
  20. torchaudio/pipelines/_tts/utils.py +1 -1
  21. torchaudio/pipelines/rnnt_pipeline.py +4 -4
  22. torchaudio/transforms/__init__.py +1 -0
  23. torchaudio/transforms/_transforms.py +2 -2
  24. torchaudio/utils/__init__.py +2 -9
  25. torchaudio/utils/download.py +1 -3
  26. torchaudio/version.py +2 -2
  27. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/METADATA +8 -11
  28. torchaudio-2.9.0.dist-info/RECORD +85 -0
  29. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/top_level.txt +0 -1
  30. torchaudio/_backend/__init__.py +0 -61
  31. torchaudio/_backend/backend.py +0 -53
  32. torchaudio/_backend/common.py +0 -52
  33. torchaudio/_backend/ffmpeg.py +0 -334
  34. torchaudio/_backend/soundfile.py +0 -54
  35. torchaudio/_backend/soundfile_backend.py +0 -457
  36. torchaudio/_backend/sox.py +0 -91
  37. torchaudio/_backend/utils.py +0 -350
  38. torchaudio/backend/__init__.py +0 -8
  39. torchaudio/backend/_no_backend.py +0 -25
  40. torchaudio/backend/_sox_io_backend.py +0 -294
  41. torchaudio/backend/common.py +0 -13
  42. torchaudio/backend/no_backend.py +0 -14
  43. torchaudio/backend/soundfile_backend.py +0 -14
  44. torchaudio/backend/sox_io_backend.py +0 -14
  45. torchaudio/io/__init__.py +0 -20
  46. torchaudio/io/_effector.py +0 -347
  47. torchaudio/io/_playback.py +0 -72
  48. torchaudio/kaldi_io.py +0 -150
  49. torchaudio/prototype/__init__.py +0 -0
  50. torchaudio/prototype/datasets/__init__.py +0 -4
  51. torchaudio/prototype/datasets/musan.py +0 -68
  52. torchaudio/prototype/functional/__init__.py +0 -26
  53. torchaudio/prototype/functional/_dsp.py +0 -441
  54. torchaudio/prototype/functional/_rir.py +0 -382
  55. torchaudio/prototype/functional/functional.py +0 -193
  56. torchaudio/prototype/models/__init__.py +0 -39
  57. torchaudio/prototype/models/_conformer_wav2vec2.py +0 -801
  58. torchaudio/prototype/models/_emformer_hubert.py +0 -337
  59. torchaudio/prototype/models/conv_emformer.py +0 -529
  60. torchaudio/prototype/models/hifi_gan.py +0 -342
  61. torchaudio/prototype/models/rnnt.py +0 -717
  62. torchaudio/prototype/models/rnnt_decoder.py +0 -402
  63. torchaudio/prototype/pipelines/__init__.py +0 -21
  64. torchaudio/prototype/pipelines/_vggish/__init__.py +0 -7
  65. torchaudio/prototype/pipelines/_vggish/_vggish_impl.py +0 -236
  66. torchaudio/prototype/pipelines/_vggish/_vggish_pipeline.py +0 -83
  67. torchaudio/prototype/pipelines/hifigan_pipeline.py +0 -233
  68. torchaudio/prototype/pipelines/rnnt_pipeline.py +0 -58
  69. torchaudio/prototype/transforms/__init__.py +0 -9
  70. torchaudio/prototype/transforms/_transforms.py +0 -461
  71. torchaudio/sox_effects/__init__.py +0 -10
  72. torchaudio/sox_effects/sox_effects.py +0 -275
  73. torchaudio/utils/ffmpeg_utils.py +0 -11
  74. torchaudio/utils/sox_utils.py +0 -118
  75. torchaudio-2.8.0.dist-info/RECORD +0 -145
  76. torio/__init__.py +0 -8
  77. torio/_extension/__init__.py +0 -13
  78. torio/_extension/utils.py +0 -147
  79. torio/io/__init__.py +0 -9
  80. torio/io/_streaming_media_decoder.py +0 -977
  81. torio/io/_streaming_media_encoder.py +0 -502
  82. torio/lib/__init__.py +0 -0
  83. torio/lib/_torio_ffmpeg4.pyd +0 -0
  84. torio/lib/_torio_ffmpeg5.pyd +0 -0
  85. torio/lib/_torio_ffmpeg6.pyd +0 -0
  86. torio/lib/libtorio_ffmpeg4.pyd +0 -0
  87. torio/lib/libtorio_ffmpeg5.pyd +0 -0
  88. torio/lib/libtorio_ffmpeg6.pyd +0 -0
  89. torio/utils/__init__.py +0 -4
  90. torio/utils/ffmpeg_utils.py +0 -275
  91. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/WHEEL +0 -0
  92. {torchaudio-2.8.0.dist-info → torchaudio-2.9.0.dist-info}/licenses/LICENSE +0 -0
File 80: torio/io/_streaming_media_decoder.py (removed in 2.9.0; all 977 lines deleted)
@@ -1,977 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- from dataclasses import dataclass
5
- from pathlib import Path
6
- from typing import BinaryIO, Dict, Iterator, Optional, Tuple, TypeVar, Union
7
-
8
- import torch
9
- import torio
10
- from torch.utils._pytree import tree_map
11
-
12
- ffmpeg_ext = torio._extension.lazy_import_ffmpeg_ext()
13
-
14
- __all__ = [
15
- "StreamingMediaDecoder",
16
- ]
17
-
18
-
19
- @dataclass
20
- class SourceStream:
21
- """The metadata of a source stream, returned by :meth:`~torio.io.StreamingMediaDecoder.get_src_stream_info`.
22
-
23
- This class is used when representing streams of media type other than `audio` or `video`.
24
-
25
- When source stream is `audio` or `video` type, :class:`SourceAudioStream` and
26
- :class:`SourceVideoStream`, which reports additional media-specific attributes,
27
- are used respectively.
28
- """
29
-
30
- media_type: str
31
- """The type of the stream.
32
- One of ``"audio"``, ``"video"``, ``"data"``, ``"subtitle"``, ``"attachment"`` and empty string.
33
-
34
- .. note::
35
- Only audio and video streams are supported for output.
36
- .. note::
37
- Still images, such as PNG and JPEG formats are reported as video.
38
- """
39
- codec: str
40
- """Short name of the codec. Such as ``"pcm_s16le"`` and ``"h264"``."""
41
- codec_long_name: str
42
- """Detailed name of the codec.
43
-
44
- Such as "`PCM signed 16-bit little-endian`" and "`H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10`".
45
- """
46
- format: Optional[str]
47
- """Media format. Such as ``"s16"`` and ``"yuv420p"``.
48
-
49
- Commonly found audio values are;
50
-
51
- - ``"u8"``, ``"u8p"``: Unsigned 8-bit unsigned interger.
52
- - ``"s16"``, ``"s16p"``: 16-bit signed integer.
53
- - ``"s32"``, ``"s32p"``: 32-bit signed integer.
54
- - ``"flt"``, ``"fltp"``: 32-bit floating-point.
55
-
56
- .. note::
57
-
58
- `p` at the end indicates the format is `planar`.
59
- Channels are grouped together instead of interspersed in memory.
60
- """
61
- bit_rate: Optional[int]
62
- """Bit rate of the stream in bits-per-second.
63
- This is an estimated values based on the initial few frames of the stream.
64
- For container formats and variable bit rate, it can be 0.
65
- """
66
- num_frames: Optional[int]
67
- """The number of frames in the stream"""
68
- bits_per_sample: Optional[int]
69
- """This is the number of valid bits in each output sample.
70
- For compressed format, it can be 0.
71
- """
72
- metadata: Dict[str, str]
73
- """Metadata attached to the source stream."""
74
-
75
-
76
- @dataclass
77
- class SourceAudioStream(SourceStream):
78
- """The metadata of an audio source stream, returned by :meth:`~torio.io.StreamingMediaDecoder.get_src_stream_info`.
79
-
80
- This class is used when representing audio stream.
81
-
82
- In addition to the attributes reported by :class:`SourceStream`,
83
- the following attributes are reported.
84
- """
85
-
86
- sample_rate: float
87
- """Sample rate of the audio."""
88
- num_channels: int
89
- """Number of channels."""
90
-
91
-
92
- @dataclass
93
- class SourceVideoStream(SourceStream):
94
- """The metadata of a video source stream, returned by :meth:`~torio.io.StreamingMediaDecoder.get_src_stream_info`.
95
-
96
- This class is used when representing video stream.
97
-
98
- In addition to the attributes reported by :class:`SourceStream`,
99
- the following attributes are reported.
100
- """
101
-
102
- width: int
103
- """Width of the video frame in pixel."""
104
- height: int
105
- """Height of the video frame in pixel."""
106
- frame_rate: float
107
- """Frame rate."""
108
-
109
-
110
- def _parse_si(i):
111
- media_type = i.media_type
112
- if media_type == "audio":
113
- return SourceAudioStream(
114
- media_type=i.media_type,
115
- codec=i.codec_name,
116
- codec_long_name=i.codec_long_name,
117
- format=i.format,
118
- bit_rate=i.bit_rate,
119
- num_frames=i.num_frames,
120
- bits_per_sample=i.bits_per_sample,
121
- metadata=i.metadata,
122
- sample_rate=i.sample_rate,
123
- num_channels=i.num_channels,
124
- )
125
- if media_type == "video":
126
- return SourceVideoStream(
127
- media_type=i.media_type,
128
- codec=i.codec_name,
129
- codec_long_name=i.codec_long_name,
130
- format=i.format,
131
- bit_rate=i.bit_rate,
132
- num_frames=i.num_frames,
133
- bits_per_sample=i.bits_per_sample,
134
- metadata=i.metadata,
135
- width=i.width,
136
- height=i.height,
137
- frame_rate=i.frame_rate,
138
- )
139
- return SourceStream(
140
- media_type=i.media_type,
141
- codec=i.codec_name,
142
- codec_long_name=i.codec_long_name,
143
- format=None,
144
- bit_rate=None,
145
- num_frames=None,
146
- bits_per_sample=None,
147
- metadata=i.metadata,
148
- )
149
-
150
-
151
- @dataclass
152
- class OutputStream:
153
- """Output stream configured on :class:`StreamingMediaDecoder`,
154
- returned by :meth:`~torio.io.StreamingMediaDecoder.get_out_stream_info`.
155
- """
156
-
157
- source_index: int
158
- """Index of the source stream that this output stream is connected."""
159
- filter_description: str
160
- """Description of filter graph applied to the source stream."""
161
- media_type: str
162
- """The type of the stream. ``"audio"`` or ``"video"``."""
163
- format: str
164
- """Media format. Such as ``"s16"`` and ``"yuv420p"``.
165
-
166
- Commonly found audio values are;
167
-
168
- - ``"u8"``, ``"u8p"``: Unsigned 8-bit unsigned interger.
169
- - ``"s16"``, ``"s16p"``: 16-bit signed integer.
170
- - ``"s32"``, ``"s32p"``: 32-bit signed integer.
171
- - ``"flt"``, ``"fltp"``: 32-bit floating-point.
172
-
173
- .. note::
174
-
175
- `p` at the end indicates the format is `planar`.
176
- Channels are grouped together instead of interspersed in memory."""
177
-
178
-
179
- @dataclass
180
- class OutputAudioStream(OutputStream):
181
- """Information about an audio output stream configured with
182
- :meth:`~torio.io.StreamingMediaDecoder.add_audio_stream` or
183
- :meth:`~torio.io.StreamingMediaDecoder.add_basic_audio_stream`.
184
-
185
- In addition to the attributes reported by :class:`OutputStream`,
186
- the following attributes are reported.
187
- """
188
-
189
- sample_rate: float
190
- """Sample rate of the audio."""
191
- num_channels: int
192
- """Number of channels."""
193
-
194
-
195
- @dataclass
196
- class OutputVideoStream(OutputStream):
197
- """Information about a video output stream configured with
198
- :meth:`~torio.io.StreamingMediaDecoder.add_video_stream` or
199
- :meth:`~torio.io.StreamingMediaDecoder.add_basic_video_stream`.
200
-
201
- In addition to the attributes reported by :class:`OutputStream`,
202
- the following attributes are reported.
203
- """
204
-
205
- width: int
206
- """Width of the video frame in pixel."""
207
- height: int
208
- """Height of the video frame in pixel."""
209
- frame_rate: float
210
- """Frame rate."""
211
-
212
-
213
- def _parse_oi(i):
214
- media_type = i.media_type
215
- if media_type == "audio":
216
- return OutputAudioStream(
217
- source_index=i.source_index,
218
- filter_description=i.filter_description,
219
- media_type=i.media_type,
220
- format=i.format,
221
- sample_rate=i.sample_rate,
222
- num_channels=i.num_channels,
223
- )
224
- if media_type == "video":
225
- return OutputVideoStream(
226
- source_index=i.source_index,
227
- filter_description=i.filter_description,
228
- media_type=i.media_type,
229
- format=i.format,
230
- width=i.width,
231
- height=i.height,
232
- frame_rate=i.frame_rate,
233
- )
234
- raise ValueError(f"Unexpected media_type: {i.media_type}({i})")
235
-
236
-
237
- def _get_afilter_desc(sample_rate: Optional[int], fmt: Optional[str], num_channels: Optional[int]):
238
- descs = []
239
- if sample_rate is not None:
240
- descs.append(f"aresample={sample_rate}")
241
- if fmt is not None or num_channels is not None:
242
- parts = []
243
- if fmt is not None:
244
- parts.append(f"sample_fmts={fmt}")
245
- if num_channels is not None:
246
- parts.append(f"channel_layouts={num_channels}c")
247
- descs.append(f"aformat={':'.join(parts)}")
248
- return ",".join(descs) if descs else None
249
-
250
-
251
- def _get_vfilter_desc(frame_rate: Optional[float], width: Optional[int], height: Optional[int], fmt: Optional[str]):
252
- descs = []
253
- if frame_rate is not None:
254
- descs.append(f"fps={frame_rate}")
255
- scales = []
256
- if width is not None:
257
- scales.append(f"width={width}")
258
- if height is not None:
259
- scales.append(f"height={height}")
260
- if scales:
261
- descs.append(f"scale={':'.join(scales)}")
262
- if fmt is not None:
263
- descs.append(f"format=pix_fmts={fmt}")
264
- return ",".join(descs) if descs else None
265
-
266
-
267
- # Base class for ChunkTensor
268
- # Based off of TrivialTensorViaComposition
269
- # https://github.com/albanD/subclass_zoo/blob/0eeb1d68fb59879029c610bc407f2997ae43ba0a/trivial_tensors.py#L83
270
- class ChunkTensorBase(torch.Tensor):
271
- __torch_function__ = torch._C._disabled_torch_function_impl
272
-
273
- @staticmethod
274
- def __new__(cls, _elem, *_):
275
- return super().__new__(cls, _elem)
276
-
277
- @classmethod
278
- def __torch_dispatch__(cls, func, _, args=(), kwargs=None):
279
- def unwrap(t):
280
- return t._elem if isinstance(t, cls) else t
281
-
282
- return func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))
283
-
284
-
285
- @dataclass
286
- class ChunkTensor(ChunkTensorBase):
287
- """Decoded media frames with metadata.
288
-
289
- The instance of this class represents the decoded video/audio frames with
290
- metadata, and the instance itself behave like :py:class:`~torch.Tensor`.
291
-
292
- Client codes can pass instance of this class as-if it's
293
- :py:class:`~torch.Tensor` class, or call the methods defined on
294
- :py:class:`~torch.Tensor` class.
295
-
296
- Example:
297
- >>> # Define input streams
298
- >>> reader = StreamingMediaDecoder(...)
299
- >>> reader.add_audio_stream(frames_per_chunk=4000, sample_rate=8000)
300
- >>> reader.add_video_stream(frames_per_chunk=7, frame_rate=28)
301
- >>> # Decode the streams and fetch frames
302
- >>> reader.fill_buffer()
303
- >>> audio_chunk, video_chunk = reader.pop_chunks()
304
-
305
- >>> # Access metadata
306
- >>> (audio_chunk.pts, video_chunks.pts)
307
- (0.0, 0.0)
308
- >>>
309
- >>> # The second time the PTS is different
310
- >>> reader.fill_buffer()
311
- >>> audio_chunk, video_chunk = reader.pop_chunks()
312
- >>> (audio_chunk.pts, video_chunks.pts)
313
- (0.5, 0.25)
314
-
315
- >>> # Call PyTorch ops on chunk
316
- >>> audio_chunk.shape
317
- torch.Size([4000, 2]
318
- >>> power = torch.pow(video_chunk, 2)
319
- >>>
320
- >>> # the result is a plain torch.Tensor class
321
- >>> type(power)
322
- <class 'torch.Tensor'>
323
- >>>
324
- >>> # Metadata is not available on the result
325
- >>> power.pts
326
- AttributeError: 'Tensor' object has no attribute 'pts'
327
- """
328
-
329
- # Keep it private for now
330
- _elem: torch.Tensor
331
-
332
- pts: float
333
- """Presentation time stamp of the first frame in the chunk.
334
-
335
- Unit: second.
336
- """
337
-
338
-
339
- def _format_doc(**kwargs):
340
- def decorator(obj):
341
- obj.__doc__ = obj.__doc__.format(**kwargs)
342
- return obj
343
-
344
- return decorator
345
-
346
-
347
- _frames_per_chunk = """Number of frames returned as one chunk.
348
- If the source stream is exhausted before enough frames are buffered,
349
- then the chunk is returned as-is.
350
-
351
- Providing ``-1`` disables chunking and :py:func:`pop_chunks` method
352
- will concatenate all the buffered frames and return it."""
353
-
354
- _buffer_chunk_size = """Internal buffer size.
355
- When the number of chunks buffered exceeds this number, old frames are
356
- dropped. For example, if ``frames_per_chunk`` is 5 and ``buffer_chunk_size`` is
357
- 3, then frames older than ``15`` are dropped.
358
- Providing ``-1`` disables this behavior.
359
-
360
- Default: ``3``."""
361
-
362
- _audio_stream_index = """The source audio stream index.
363
- If omitted, :py:attr:`default_audio_stream` is used."""
364
-
365
-
366
- _video_stream_index = """The source video stream index.
367
- If omitted, :py:attr:`default_video_stream` is used."""
368
-
369
- _decoder = """The name of the decoder to be used.
370
- When provided, use the specified decoder instead of the default one.
371
-
372
- To list the available decoders, please use
373
- :py:func:`~torio.utils.ffmpeg_utils.get_audio_decoders` for audio, and
374
- :py:func:`~torio.utils.ffmpeg_utils.get_video_decoders` for video.
375
-
376
- Default: ``None``."""
377
-
378
- _decoder_option = """Options passed to decoder.
379
- Mapping from str to str. (Default: ``None``)
380
-
381
- To list decoder options for a decoder, you can use
382
- ``ffmpeg -h decoder=<DECODER>`` command.
383
-
384
- |
385
-
386
- In addition to decoder-specific options, you can also pass options related
387
- to multithreading. They are effective only if the decoder support them.
388
- If neither of them are provided, StreamingMediaDecoder defaults to single thread.
389
-
390
- ``"threads"``: The number of threads (in str).
391
- Providing the value ``"0"`` will let FFmpeg decides based on its heuristics.
392
-
393
- ``"thread_type"``: Which multithreading method to use.
394
- The valid values are ``"frame"`` or ``"slice"``.
395
- Note that each decoder supports different set of methods.
396
- If not provided, a default value is used.
397
-
398
- - ``"frame"``: Decode more than one frame at once.
399
- Each thread handles one frame.
400
- This will increase decoding delay by one frame per thread
401
- - ``"slice"``: Decode more than one part of a single frame at once.
402
-
403
- |
404
- """
405
-
406
-
407
- _hw_accel = """Enable hardware acceleration.
408
-
409
- When video is decoded on CUDA hardware, for example
410
- `decoder="h264_cuvid"`, passing CUDA device indicator to `hw_accel`
411
- (i.e. `hw_accel="cuda:0"`) will make StreamingMediaDecoder place the resulting
412
- frames directly on the specified CUDA device as CUDA tensor.
413
-
414
- If `None`, the frame will be moved to CPU memory.
415
- Default: ``None``."""
416
-
417
-
418
- _format_audio_args = _format_doc(
419
- frames_per_chunk=_frames_per_chunk,
420
- buffer_chunk_size=_buffer_chunk_size,
421
- stream_index=_audio_stream_index,
422
- decoder=_decoder,
423
- decoder_option=_decoder_option,
424
- )
425
-
426
-
427
- _format_video_args = _format_doc(
428
- frames_per_chunk=_frames_per_chunk,
429
- buffer_chunk_size=_buffer_chunk_size,
430
- stream_index=_video_stream_index,
431
- decoder=_decoder,
432
- decoder_option=_decoder_option,
433
- hw_accel=_hw_accel,
434
- )
435
-
436
-
437
- InputStreamTypes = TypeVar("InputStream", bound=SourceStream)
438
- OutputStreamTypes = TypeVar("OutputStream", bound=OutputStream)
439
-
440
- class StreamingMediaDecoder:
441
- """Fetch and decode audio/video streams chunk by chunk.
442
-
443
- For the detailed usage of this class, please refer to the tutorial.
444
-
445
- Args:
446
- src (str, path-like, bytes or file-like object): The media source.
447
- If string-type, it must be a resource indicator that FFmpeg can
448
- handle. This includes a file path, URL, device identifier or
449
- filter expression. The supported value depends on the FFmpeg found
450
- in the system.
451
-
452
- If bytes, it must be an encoded media data in contiguous memory.
453
-
454
- If file-like object, it must support `read` method with the signature
455
- `read(size: int) -> bytes`.
456
- Additionally, if the file-like object has `seek` method, it uses
457
- the method when parsing media metadata. This improves the reliability
458
- of codec detection. The signagure of `seek` method must be
459
- `seek(offset: int, whence: int) -> int`.
460
-
461
- Please refer to the following for the expected signature and behavior
462
- of `read` and `seek` method.
463
-
464
- - https://docs.python.org/3/library/io.html#io.BufferedIOBase.read
465
- - https://docs.python.org/3/library/io.html#io.IOBase.seek
466
-
467
- format (str or None, optional):
468
- Override the input format, or specify the source sound device.
469
- Default: ``None`` (no override nor device input).
470
-
471
- This argument serves two different usecases.
472
-
473
- 1) Override the source format.
474
- This is useful when the input data do not contain a header.
475
-
476
- 2) Specify the input source device.
477
- This allows to load media stream from hardware devices,
478
- such as microphone, camera and screen, or a virtual device.
479
-
480
-
481
- .. note::
482
-
483
- This option roughly corresponds to ``-f`` option of ``ffmpeg`` command.
484
- Please refer to the ffmpeg documentations for the possible values.
485
-
486
- https://ffmpeg.org/ffmpeg-formats.html#Demuxers
487
-
488
- Please use :py:func:`~torio.utils.ffmpeg_utils.get_demuxers` to list the
489
- demultiplexers available in the current environment.
490
-
491
- For device access, the available values vary based on hardware (AV device) and
492
- software configuration (ffmpeg build).
493
-
494
- https://ffmpeg.org/ffmpeg-devices.html#Input-Devices
495
-
496
- Please use :py:func:`~torio.utils.ffmpeg_utils.get_input_devices` to list
497
- the input devices available in the current environment.
498
-
499
- option (dict of str to str, optional):
500
- Custom option passed when initializing format context (opening source).
501
-
502
- You can use this argument to change the input source before it is passed to decoder.
503
-
504
- Default: ``None``.
505
-
506
- buffer_size (int):
507
- The internal buffer size in byte. Used only when `src` is file-like object.
508
-
509
- Default: `4096`.
510
- """
511
-
512
- def __init__(
513
- self,
514
- src: Union[str, Path, BinaryIO],
515
- format: Optional[str] = None,
516
- option: Optional[Dict[str, str]] = None,
517
- buffer_size: int = 4096,
518
- ):
519
- self.src = src
520
- if isinstance(src, bytes):
521
- self._be = ffmpeg_ext.StreamingMediaDecoderBytes(src, format, option, buffer_size)
522
- elif hasattr(src, "read"):
523
- self._be = ffmpeg_ext.StreamingMediaDecoderFileObj(src, format, option, buffer_size)
524
- else:
525
- self._be = ffmpeg_ext.StreamingMediaDecoder(os.path.normpath(src), format, option)
526
-
527
- i = self._be.find_best_audio_stream()
528
- self._default_audio_stream = None if i < 0 else i
529
- i = self._be.find_best_video_stream()
530
- self._default_video_stream = None if i < 0 else i
531
-
532
- @property
533
- def num_src_streams(self):
534
- """Number of streams found in the provided media source.
535
-
536
- :type: int
537
- """
538
- return self._be.num_src_streams()
539
-
540
- @property
541
- def num_out_streams(self):
542
- """Number of output streams configured by client code.
543
-
544
- :type: int
545
- """
546
- return self._be.num_out_streams()
547
-
548
- @property
549
- def default_audio_stream(self):
550
- """The index of default audio stream. ``None`` if there is no audio stream
551
-
552
- :type: Optional[int]
553
- """
554
- return self._default_audio_stream
555
-
556
- @property
557
- def default_video_stream(self):
558
- """The index of default video stream. ``None`` if there is no video stream
559
-
560
- :type: Optional[int]
561
- """
562
- return self._default_video_stream
563
-
564
- def get_metadata(self) -> Dict[str, str]:
565
- """Get the metadata of the source media.
566
-
567
- Returns:
568
- dict
569
- """
570
- return self._be.get_metadata()
571
-
572
- def get_src_stream_info(self, i: int) -> InputStreamTypes:
573
- """Get the metadata of source stream
574
-
575
- Args:
576
- i (int): Stream index.
577
- Returns:
578
- InputStreamTypes:
579
- Information about the source stream.
580
- If the source stream is audio type, then
581
- :class:`~torio.io._stream_reader.SourceAudioStream` is returned.
582
- If it is video type, then
583
- :class:`~torio.io._stream_reader.SourceVideoStream` is returned.
584
- Otherwise :class:`~torio.io._stream_reader.SourceStream` class is returned.
585
- """
586
- return _parse_si(self._be.get_src_stream_info(i))
587
-
588
- def get_out_stream_info(self, i: int) -> OutputStreamTypes:
589
- """Get the metadata of output stream
590
-
591
- Args:
592
- i (int): Stream index.
593
- Returns:
594
- OutputStreamTypes
595
- Information about the output stream.
596
- If the output stream is audio type, then
597
- :class:`~torio.io._stream_reader.OutputAudioStream` is returned.
598
- If it is video type, then
599
- :class:`~torio.io._stream_reader.OutputVideoStream` is returned.
600
- """
601
- info = self._be.get_out_stream_info(i)
602
- return _parse_oi(info)
603
-
604
- def seek(self, timestamp: float, mode: str = "precise"):
605
- """Seek the stream to the given timestamp [second]
606
-
607
- Args:
608
- timestamp (float): Target time in second.
609
- mode (str): Controls how seek is done.
610
- Valid choices are;
611
-
612
- * "key": Seek into the nearest key frame before the given timestamp.
613
- * "any": Seek into any frame (including non-key frames) before the given timestamp.
614
- * "precise": First seek into the nearest key frame before the given timestamp, then
615
- decode frames until it reaches the closes frame to the given timestamp.
616
-
617
- Note:
618
- All the modes invalidate and reset the internal state of decoder.
619
- When using "any" mode and if it ends up seeking into non-key frame,
620
- the image decoded may be invalid due to lack of key frame.
621
- Using "precise" will workaround this issue by decoding frames from previous
622
- key frame, but will be slower.
623
- """
624
- modes = {
625
- "key": 0,
626
- "any": 1,
627
- "precise": 2,
628
- }
629
- if mode not in modes:
630
- raise ValueError(f"The value of mode must be one of {list(modes.keys())}. Found: {mode}")
631
- self._be.seek(timestamp, modes[mode])
632
-
633
- @_format_audio_args
634
- def add_basic_audio_stream(
635
- self,
636
- frames_per_chunk: int,
637
- buffer_chunk_size: int = 3,
638
- *,
639
- stream_index: Optional[int] = None,
640
- decoder: Optional[str] = None,
641
- decoder_option: Optional[Dict[str, str]] = None,
642
- format: Optional[str] = "fltp",
643
- sample_rate: Optional[int] = None,
644
- num_channels: Optional[int] = None,
645
- ):
646
- """Add output audio stream
647
-
648
- Args:
649
- frames_per_chunk (int): {frames_per_chunk}
650
-
651
- buffer_chunk_size (int, optional): {buffer_chunk_size}
652
-
653
- stream_index (int or None, optional): {stream_index}
654
-
655
- decoder (str or None, optional): {decoder}
656
-
657
- decoder_option (dict or None, optional): {decoder_option}
658
-
659
- format (str, optional): Output sample format (precision).
660
-
661
- If ``None``, the output chunk has dtype corresponding to
662
- the precision of the source audio.
663
-
664
- Otherwise, the sample is converted and the output dtype is changed
665
- as following.
666
-
667
- - ``"u8p"``: The output is ``torch.uint8`` type.
668
- - ``"s16p"``: The output is ``torch.int16`` type.
669
- - ``"s32p"``: The output is ``torch.int32`` type.
670
- - ``"s64p"``: The output is ``torch.int64`` type.
671
- - ``"fltp"``: The output is ``torch.float32`` type.
672
- - ``"dblp"``: The output is ``torch.float64`` type.
673
-
674
- Default: ``"fltp"``.
675
-
676
- sample_rate (int or None, optional): If provided, resample the audio.
677
-
678
- num_channels (int, or None, optional): If provided, change the number of channels.
679
- """
680
- self.add_audio_stream(
681
- frames_per_chunk,
682
- buffer_chunk_size,
683
- stream_index=stream_index,
684
- decoder=decoder,
685
- decoder_option=decoder_option,
686
- filter_desc=_get_afilter_desc(sample_rate, format, num_channels),
687
- )
688
-
689
- @_format_video_args
690
- def add_basic_video_stream(
691
- self,
692
- frames_per_chunk: int,
693
- buffer_chunk_size: int = 3,
694
- *,
695
- stream_index: Optional[int] = None,
696
- decoder: Optional[str] = None,
697
- decoder_option: Optional[Dict[str, str]] = None,
698
- format: Optional[str] = "rgb24",
699
- frame_rate: Optional[int] = None,
700
- width: Optional[int] = None,
701
- height: Optional[int] = None,
702
- hw_accel: Optional[str] = None,
703
- ):
704
- """Add output video stream
705
-
706
- Args:
707
- frames_per_chunk (int): {frames_per_chunk}
708
-
709
- buffer_chunk_size (int, optional): {buffer_chunk_size}
710
-
711
- stream_index (int or None, optional): {stream_index}
712
-
713
- decoder (str or None, optional): {decoder}
714
-
715
- decoder_option (dict or None, optional): {decoder_option}
716
-
717
- format (str, optional): Change the format of image channels. Valid values are,
718
-
719
- - ``"rgb24"``: 8 bits * 3 channels (R, G, B)
720
- - ``"bgr24"``: 8 bits * 3 channels (B, G, R)
721
- - ``"yuv420p"``: 8 bits * 3 channels (Y, U, V)
722
- - ``"gray"``: 8 bits * 1 channels
723
-
724
- Default: ``"rgb24"``.
725
-
726
- frame_rate (int or None, optional): If provided, change the frame rate.
727
-
728
- width (int or None, optional): If provided, change the image width. Unit: Pixel.
729
-
730
- height (int or None, optional): If provided, change the image height. Unit: Pixel.
731
-
732
- hw_accel (str or None, optional): {hw_accel}
733
- """
734
- self.add_video_stream(
735
- frames_per_chunk,
736
- buffer_chunk_size,
737
- stream_index=stream_index,
738
- decoder=decoder,
739
- decoder_option=decoder_option,
740
- filter_desc=_get_vfilter_desc(frame_rate, width, height, format),
741
- hw_accel=hw_accel,
742
- )
743
-
744
- @_format_audio_args
745
- def add_audio_stream(
746
- self,
747
- frames_per_chunk: int,
748
- buffer_chunk_size: int = 3,
749
- *,
750
- stream_index: Optional[int] = None,
751
- decoder: Optional[str] = None,
752
- decoder_option: Optional[Dict[str, str]] = None,
753
- filter_desc: Optional[str] = None,
754
- ):
755
- """Add output audio stream
756
-
757
- Args:
758
- frames_per_chunk (int): {frames_per_chunk}
759
-
760
- buffer_chunk_size (int, optional): {buffer_chunk_size}
761
-
762
- stream_index (int or None, optional): {stream_index}
763
-
764
- decoder (str or None, optional): {decoder}
765
-
766
- decoder_option (dict or None, optional): {decoder_option}
767
-
768
- filter_desc (str or None, optional): Filter description.
769
- The list of available filters can be found at
770
- https://ffmpeg.org/ffmpeg-filters.html
771
- Note that complex filters are not supported.
772
-
773
- """
774
- i = self.default_audio_stream if stream_index is None else stream_index
775
- if i is None:
776
- raise RuntimeError("There is no audio stream.")
777
- self._be.add_audio_stream(
778
- i,
779
- frames_per_chunk,
780
- buffer_chunk_size,
781
- filter_desc,
782
- decoder,
783
- decoder_option or {},
784
- )
785
-
786
@_format_video_args
def add_video_stream(
    self,
    frames_per_chunk: int,
    buffer_chunk_size: int = 3,
    *,
    stream_index: Optional[int] = None,
    decoder: Optional[str] = None,
    decoder_option: Optional[Dict[str, str]] = None,
    filter_desc: Optional[str] = None,
    hw_accel: Optional[str] = None,
):
    """Add output video stream

    Args:
        frames_per_chunk (int): {frames_per_chunk}

        buffer_chunk_size (int, optional): {buffer_chunk_size}

        stream_index (int or None, optional): {stream_index}

        decoder (str or None, optional): {decoder}

        decoder_option (dict or None, optional): {decoder_option}

        hw_accel (str or None, optional): {hw_accel}

        filter_desc (str or None, optional): Filter description.
            The list of available filters can be found at
            https://ffmpeg.org/ffmpeg-filters.html
            Note that complex filters are not supported.

    Raises:
        RuntimeError: If ``stream_index`` is ``None`` and
            ``default_video_stream`` is ``None`` as well.
    """
    # NOTE(review): the curly-brace placeholders in the docstring are presumably
    # filled in by the decorator -- keep them intact and avoid adding other
    # literal braces to the docstring.

    # Fall back to the default video stream only when the caller gave no index.
    if stream_index is not None:
        src_index = stream_index
    else:
        src_index = self.default_video_stream
    if src_index is None:
        raise RuntimeError("There is no video stream.")

    # The backend takes positional arguments in this exact order.
    options = decoder_option or {}
    self._be.add_video_stream(
        src_index,
        frames_per_chunk,
        buffer_chunk_size,
        filter_desc,
        decoder,
        options,
        hw_accel,
    )
830
-
831
def remove_stream(self, i: int):
    """Remove an output stream.

    Args:
        i (int): Index of the output stream to be removed.
    """
    # Pure delegation: the index is handed to the backend as-is.
    backend = self._be
    backend.remove_stream(i)
838
-
839
def process_packet(self, timeout: Optional[float] = None, backoff: float = 10.0) -> int:
    """Read the source media and process one packet.

    If a packet is read successfully, then the data in the packet will
    be decoded and passed to corresponding output stream processors.

    If the packet belongs to a source stream that is not connected to
    an output stream, then the data are discarded.

    When the source reaches EOF, then it triggers all the output stream
    processors to enter drain mode. All the output stream processors
    flush the pending frames.

    Args:
        timeout (float or None, optional): Timeout in milliseconds.

            This argument changes the retry behavior when it failed to
            process a packet due to the underlying media resource being
            temporarily unavailable.

            When using a media device such as a microphone, there are cases
            where the underlying buffer is not ready.
            Calling this function in such a case would cause the system to
            report `EAGAIN (resource temporarily unavailable)`.

            * ``>=0``: Keep retrying until the given time passes.

            * ``<0``: Keep retrying forever.

            * ``None`` : No retrying and raise an exception immediately.

            Default: ``None``.

            Note:

                The retry behavior is applicable only when the reason is the
                unavailable resource. It is not invoked if the failure has
                any other cause.

        backoff (float, optional): Time to wait before retrying, in milliseconds.

            This option is effective only when `timeout` is effective (not ``None``).

            When `timeout` is effective, this `backoff` controls how long the function
            should wait before retrying. Default: ``10.0``.

    Returns:
        int:
            ``0``
            A packet was processed properly. The caller can keep
            calling this function to buffer more frames.

            ``1``
            The streamer reached EOF. All the output stream processors
            flushed the pending frames. The caller should stop calling
            this method.
    """
    # Both arguments are forwarded unchanged; the status code comes straight
    # from the backend.
    return self._be.process_packet(timeout, backoff)
897
-
898
def process_all_packets(self):
    """Process packets until it reaches EOF."""
    # Pure delegation: the backend call does the looping.
    backend = self._be
    backend.process_all_packets()
901
-
902
def is_buffer_ready(self) -> bool:
    """Returns true if all the output streams have at least one chunk filled."""
    # The readiness flag is whatever the backend reports.
    ready = self._be.is_buffer_ready()
    return ready
905
-
906
def pop_chunks(self) -> Tuple[Optional[ChunkTensor], ...]:
    """Pop one chunk from all the output stream buffers.

    Returns:
        Tuple[Optional[ChunkTensor], ...]:
            Buffer contents, one entry for each item returned by the backend.
            If a buffer does not contain any frame, then `None` is returned instead.
    """
    # Return a real tuple so the value matches the annotation and the
    # documented return type (this method previously built a list).
    # Backend chunks are re-wrapped as ChunkTensor from their `frames` and `pts`.
    return tuple(
        None if chunk is None else ChunkTensor(chunk.frames, chunk.pts)
        for chunk in self._be.pop_chunks()
    )
921
-
922
def fill_buffer(self, timeout: Optional[float] = None, backoff: float = 10.0) -> int:
    """Keep processing packets until all buffers have at least one chunk

    Args:
        timeout (float or None, optional): See
            :py:meth:`~StreamingMediaDecoder.process_packet`. (Default: ``None``)

        backoff (float, optional): See
            :py:meth:`~StreamingMediaDecoder.process_packet`. (Default: ``10.0``)

    Returns:
        int:
            ``0``
            Packets are processed properly and buffers are
            ready to be popped once.

            ``1``
            The streamer reached EOF. All the output stream processors
            flushed the pending frames. The caller should stop calling
            this method.
    """
    # Both arguments are forwarded unchanged; the status code comes straight
    # from the backend.
    return self._be.fill_buffer(timeout, backoff)
944
-
945
def stream(
    self, timeout: Optional[float] = None, backoff: float = 10.0
) -> Iterator[Tuple[Optional[ChunkTensor], ...]]:
    """Return an iterator that generates output tensors

    Arguments:
        timeout (float or None, optional): See
            :py:func:`~StreamingMediaDecoder.process_packet`. (Default: ``None``)

        backoff (float, optional): See
            :py:func:`~StreamingMediaDecoder.process_packet`. (Default: ``10.0``)

    Returns:
        Iterator[Tuple[Optional[ChunkTensor], ...]]:
            Iterator that yields a tuple of chunks that correspond to the output
            streams defined by client code.
            If an output stream is exhausted, then the chunk Tensor is substituted
            with ``None``.
            The iterator stops if all the output streams are exhausted.

    Raises:
        RuntimeError: On first iteration, if ``num_out_streams`` is ``0``.
    """
    if self.num_out_streams == 0:
        raise RuntimeError("No output stream is configured.")

    # Phase 1: while fill_buffer() returns a falsy status, emit what it buffered.
    while not self.fill_buffer(timeout, backoff):
        yield self.pop_chunks()

    # Phase 2: fill_buffer() returned a truthy status; keep popping until a
    # pop yields nothing but ``None`` entries (an empty result also stops here).
    remaining = self.pop_chunks()
    while any(chunk is not None for chunk in remaining):
        yield remaining
        remaining = self.pop_chunks()