torchaudio 2.0.2__cp310-cp310-win_amd64.whl → 2.1.1__cp310-cp310-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchaudio might be problematic. Click here for more details.

Files changed (88) hide show
  1. torchaudio/__init__.py +22 -3
  2. torchaudio/_backend/__init__.py +55 -4
  3. torchaudio/_backend/backend.py +53 -0
  4. torchaudio/_backend/common.py +52 -0
  5. torchaudio/_backend/ffmpeg.py +373 -0
  6. torchaudio/_backend/soundfile.py +54 -0
  7. torchaudio/_backend/soundfile_backend.py +457 -0
  8. torchaudio/_backend/sox.py +91 -0
  9. torchaudio/_backend/utils.py +81 -323
  10. torchaudio/_extension/__init__.py +55 -36
  11. torchaudio/_extension/utils.py +109 -17
  12. torchaudio/_internal/__init__.py +4 -1
  13. torchaudio/_internal/module_utils.py +37 -6
  14. torchaudio/backend/__init__.py +7 -11
  15. torchaudio/backend/_no_backend.py +24 -0
  16. torchaudio/backend/_sox_io_backend.py +297 -0
  17. torchaudio/backend/common.py +12 -52
  18. torchaudio/backend/no_backend.py +11 -21
  19. torchaudio/backend/soundfile_backend.py +11 -448
  20. torchaudio/backend/sox_io_backend.py +11 -435
  21. torchaudio/backend/utils.py +9 -18
  22. torchaudio/datasets/__init__.py +2 -0
  23. torchaudio/datasets/cmuarctic.py +1 -1
  24. torchaudio/datasets/cmudict.py +61 -62
  25. torchaudio/datasets/dr_vctk.py +1 -1
  26. torchaudio/datasets/gtzan.py +1 -1
  27. torchaudio/datasets/librilight_limited.py +1 -1
  28. torchaudio/datasets/librispeech.py +1 -1
  29. torchaudio/datasets/librispeech_biasing.py +189 -0
  30. torchaudio/datasets/libritts.py +1 -1
  31. torchaudio/datasets/ljspeech.py +1 -1
  32. torchaudio/datasets/musdb_hq.py +1 -1
  33. torchaudio/datasets/quesst14.py +1 -1
  34. torchaudio/datasets/speechcommands.py +1 -1
  35. torchaudio/datasets/tedlium.py +1 -1
  36. torchaudio/datasets/vctk.py +1 -1
  37. torchaudio/datasets/voxceleb1.py +1 -1
  38. torchaudio/datasets/yesno.py +1 -1
  39. torchaudio/functional/__init__.py +6 -2
  40. torchaudio/functional/_alignment.py +128 -0
  41. torchaudio/functional/filtering.py +69 -92
  42. torchaudio/functional/functional.py +99 -148
  43. torchaudio/io/__init__.py +4 -1
  44. torchaudio/io/_effector.py +347 -0
  45. torchaudio/io/_stream_reader.py +158 -90
  46. torchaudio/io/_stream_writer.py +196 -10
  47. torchaudio/lib/_torchaudio.pyd +0 -0
  48. torchaudio/lib/_torchaudio_ffmpeg4.pyd +0 -0
  49. torchaudio/lib/_torchaudio_ffmpeg5.pyd +0 -0
  50. torchaudio/lib/_torchaudio_ffmpeg6.pyd +0 -0
  51. torchaudio/lib/libtorchaudio.pyd +0 -0
  52. torchaudio/lib/libtorchaudio_ffmpeg4.pyd +0 -0
  53. torchaudio/lib/libtorchaudio_ffmpeg5.pyd +0 -0
  54. torchaudio/lib/libtorchaudio_ffmpeg6.pyd +0 -0
  55. torchaudio/models/__init__.py +14 -0
  56. torchaudio/models/decoder/__init__.py +22 -7
  57. torchaudio/models/decoder/_ctc_decoder.py +123 -69
  58. torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
  59. torchaudio/models/rnnt_decoder.py +10 -14
  60. torchaudio/models/squim/__init__.py +11 -0
  61. torchaudio/models/squim/objective.py +326 -0
  62. torchaudio/models/squim/subjective.py +150 -0
  63. torchaudio/models/wav2vec2/components.py +6 -10
  64. torchaudio/pipelines/__init__.py +9 -0
  65. torchaudio/pipelines/_squim_pipeline.py +176 -0
  66. torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
  67. torchaudio/pipelines/_wav2vec2/impl.py +198 -68
  68. torchaudio/pipelines/_wav2vec2/utils.py +120 -0
  69. torchaudio/sox_effects/sox_effects.py +7 -30
  70. torchaudio/transforms/__init__.py +2 -0
  71. torchaudio/transforms/_transforms.py +99 -54
  72. torchaudio/utils/download.py +2 -2
  73. torchaudio/utils/ffmpeg_utils.py +20 -15
  74. torchaudio/utils/sox_utils.py +8 -9
  75. torchaudio/version.py +2 -2
  76. torchaudio-2.1.1.dist-info/METADATA +113 -0
  77. torchaudio-2.1.1.dist-info/RECORD +115 -0
  78. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +1 -1
  79. torchaudio/io/_compat.py +0 -241
  80. torchaudio/lib/_torchaudio_ffmpeg.pyd +0 -0
  81. torchaudio/lib/flashlight_lib_text_decoder.pyd +0 -0
  82. torchaudio/lib/flashlight_lib_text_dictionary.pyd +0 -0
  83. torchaudio/lib/libflashlight-text.pyd +0 -0
  84. torchaudio/lib/libtorchaudio_ffmpeg.pyd +0 -0
  85. torchaudio-2.0.2.dist-info/METADATA +0 -26
  86. torchaudio-2.0.2.dist-info/RECORD +0 -98
  87. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
  88. {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass
4
- from typing import BinaryIO, Dict, Iterator, Optional, Tuple, Union
4
+ from typing import BinaryIO, Dict, Iterator, Optional, Tuple, TypeVar, Union
5
5
 
6
6
  import torch
7
7
  import torchaudio
8
8
  from torch.utils._pytree import tree_map
9
9
 
10
+ if torchaudio._extension._FFMPEG_EXT is not None:
11
+ _StreamReader = torchaudio._extension._FFMPEG_EXT.StreamReader
12
+ _StreamReaderFileObj = torchaudio._extension._FFMPEG_EXT.StreamReaderFileObj
13
+
14
+
10
15
  __all__ = [
11
16
  "StreamReader",
12
17
  ]
@@ -103,70 +108,44 @@ class SourceVideoStream(SourceStream):
103
108
  """Frame rate."""
104
109
 
105
110
 
106
- # Indices of SrcInfo returned by low-level `get_src_stream_info`
107
- # - COMMON
108
- _MEDIA_TYPE = 0
109
- _CODEC = 1
110
- _CODEC_LONG = 2
111
- _FORMAT = 3
112
- _BIT_RATE = 4
113
- _NUM_FRAMES = 5
114
- _BPS = 6
115
- _METADATA = 7
116
- # - AUDIO
117
- _SAMPLE_RATE = 8
118
- _NUM_CHANNELS = 9
119
- # - VIDEO
120
- _WIDTH = 10
121
- _HEIGHT = 11
122
- _FRAME_RATE = 12
123
-
124
-
125
111
  def _parse_si(i):
126
- media_type = i[_MEDIA_TYPE]
127
- codec_name = i[_CODEC]
128
- codec_long_name = i[_CODEC_LONG]
129
- fmt = i[_FORMAT]
130
- bit_rate = i[_BIT_RATE]
131
- num_frames = i[_NUM_FRAMES]
132
- bps = i[_BPS]
133
- metadata = i[_METADATA]
112
+ media_type = i.media_type
134
113
  if media_type == "audio":
135
114
  return SourceAudioStream(
136
- media_type=media_type,
137
- codec=codec_name,
138
- codec_long_name=codec_long_name,
139
- format=fmt,
140
- bit_rate=bit_rate,
141
- num_frames=num_frames,
142
- bits_per_sample=bps,
143
- metadata=metadata,
144
- sample_rate=i[_SAMPLE_RATE],
145
- num_channels=i[_NUM_CHANNELS],
115
+ media_type=i.media_type,
116
+ codec=i.codec_name,
117
+ codec_long_name=i.codec_long_name,
118
+ format=i.format,
119
+ bit_rate=i.bit_rate,
120
+ num_frames=i.num_frames,
121
+ bits_per_sample=i.bits_per_sample,
122
+ metadata=i.metadata,
123
+ sample_rate=i.sample_rate,
124
+ num_channels=i.num_channels,
146
125
  )
147
126
  if media_type == "video":
148
127
  return SourceVideoStream(
149
- media_type=media_type,
150
- codec=codec_name,
151
- codec_long_name=codec_long_name,
152
- format=fmt,
153
- bit_rate=bit_rate,
154
- num_frames=num_frames,
155
- bits_per_sample=bps,
156
- metadata=metadata,
157
- width=i[_WIDTH],
158
- height=i[_HEIGHT],
159
- frame_rate=i[_FRAME_RATE],
128
+ media_type=i.media_type,
129
+ codec=i.codec_name,
130
+ codec_long_name=i.codec_long_name,
131
+ format=i.format,
132
+ bit_rate=i.bit_rate,
133
+ num_frames=i.num_frames,
134
+ bits_per_sample=i.bits_per_sample,
135
+ metadata=i.metadata,
136
+ width=i.width,
137
+ height=i.height,
138
+ frame_rate=i.frame_rate,
160
139
  )
161
140
  return SourceStream(
162
- media_type=media_type,
163
- codec=codec_name,
164
- codec_long_name=codec_long_name,
141
+ media_type=i.media_type,
142
+ codec=i.codec_name,
143
+ codec_long_name=i.codec_long_name,
165
144
  format=None,
166
145
  bit_rate=None,
167
146
  num_frames=None,
168
147
  bits_per_sample=None,
169
- metadata=metadata,
148
+ metadata=i.metadata,
170
149
  )
171
150
 
172
151
 
@@ -180,18 +159,93 @@ class OutputStream:
180
159
  """Index of the source stream that this output stream is connected."""
181
160
  filter_description: str
182
161
  """Description of filter graph applied to the source stream."""
162
+ media_type: str
163
+ """The type of the stream. ``"audio"`` or ``"video"``."""
164
+ format: str
165
+ """Media format. Such as ``"s16"`` and ``"yuv420p"``.
166
+
167
+ Commonly found audio values are:
168
+
169
+ - ``"u8"``, ``"u8p"``: 8-bit unsigned integer.
170
+ - ``"s16"``, ``"s16p"``: 16-bit signed integer.
171
+ - ``"s32"``, ``"s32p"``: 32-bit signed integer.
172
+ - ``"flt"``, ``"fltp"``: 32-bit floating-point.
173
+
174
+ .. note::
175
+
176
+ `p` at the end indicates the format is `planar`.
177
+ Channels are grouped together instead of interspersed in memory."""
178
+
179
+
180
+ @dataclass
181
+ class OutputAudioStream(OutputStream):
182
+ """Information about an audio output stream configured with
183
+ :meth:`~torchaudio.io.StreamReader.add_audio_stream` or
184
+ :meth:`~torchaudio.io.StreamReader.add_basic_audio_stream`.
185
+
186
+ In addition to the attributes reported by :class:`OutputStream`,
187
+ the following attributes are reported.
188
+ """
189
+
190
+ sample_rate: float
191
+ """Sample rate of the audio."""
192
+ num_channels: int
193
+ """Number of channels."""
194
+
195
+
196
+ @dataclass
197
+ class OutputVideoStream(OutputStream):
198
+ """Information about a video output stream configured with
199
+ :meth:`~torchaudio.io.StreamReader.add_video_stream` or
200
+ :meth:`~torchaudio.io.StreamReader.add_basic_video_stream`.
201
+
202
+ In addition to the attributes reported by :class:`OutputStream`,
203
+ the following attributes are reported.
204
+ """
205
+
206
+ width: int
207
+ """Width of the video frame in pixel."""
208
+ height: int
209
+ """Height of the video frame in pixel."""
210
+ frame_rate: float
211
+ """Frame rate."""
183
212
 
184
213
 
185
214
  def _parse_oi(i):
186
- return OutputStream(i[0], i[1])
215
+ media_type = i.media_type
216
+ if media_type == "audio":
217
+ return OutputAudioStream(
218
+ source_index=i.source_index,
219
+ filter_description=i.filter_description,
220
+ media_type=i.media_type,
221
+ format=i.format,
222
+ sample_rate=i.sample_rate,
223
+ num_channels=i.num_channels,
224
+ )
225
+ if media_type == "video":
226
+ return OutputVideoStream(
227
+ source_index=i.source_index,
228
+ filter_description=i.filter_description,
229
+ media_type=i.media_type,
230
+ format=i.format,
231
+ width=i.width,
232
+ height=i.height,
233
+ frame_rate=i.frame_rate,
234
+ )
235
+ raise ValueError(f"Unexpected media_type: {i.media_type}({i})")
187
236
 
188
237
 
189
- def _get_afilter_desc(sample_rate: Optional[int], fmt: Optional[str]):
238
+ def _get_afilter_desc(sample_rate: Optional[int], fmt: Optional[str], num_channels: Optional[int]):
190
239
  descs = []
191
240
  if sample_rate is not None:
192
241
  descs.append(f"aresample={sample_rate}")
193
- if fmt is not None:
194
- descs.append(f"aformat=sample_fmts={fmt}")
242
+ if fmt is not None or num_channels is not None:
243
+ parts = []
244
+ if fmt is not None:
245
+ parts.append(f"sample_fmts={fmt}")
246
+ if num_channels is not None:
247
+ parts.append(f"channel_layouts={num_channels}c")
248
+ descs.append(f"aformat={':'.join(parts)}")
195
249
  return ",".join(descs) if descs else None
196
250
 
197
251
 
@@ -381,6 +435,10 @@ _format_video_args = _format_doc(
381
435
  )
382
436
 
383
437
 
438
+ InputStreamTypes = TypeVar("InputStream", bound=SourceStream)
439
+ OutputStreamTypes = TypeVar("OutputStream", bound=OutputStream)
440
+
441
+
384
442
  @torchaudio._extension.fail_if_no_ffmpeg
385
443
  class StreamReader:
386
444
  """Fetch and decode audio/video streams chunk by chunk.
@@ -388,7 +446,7 @@ class StreamReader:
388
446
  For the detailed usage of this class, please refer to the tutorial.
389
447
 
390
448
  Args:
391
- src (str, file-like object or Tensor): The media source.
449
+ src (str, file-like object): The media source.
392
450
  If string-type, it must be a resource indicator that FFmpeg can
393
451
  handle. This includes a file path, URL, device identifier or
394
452
  filter expression. The supported value depends on the FFmpeg found
@@ -401,9 +459,6 @@ class StreamReader:
401
459
  of codec detection. The signature of `seek` method must be
402
460
  `seek(offset: int, whence: int) -> int`.
403
461
 
404
- If Tensor, it is interpreted as byte buffer.
405
- It must be one-dimensional, of type ``torch.uint8``.
406
-
407
462
  Please refer to the following for the expected signature and behavior
408
463
  of `read` and `seek` method.
409
464
 
@@ -457,20 +512,17 @@ class StreamReader:
457
512
 
458
513
  def __init__(
459
514
  self,
460
- src: Union[str, BinaryIO, torch.Tensor],
515
+ src: Union[str, BinaryIO],
461
516
  format: Optional[str] = None,
462
517
  option: Optional[Dict[str, str]] = None,
463
518
  buffer_size: int = 4096,
464
519
  ):
465
- torch._C._log_api_usage_once("torchaudio.io.StreamReader")
466
520
  if isinstance(src, str):
467
- self._be = torch.classes.torchaudio.ffmpeg_StreamReader(src, format, option)
468
- elif isinstance(src, torch.Tensor):
469
- self._be = torch.classes.torchaudio.ffmpeg_StreamReaderTensor(src, format, option, buffer_size)
521
+ self._be = _StreamReader(src, format, option)
470
522
  elif hasattr(src, "read"):
471
- self._be = torchaudio.lib._torchaudio_ffmpeg.StreamReaderFileObj(src, format, option, buffer_size)
523
+ self._be = _StreamReaderFileObj(src, format, option, buffer_size)
472
524
  else:
473
- raise ValueError("`src` must be either string, Tensor or file-like object.")
525
+ raise ValueError("`src` must be either a string or file-like object.")
474
526
 
475
527
  i = self._be.find_best_audio_stream()
476
528
  self._default_audio_stream = None if i < 0 else i
@@ -517,28 +569,37 @@ class StreamReader:
517
569
  """
518
570
  return self._be.get_metadata()
519
571
 
520
- def get_src_stream_info(self, i: int) -> Union[SourceStream, SourceAudioStream, SourceVideoStream]:
572
+ def get_src_stream_info(self, i: int) -> InputStreamTypes:
521
573
  """Get the metadata of source stream
522
574
 
523
575
  Args:
524
576
  i (int): Stream index.
525
577
  Returns:
526
- Information about the source stream.
527
- If the source stream is audio type, then :class:`SourceAudioStream` returned.
528
- If it is video type, then :class:`SourceVideoStream` is returned.
529
- Otherwise :class:`SourceStream` class is returned.
578
+ InputStreamTypes:
579
+ Information about the source stream.
580
+ If the source stream is audio type, then
581
+ :class:`~torchaudio.io._stream_reader.SourceAudioStream` is returned.
582
+ If it is video type, then
583
+ :class:`~torchaudio.io._stream_reader.SourceVideoStream` is returned.
584
+ Otherwise :class:`~torchaudio.io._stream_reader.SourceStream` class is returned.
530
585
  """
531
586
  return _parse_si(self._be.get_src_stream_info(i))
532
587
 
533
- def get_out_stream_info(self, i: int) -> OutputStream:
588
+ def get_out_stream_info(self, i: int) -> OutputStreamTypes:
534
589
  """Get the metadata of output stream
535
590
 
536
591
  Args:
537
592
  i (int): Stream index.
538
593
  Returns:
539
- OutputStream
594
+ OutputStreamTypes
595
+ Information about the output stream.
596
+ If the output stream is audio type, then
597
+ :class:`~torchaudio.io._stream_reader.OutputAudioStream` is returned.
598
+ If it is video type, then
599
+ :class:`~torchaudio.io._stream_reader.OutputVideoStream` is returned.
540
600
  """
541
- return _parse_oi(self._be.get_out_stream_info(i))
601
+ info = self._be.get_out_stream_info(i)
602
+ return _parse_oi(info)
542
603
 
543
604
  def seek(self, timestamp: float, mode: str = "precise"):
544
605
  """Seek the stream to the given timestamp [second]
@@ -574,11 +635,13 @@ class StreamReader:
574
635
  self,
575
636
  frames_per_chunk: int,
576
637
  buffer_chunk_size: int = 3,
638
+ *,
577
639
  stream_index: Optional[int] = None,
578
640
  decoder: Optional[str] = None,
579
641
  decoder_option: Optional[Dict[str, str]] = None,
580
642
  format: Optional[str] = "fltp",
581
643
  sample_rate: Optional[int] = None,
644
+ num_channels: Optional[int] = None,
582
645
  ):
583
646
  """Add output audio stream
584
647
 
@@ -611,14 +674,16 @@ class StreamReader:
611
674
  Default: ``"fltp"``.
612
675
 
613
676
  sample_rate (int or None, optional): If provided, resample the audio.
677
+
678
+ num_channels (int or None, optional): If provided, change the number of channels.
614
679
  """
615
680
  self.add_audio_stream(
616
681
  frames_per_chunk,
617
682
  buffer_chunk_size,
618
- stream_index,
619
- decoder,
620
- decoder_option,
621
- _get_afilter_desc(sample_rate, format),
683
+ stream_index=stream_index,
684
+ decoder=decoder,
685
+ decoder_option=decoder_option,
686
+ filter_desc=_get_afilter_desc(sample_rate, format, num_channels),
622
687
  )
623
688
 
624
689
  @_format_video_args
@@ -626,14 +691,15 @@ class StreamReader:
626
691
  self,
627
692
  frames_per_chunk: int,
628
693
  buffer_chunk_size: int = 3,
694
+ *,
629
695
  stream_index: Optional[int] = None,
630
696
  decoder: Optional[str] = None,
631
697
  decoder_option: Optional[Dict[str, str]] = None,
632
- hw_accel: Optional[str] = None,
633
698
  format: Optional[str] = "rgb24",
634
699
  frame_rate: Optional[int] = None,
635
700
  width: Optional[int] = None,
636
701
  height: Optional[int] = None,
702
+ hw_accel: Optional[str] = None,
637
703
  ):
638
704
  """Add output video stream
639
705
 
@@ -648,8 +714,6 @@ class StreamReader:
648
714
 
649
715
  decoder_option (dict or None, optional): {decoder_option}
650
716
 
651
- hw_accel (str or None, optional): {hw_accel}
652
-
653
717
  format (str, optional): Change the format of image channels. Valid values are,
654
718
 
655
719
  - ``"rgb24"``: 8 bits * 3 channels (R, G, B)
@@ -664,15 +728,17 @@ class StreamReader:
664
728
  width (int or None, optional): If provided, change the image width. Unit: Pixel.
665
729
 
666
730
  height (int or None, optional): If provided, change the image height. Unit: Pixel.
731
+
732
+ hw_accel (str or None, optional): {hw_accel}
667
733
  """
668
734
  self.add_video_stream(
669
735
  frames_per_chunk,
670
736
  buffer_chunk_size,
671
- stream_index,
672
- decoder,
673
- decoder_option,
674
- hw_accel,
675
- _get_vfilter_desc(frame_rate, width, height, format),
737
+ stream_index=stream_index,
738
+ decoder=decoder,
739
+ decoder_option=decoder_option,
740
+ filter_desc=_get_vfilter_desc(frame_rate, width, height, format),
741
+ hw_accel=hw_accel,
676
742
  )
677
743
 
678
744
  @_format_audio_args
@@ -680,6 +746,7 @@ class StreamReader:
680
746
  self,
681
747
  frames_per_chunk: int,
682
748
  buffer_chunk_size: int = 3,
749
+ *,
683
750
  stream_index: Optional[int] = None,
684
751
  decoder: Optional[str] = None,
685
752
  decoder_option: Optional[Dict[str, str]] = None,
@@ -721,11 +788,12 @@ class StreamReader:
721
788
  self,
722
789
  frames_per_chunk: int,
723
790
  buffer_chunk_size: int = 3,
791
+ *,
724
792
  stream_index: Optional[int] = None,
725
793
  decoder: Optional[str] = None,
726
794
  decoder_option: Optional[Dict[str, str]] = None,
727
- hw_accel: Optional[str] = None,
728
795
  filter_desc: Optional[str] = None,
796
+ hw_accel: Optional[str] = None,
729
797
  ):
730
798
  """Add output video stream
731
799
 
@@ -848,7 +916,7 @@ class StreamReader:
848
916
  if chunk is None:
849
917
  ret.append(None)
850
918
  else:
851
- ret.append(ChunkTensor(chunk[0], chunk[1]))
919
+ ret.append(ChunkTensor(chunk.frames, chunk.pts))
852
920
  return ret
853
921
 
854
922
  def fill_buffer(self, timeout: Optional[float] = None, backoff: float = 10.0) -> int: