phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,860 @@
1
+ """
2
+ Module to read / write wav files using NumPy arrays
3
+
4
+ Functions
5
+ ---------
6
+ `read`: Return the sample rate (in samples/sec) and data from a WAV file.
7
+
8
+ `write`: Write a NumPy array as a WAV file.
9
+
10
+ """
11
+ import io
12
+ import struct
13
+ import sys
14
+ import warnings
15
+ from enum import IntEnum
16
+
17
+ import numpy
18
+
19
# Names exported by ``from <this module> import *``; everything else in the
# module (the enum, the chunk readers, the underscore-prefixed helpers) is
# left out of the star-import surface.
__all__ = ["WavFileWarning", "read", "write"]
20
+
21
+
22
class WavFileWarning(UserWarning):
    """Warning category used by this module for non-fatal WAV file problems."""
24
+
25
+
26
class WAVE_FORMAT(IntEnum):
    """
    WAVE form wFormatTag IDs

    Each member maps a symbolic codec name to the 16-bit integer stored in the
    ``wFormatTag`` field of a WAV 'fmt ' chunk. Because this is an ``IntEnum``,
    members compare equal to plain integers, and ``WAVE_FORMAT(tag)`` raises
    ``ValueError`` for an integer that is not listed here.

    Complete list is in mmreg.h in Windows 10 SDK. ALAC and OPUS are the
    newest additions, in v10.0.14393 2016-07
    """

    UNKNOWN = 0x0000
    PCM = 0x0001
    ADPCM = 0x0002
    IEEE_FLOAT = 0x0003
    VSELP = 0x0004
    IBM_CVSD = 0x0005
    ALAW = 0x0006
    MULAW = 0x0007
    DTS = 0x0008
    DRM = 0x0009
    WMAVOICE9 = 0x000A
    WMAVOICE10 = 0x000B
    OKI_ADPCM = 0x0010
    DVI_ADPCM = 0x0011
    IMA_ADPCM = 0x0011  # Duplicate value: becomes an enum alias of DVI_ADPCM
    MEDIASPACE_ADPCM = 0x0012
    SIERRA_ADPCM = 0x0013
    G723_ADPCM = 0x0014
    DIGISTD = 0x0015
    DIGIFIX = 0x0016
    DIALOGIC_OKI_ADPCM = 0x0017
    MEDIAVISION_ADPCM = 0x0018
    CU_CODEC = 0x0019
    HP_DYN_VOICE = 0x001A
    YAMAHA_ADPCM = 0x0020
    SONARC = 0x0021
    DSPGROUP_TRUESPEECH = 0x0022
    ECHOSC1 = 0x0023
    AUDIOFILE_AF36 = 0x0024
    APTX = 0x0025
    AUDIOFILE_AF10 = 0x0026
    PROSODY_1612 = 0x0027
    LRC = 0x0028
    DOLBY_AC2 = 0x0030
    GSM610 = 0x0031
    MSNAUDIO = 0x0032
    ANTEX_ADPCME = 0x0033
    CONTROL_RES_VQLPC = 0x0034
    DIGIREAL = 0x0035
    DIGIADPCM = 0x0036
    CONTROL_RES_CR10 = 0x0037
    NMS_VBXADPCM = 0x0038
    CS_IMAADPCM = 0x0039
    ECHOSC3 = 0x003A
    ROCKWELL_ADPCM = 0x003B
    ROCKWELL_DIGITALK = 0x003C
    XEBEC = 0x003D
    G721_ADPCM = 0x0040
    G728_CELP = 0x0041
    MSG723 = 0x0042
    INTEL_G723_1 = 0x0043
    INTEL_G729 = 0x0044
    SHARP_G726 = 0x0045
    MPEG = 0x0050
    RT24 = 0x0052
    PAC = 0x0053
    MPEGLAYER3 = 0x0055
    LUCENT_G723 = 0x0059
    CIRRUS = 0x0060
    ESPCM = 0x0061
    VOXWARE = 0x0062
    CANOPUS_ATRAC = 0x0063
    G726_ADPCM = 0x0064
    G722_ADPCM = 0x0065
    DSAT = 0x0066
    DSAT_DISPLAY = 0x0067
    VOXWARE_BYTE_ALIGNED = 0x0069
    VOXWARE_AC8 = 0x0070
    VOXWARE_AC10 = 0x0071
    VOXWARE_AC16 = 0x0072
    VOXWARE_AC20 = 0x0073
    VOXWARE_RT24 = 0x0074
    VOXWARE_RT29 = 0x0075
    VOXWARE_RT29HW = 0x0076
    VOXWARE_VR12 = 0x0077
    VOXWARE_VR18 = 0x0078
    VOXWARE_TQ40 = 0x0079
    VOXWARE_SC3 = 0x007A
    VOXWARE_SC3_1 = 0x007B
    SOFTSOUND = 0x0080
    VOXWARE_TQ60 = 0x0081
    MSRT24 = 0x0082
    G729A = 0x0083
    MVI_MVI2 = 0x0084
    DF_G726 = 0x0085
    DF_GSM610 = 0x0086
    ISIAUDIO = 0x0088
    ONLIVE = 0x0089
    MULTITUDE_FT_SX20 = 0x008A
    INFOCOM_ITS_G721_ADPCM = 0x008B
    CONVEDIA_G729 = 0x008C
    CONGRUENCY = 0x008D
    SBC24 = 0x0091
    DOLBY_AC3_SPDIF = 0x0092
    MEDIASONIC_G723 = 0x0093
    PROSODY_8KBPS = 0x0094
    ZYXEL_ADPCM = 0x0097
    PHILIPS_LPCBB = 0x0098
    PACKED = 0x0099
    MALDEN_PHONYTALK = 0x00A0
    RACAL_RECORDER_GSM = 0x00A1
    RACAL_RECORDER_G720_A = 0x00A2
    RACAL_RECORDER_G723_1 = 0x00A3
    RACAL_RECORDER_TETRA_ACELP = 0x00A4
    NEC_AAC = 0x00B0
    RAW_AAC1 = 0x00FF
    RHETOREX_ADPCM = 0x0100
    IRAT = 0x0101
    VIVO_G723 = 0x0111
    VIVO_SIREN = 0x0112
    PHILIPS_CELP = 0x0120
    PHILIPS_GRUNDIG = 0x0121
    DIGITAL_G723 = 0x0123
    SANYO_LD_ADPCM = 0x0125
    SIPROLAB_ACEPLNET = 0x0130
    SIPROLAB_ACELP4800 = 0x0131
    SIPROLAB_ACELP8V3 = 0x0132
    SIPROLAB_G729 = 0x0133
    SIPROLAB_G729A = 0x0134
    SIPROLAB_KELVIN = 0x0135
    VOICEAGE_AMR = 0x0136
    G726ADPCM = 0x0140
    DICTAPHONE_CELP68 = 0x0141
    DICTAPHONE_CELP54 = 0x0142
    QUALCOMM_PUREVOICE = 0x0150
    QUALCOMM_HALFRATE = 0x0151
    TUBGSM = 0x0155
    MSAUDIO1 = 0x0160
    WMAUDIO2 = 0x0161
    WMAUDIO3 = 0x0162
    WMAUDIO_LOSSLESS = 0x0163
    WMASPDIF = 0x0164
    UNISYS_NAP_ADPCM = 0x0170
    UNISYS_NAP_ULAW = 0x0171
    UNISYS_NAP_ALAW = 0x0172
    UNISYS_NAP_16K = 0x0173
    SYCOM_ACM_SYC008 = 0x0174
    SYCOM_ACM_SYC701_G726L = 0x0175
    SYCOM_ACM_SYC701_CELP54 = 0x0176
    SYCOM_ACM_SYC701_CELP68 = 0x0177
    KNOWLEDGE_ADVENTURE_ADPCM = 0x0178
    FRAUNHOFER_IIS_MPEG2_AAC = 0x0180
    DTS_DS = 0x0190
    CREATIVE_ADPCM = 0x0200
    CREATIVE_FASTSPEECH8 = 0x0202
    CREATIVE_FASTSPEECH10 = 0x0203
    UHER_ADPCM = 0x0210
    ULEAD_DV_AUDIO = 0x0215
    ULEAD_DV_AUDIO_1 = 0x0216
    QUARTERDECK = 0x0220
    ILINK_VC = 0x0230
    RAW_SPORT = 0x0240
    ESST_AC3 = 0x0241
    GENERIC_PASSTHRU = 0x0249
    IPI_HSX = 0x0250
    IPI_RPELP = 0x0251
    CS2 = 0x0260
    SONY_SCX = 0x0270
    SONY_SCY = 0x0271
    SONY_ATRAC3 = 0x0272
    SONY_SPC = 0x0273
    TELUM_AUDIO = 0x0280
    TELUM_IA_AUDIO = 0x0281
    NORCOM_VOICE_SYSTEMS_ADPCM = 0x0285
    FM_TOWNS_SND = 0x0300
    MICRONAS = 0x0350
    MICRONAS_CELP833 = 0x0351
    BTV_DIGITAL = 0x0400
    INTEL_MUSIC_CODER = 0x0401
    INDEO_AUDIO = 0x0402
    QDESIGN_MUSIC = 0x0450
    ON2_VP7_AUDIO = 0x0500
    ON2_VP6_AUDIO = 0x0501
    VME_VMPCM = 0x0680
    TPC = 0x0681
    LIGHTWAVE_LOSSLESS = 0x08AE
    OLIGSM = 0x1000
    OLIADPCM = 0x1001
    OLICELP = 0x1002
    OLISBC = 0x1003
    OLIOPR = 0x1004
    LH_CODEC = 0x1100
    LH_CODEC_CELP = 0x1101
    LH_CODEC_SBC8 = 0x1102
    LH_CODEC_SBC12 = 0x1103
    LH_CODEC_SBC16 = 0x1104
    NORRIS = 0x1400
    ISIAUDIO_2 = 0x1401
    SOUNDSPACE_MUSICOMPRESS = 0x1500
    MPEG_ADTS_AAC = 0x1600
    MPEG_RAW_AAC = 0x1601
    MPEG_LOAS = 0x1602
    NOKIA_MPEG_ADTS_AAC = 0x1608
    NOKIA_MPEG_RAW_AAC = 0x1609
    VODAFONE_MPEG_ADTS_AAC = 0x160A
    VODAFONE_MPEG_RAW_AAC = 0x160B
    MPEG_HEAAC = 0x1610
    VOXWARE_RT24_SPEECH = 0x181C
    SONICFOUNDRY_LOSSLESS = 0x1971
    INNINGS_TELECOM_ADPCM = 0x1979
    LUCENT_SX8300P = 0x1C07
    LUCENT_SX5363S = 0x1C0C
    CUSEEME = 0x1F03
    NTCSOFT_ALF2CM_ACM = 0x1FC4
    DVM = 0x2000
    DTS2 = 0x2001
    MAKEAVIS = 0x3313
    DIVIO_MPEG4_AAC = 0x4143
    NOKIA_ADAPTIVE_MULTIRATE = 0x4201
    DIVIO_G726 = 0x4243
    LEAD_SPEECH = 0x434C
    LEAD_VORBIS = 0x564C
    WAVPACK_AUDIO = 0x5756
    OGG_VORBIS_MODE_1 = 0x674F
    OGG_VORBIS_MODE_2 = 0x6750
    OGG_VORBIS_MODE_3 = 0x6751
    OGG_VORBIS_MODE_1_PLUS = 0x676F
    OGG_VORBIS_MODE_2_PLUS = 0x6770
    OGG_VORBIS_MODE_3_PLUS = 0x6771
    ALAC = 0x6C61
    _3COM_NBX = 0x7000  # Can't have leading digit, hence the underscore prefix
    OPUS = 0x704F
    FAAD_AAC = 0x706D
    AMR_NB = 0x7361
    AMR_WB = 0x7362
    AMR_WP = 0x7363
    GSM_AMR_CBR = 0x7A21
    GSM_AMR_VBR_SID = 0x7A22
    COMVERSE_INFOSYS_G723_1 = 0xA100
    COMVERSE_INFOSYS_AVQSBC = 0xA101
    COMVERSE_INFOSYS_SBC = 0xA102
    SYMBOL_G729_A = 0xA103
    VOICEAGE_AMR_WB = 0xA104
    INGENIENT_G726 = 0xA105
    MPEG4_AAC = 0xA106
    ENCORE_G726 = 0xA107
    ZOLL_ASAO = 0xA108
    SPEEX_VOICE = 0xA109
    VIANIX_MASC = 0xA10A
    WM9_SPECTRUM_ANALYZER = 0xA10B
    WMF_SPECTRUM_ANAYZER = 0xA10C
    GSM_610 = 0xA10D
    GSM_620 = 0xA10E
    GSM_660 = 0xA10F
    GSM_690 = 0xA110
    GSM_ADAPTIVE_MULTIRATE_WB = 0xA111
    POLYCOM_G722 = 0xA112
    POLYCOM_G728 = 0xA113
    POLYCOM_G729_A = 0xA114
    POLYCOM_SIREN = 0xA115
    GLOBAL_IP_ILBC = 0xA116
    RADIOTIME_TIME_SHIFT_RADIO = 0xA117
    NICE_ACA = 0xA118
    NICE_ADPCM = 0xA119
    VOCORD_G721 = 0xA11A
    VOCORD_G726 = 0xA11B
    VOCORD_G722_1 = 0xA11C
    VOCORD_G728 = 0xA11D
    VOCORD_G729 = 0xA11E
    VOCORD_G729_A = 0xA11F
    VOCORD_G723_1 = 0xA120
    VOCORD_LBC = 0xA121
    NICE_G728 = 0xA122
    FRACE_TELECOM_G729 = 0xA123
    CODIAN = 0xA124
    FLAC = 0xF1AC
    EXTENSIBLE = 0xFFFE
    DEVELOPMENT = 0xFFFF
302
+
303
+
304
# The only format tags this module handles: integer PCM and IEEE float.
# The fmt-chunk reader rejects any tag that is not a member of this set.
KNOWN_WAVE_FORMATS = {WAVE_FORMAT.PCM, WAVE_FORMAT.IEEE_FLOAT}
305
+
306
+
307
def _raise_bad_format(format_tag):
    """
    Raise a ``ValueError`` reporting an unsupported wave format tag.

    Parameters
    ----------
    format_tag : int
        The wFormatTag value that could not be handled.

    Raises
    ------
    ValueError
        Always. The message names the format (its ``WAVE_FORMAT`` member name
        when the tag is listed there, otherwise the tag as zero-padded hex)
        and lists the names in ``KNOWN_WAVE_FORMATS``.
    """
    supported_names = ", ".join(known.name for known in KNOWN_WAVE_FORMATS)

    try:
        label = WAVE_FORMAT(format_tag).name
    except ValueError:
        # Tag is not in the enum at all: fall back to a hex rendering.
        label = f"{format_tag:#06x}"

    raise ValueError(
        f"Unknown wave file format: {label}. Supported "
        "formats: " + supported_names
    )
316
+
317
+
318
def _read_fmt_chunk(fid, is_big_endian):
    """
    Parse a 'fmt ' chunk and leave the file pointer at the next chunk.

    Parameters
    ----------
    fid : file-like
        Binary stream positioned immediately after the 'fmt ' chunk id.
    is_big_endian : bool
        True to decode the fields as big-endian, False for little-endian.

    Returns
    -------
    size : int
        size of format subchunk in bytes (minus 8 for "fmt " and itself)
    format_tag : int
        PCM, float, or compressed format
    channels : int
        number of channels
    fs : int
        sampling frequency in samples per second
    bytes_per_second : int
        overall byte rate for the file
    block_align : int
        bytes per sample, including all channels
    bit_depth : int
        bits per sample

    Raises
    ------
    ValueError
        If the chunk is shorter than 16 bytes, if an EXTENSIBLE header
        declares an extension shorter than 22 bytes, or if the resulting
        format tag is not in ``KNOWN_WAVE_FORMATS``.

    Notes
    -----
    Assumes file pointer is immediately after the 'fmt ' id
    """
    endian = ">" if is_big_endian else "<"

    (size,) = struct.unpack(endian + "I", fid.read(4))
    if size < 16:
        raise ValueError("Binary structure of wave file is not compliant")

    # The mandatory 16-byte part of the format description.
    fields = struct.unpack(endian + "HHIIHH", fid.read(16))
    format_tag, channels, fs, bytes_per_second, block_align, bit_depth = fields
    consumed = 16

    if format_tag == WAVE_FORMAT.EXTENSIBLE and size >= 18:
        (ext_chunk_size,) = struct.unpack(endian + "H", fid.read(2))
        consumed += 2
        if ext_chunk_size < 22:
            raise ValueError("Binary structure of wave file is not compliant")

        extension = fid.read(22)
        consumed += 22
        # Skip the first 2 + 4 bytes of the extension; the next 16 bytes are
        # the sub-format GUID.
        raw_guid = extension[6:22]
        # GUID template {XXXXXXXX-0000-0010-8000-00AA00389B71} (RFC-2361)
        # MS GUID byte order: first three groups are native byte order,
        # rest is Big Endian
        if is_big_endian:
            guid_tail = b"\x00\x00\x00\x10\x80\x00\x00\xAA\x00\x38\x9B\x71"
        else:
            guid_tail = b"\x00\x00\x10\x00\x80\x00\x00\xAA\x00\x38\x9B\x71"
        if raw_guid.endswith(guid_tail):
            # The real format tag is carried in the first GUID group.
            (format_tag,) = struct.unpack(endian + "I", raw_guid[:4])

    if format_tag not in KNOWN_WAVE_FORMATS:
        _raise_bad_format(format_tag)

    # move file pointer to next chunk
    leftover = size - consumed
    if leftover > 0:
        fid.read(leftover)

    # fmt should always be 16, 18 or 40, but handle it just in case
    _handle_pad_byte(fid, size)

    return (size, format_tag, channels, fs, bytes_per_second, block_align, bit_depth)
386
+
387
+
388
+ def _read_data_chunk(
389
+ fid, format_tag, channels, bit_depth, is_big_endian, block_align, mmap=False
390
+ ):
391
+ """
392
+ Notes
393
+ -----
394
+ Assumes file pointer is immediately after the 'data' id
395
+
396
+ It's possible to not use all available bits in a container, or to store
397
+ samples in a container bigger than necessary, so bytes_per_sample uses
398
+ the actual reported container size (nBlockAlign / nChannels). Real-world
399
+ examples:
400
+
401
+ Adobe Audition's "24-bit packed int (type 1, 20-bit)"
402
+
403
+ nChannels = 2, nBlockAlign = 6, wBitsPerSample = 20
404
+
405
+ http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Samples/AFsp/M1F1-int12-AFsp.wav
406
+ is:
407
+
408
+ nChannels = 2, nBlockAlign = 4, wBitsPerSample = 12
409
+
410
+ http://www-mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/Docs/multichaudP.pdf
411
+ gives an example of:
412
+
413
+ nChannels = 2, nBlockAlign = 8, wBitsPerSample = 20
414
+ """
415
+ if is_big_endian:
416
+ fmt = ">"
417
+ else:
418
+ fmt = "<"
419
+
420
+ # Size of the data subchunk in bytes
421
+ size = struct.unpack(fmt + "I", fid.read(4))[0]
422
+
423
+ # Number of bytes per sample (sample container size)
424
+ bytes_per_sample = block_align // channels
425
+ n_samples = size // bytes_per_sample
426
+
427
+ if format_tag == WAVE_FORMAT.PCM:
428
+ if 1 <= bit_depth <= 8:
429
+ dtype = "u1" # WAV of 8-bit integer or less are unsigned
430
+ elif bytes_per_sample in {3, 5, 6, 7}:
431
+ # No compatible dtype. Load as raw bytes for reshaping later.
432
+ dtype = "V1"
433
+ elif bit_depth <= 64:
434
+ # Remaining bit depths can map directly to signed numpy dtypes
435
+ dtype = f"{fmt}i{bytes_per_sample}"
436
+ else:
437
+ raise ValueError(
438
+ "Unsupported bit depth: the WAV file "
439
+ f"has {bit_depth}-bit integer data."
440
+ )
441
+ elif format_tag == WAVE_FORMAT.IEEE_FLOAT:
442
+ if bit_depth in {32, 64}:
443
+ dtype = f"{fmt}f{bytes_per_sample}"
444
+ else:
445
+ raise ValueError(
446
+ "Unsupported bit depth: the WAV file "
447
+ f"has {bit_depth}-bit floating-point data."
448
+ )
449
+ else:
450
+ _raise_bad_format(format_tag)
451
+
452
+ start = fid.tell()
453
+ if not mmap:
454
+ try:
455
+ count = size if dtype == "V1" else n_samples
456
+ data = numpy.fromfile(fid, dtype=dtype, count=count)
457
+ except io.UnsupportedOperation: # not a C-like file
458
+ fid.seek(start, 0) # just in case it seeked, though it shouldn't
459
+ data = numpy.frombuffer(fid.read(size), dtype=dtype)
460
+
461
+ if dtype == "V1":
462
+ # Rearrange raw bytes into smallest compatible numpy dtype
463
+ dt = f"{fmt}i4" if bytes_per_sample == 3 else f"{fmt}i8"
464
+ a = numpy.zeros(
465
+ (len(data) // bytes_per_sample, numpy.dtype(dt).itemsize), dtype="V1"
466
+ )
467
+ if is_big_endian:
468
+ a[:, :bytes_per_sample] = data.reshape((-1, bytes_per_sample))
469
+ else:
470
+ a[:, -bytes_per_sample:] = data.reshape((-1, bytes_per_sample))
471
+ data = a.view(dt).reshape(a.shape[:-1])
472
+ else:
473
+ if bytes_per_sample in {1, 2, 4, 8}:
474
+ start = fid.tell()
475
+ data = numpy.memmap(
476
+ fid, dtype=dtype, mode="c", offset=start, shape=(n_samples,)
477
+ )
478
+ fid.seek(start + size)
479
+ else:
480
+ raise ValueError(
481
+ "mmap=True not compatible with "
482
+ f"{bytes_per_sample}-byte container size."
483
+ )
484
+
485
+ _handle_pad_byte(fid, size)
486
+
487
+ if channels > 1:
488
+ data = data.reshape(-1, channels)
489
+ return data
490
+
491
+
492
+ def _skip_unknown_chunk(fid, is_big_endian):
493
+ if is_big_endian:
494
+ fmt = ">I"
495
+ else:
496
+ fmt = "<I"
497
+
498
+ data = fid.read(4)
499
+ # call unpack() and seek() only if we have really read data from file
500
+ # otherwise empty read at the end of the file would trigger
501
+ # unnecessary exception at unpack() call
502
+ # in case data equals somehow to 0, there is no need for seek() anyway
503
+ if data:
504
+ size = struct.unpack(fmt, data)[0]
505
+ fid.seek(size, 1)
506
+ _handle_pad_byte(fid, size)
507
+
508
+
509
+ def _read_riff_chunk(fid):
510
+ str1 = fid.read(4) # File signature
511
+ if str1 == b"RIFF":
512
+ is_big_endian = False
513
+ fmt = "<I"
514
+ elif str1 == b"RIFX":
515
+ is_big_endian = True
516
+ fmt = ">I"
517
+ else:
518
+ # There are also .wav files with "FFIR" or "XFIR" signatures?
519
+ raise ValueError(
520
+ f"File format {repr(str1)} not understood. Only "
521
+ "'RIFF' and 'RIFX' supported."
522
+ )
523
+
524
+ # Size of entire file
525
+ file_size = struct.unpack(fmt, fid.read(4))[0] + 8
526
+
527
+ str2 = fid.read(4)
528
+ if str2 != b"WAVE":
529
+ raise ValueError(f"Not a WAV file. RIFF form type is {repr(str2)}.")
530
+
531
+ return file_size, is_big_endian
532
+
533
+
534
+ def _handle_pad_byte(fid, size):
535
+ # "If the chunk size is an odd number of bytes, a pad byte with value zero
536
+ # is written after ckData." So we need to seek past this after each chunk.
537
+ if size % 2:
538
+ fid.seek(1, 1)
539
+
540
+
541
def read(filename, mmap=False):
    """
    Open a WAV file.

    Return the sample rate (in samples/sec) and data from an LPCM WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Input WAV file. An open handle is rewound to offset 0 before
        returning; a path is opened and closed by this function.
    mmap : bool, optional
        Whether to read data as memory-mapped (default: False).  Not compatible
        with some bit depths; see Notes.  Only to be used on real files; it is
        forced to False when a file handle is passed.

        .. versionadded:: 0.12.0

    Returns
    -------
    rate : int
        Sample rate of WAV file.
    data : numpy array
        Data read from WAV file. Data-type is determined from the file;
        see Notes.  Data is 1-D for 1-channel WAV, or 2-D of shape
        (Nsamples, Nchannels) otherwise. If a file-like input without a
        C-like file descriptor (e.g., :class:`python:io.BytesIO`) is
        passed, this will not be writeable.

    Raises
    ------
    ValueError
        If the file is not a RIFF/RIFX WAVE file, ends before a data chunk
        was read, has a data chunk before any fmt chunk, contains no data
        chunk at all, or uses an unsupported format.

    Warns
    -----
    WavFileWarning
        If the file ends before the size declared in its header but after the
        data chunk was read, or if an unrecognized chunk is skipped.

    Notes
    -----
    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit integer PCM     -2147483648  +2147483647  int32
    24-bit integer PCM     -2147483648  +2147483392  int32
    16-bit integer PCM     -32768       +32767       int16
    8-bit integer PCM      0            255          uint8
    =====================  ===========  ===========  =============

    WAV files can specify arbitrary bit depth, and this function supports
    reading any integer PCM depth from 1 to 64 bits.  Data is returned in the
    smallest compatible numpy int type, in left-justified format.  8-bit and
    lower is unsigned, while 9-bit and higher is signed.

    For example, 24-bit data will be stored as int32, with the MSB of the
    24-bit data stored at the MSB of the int32, and typically the least
    significant byte is 0x00.  (However, if a file actually contains data past
    its specified bit depth, those bits will be read and output, too. [2]_)

    This bit justification and sign matches WAV's native internal format, which
    allows memory mapping of WAV files that use 1, 2, 4, or 8 bytes per sample
    (so 24-bit files cannot be memory-mapped, but 32-bit can).

    IEEE float PCM in 32- or 64-bit format is supported, with or without mmap.
    Values exceeding [-1, +1] are not clipped.

    Non-linear PCM (mu-law, A-law) is not supported.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html
    .. [2] Adobe Systems Incorporated, "Adobe Audition 3 User Guide", section
       "Audio file formats: 24-bit Packed Int (type 1, 20-bit)", 2007

    Examples
    --------
    The examples below are written against ``scipy.io.wavfile``.

    >>> from os.path import dirname, join as pjoin
    >>> from scipy.io import wavfile
    >>> import scipy.io

    Get the filename for an example .wav file from the tests/data directory.

    >>> data_dir = pjoin(dirname(scipy.io.__file__), 'tests', 'data')
    >>> wav_fname = pjoin(data_dir, 'test-44100Hz-2ch-32bit-float-be.wav')

    Load the .wav file contents.

    >>> samplerate, data = wavfile.read(wav_fname)
    >>> print(f"number of channels = {data.shape[1]}")
    number of channels = 2
    >>> length = data.shape[0] / samplerate
    >>> print(f"length = {length}s")
    length = 0.01s

    Plot the waveform.

    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>> time = np.linspace(0., length, data.shape[0])
    >>> plt.plot(time, data[:, 0], label="Left channel")
    >>> plt.plot(time, data[:, 1], label="Right channel")
    >>> plt.legend()
    >>> plt.xlabel("Time [s]")
    >>> plt.ylabel("Amplitude")
    >>> plt.show()

    """
    is_file_like = hasattr(filename, "read")
    if is_file_like:
        fid = filename
        mmap = False
    else:
        # pylint: disable=consider-using-with
        fid = open(filename, "rb")

    try:
        file_size, is_big_endian = _read_riff_chunk(fid)
        fmt_chunk_received = False
        data_chunk_received = False
        while fid.tell() < file_size:
            # read the next chunk
            chunk_id = fid.read(4)

            if not chunk_id:
                if data_chunk_received:
                    # End of file but data successfully read
                    warnings.warn(
                        f"Reached EOF prematurely; finished at {fid.tell()} bytes, "
                        f"expected {file_size} bytes from header.",
                        WavFileWarning,
                        stacklevel=2,
                    )
                    break

                raise ValueError("Unexpected end of file.")
            if len(chunk_id) < 4:
                msg = f"Incomplete chunk ID: {repr(chunk_id)}"
                # If we have the data, ignore the broken chunk
                if fmt_chunk_received and data_chunk_received:
                    warnings.warn(msg + ", ignoring it.", WavFileWarning, stacklevel=2)
                else:
                    raise ValueError(msg)

            if chunk_id == b"fmt ":
                fmt_chunk_received = True
                fmt_chunk = _read_fmt_chunk(fid, is_big_endian)
                format_tag, channels, fs = fmt_chunk[1:4]
                bit_depth = fmt_chunk[6]
                block_align = fmt_chunk[5]
            elif chunk_id == b"fact":
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id == b"data":
                data_chunk_received = True
                if not fmt_chunk_received:
                    raise ValueError("No fmt chunk before data")
                data = _read_data_chunk(
                    fid,
                    format_tag,
                    channels,
                    bit_depth,
                    is_big_endian,
                    block_align,
                    mmap,
                )
            elif chunk_id == b"LIST":
                # Someday this could be handled properly but for now skip it
                _skip_unknown_chunk(fid, is_big_endian)
            elif chunk_id in {b"JUNK", b"Fake"}:
                # Skip alignment chunks without warning
                _skip_unknown_chunk(fid, is_big_endian)
            else:
                warnings.warn(
                    "Chunk (non-data) not understood, skipping it.",
                    WavFileWarning,
                    stacklevel=2,
                )
                _skip_unknown_chunk(fid, is_big_endian)

        # Without this check a file that never reaches a data chunk (for
        # example a header whose declared size ends the loop early) would fail
        # at the return statement with an UnboundLocalError.
        if not data_chunk_received:
            raise ValueError("No data chunk found in WAV file.")
    finally:
        if is_file_like:
            fid.seek(0)
        else:
            fid.close()

    return fs, data
719
+
720
+
721
def write(filename, rate, data):
    """
    Write a NumPy array as a WAV file.

    Parameters
    ----------
    filename : string or open file handle
        Output wav file. A path is opened in binary write mode and closed
        afterwards; an open handle is rewound to offset 0 instead.
    rate : int
        The sample rate (in samples/sec).
    data : ndarray
        A 1-D or 2-D NumPy array of either integer or float data-type.

    Raises
    ------
    ValueError
        If the dtype is not signed integer, float, or 1-byte unsigned
        integer, or if header plus data exceed the 32-bit RIFF size field.

    Notes
    -----
    * Writes a simple uncompressed WAV file.
    * To write multiple-channels, use a 2-D array of shape
      (Nsamples, Nchannels).
    * The bits-per-sample and PCM/float will be determined by the data-type.

    Common data types: [1]_

    =====================  ===========  ===========  =============
         WAV format            Min          Max       NumPy dtype
    =====================  ===========  ===========  =============
    32-bit floating-point  -1.0         +1.0         float32
    32-bit PCM             -2147483648  +2147483647  int32
    16-bit PCM             -32768       +32767       int16
    8-bit PCM              0            255          uint8
    =====================  ===========  ===========  =============

    Note that 8-bit PCM is unsigned.

    References
    ----------
    .. [1] IBM Corporation and Microsoft Corporation, "Multimedia Programming
       Interface and Data Specifications 1.0", section "Data Format of the
       Samples", August 1991
       http://www.tactilemedia.com/info/MCI_Control_Info.html

    Examples
    --------
    Create a 100Hz sine wave, sampled at 44100Hz.
    Write to 16-bit PCM, Mono.

    >>> from scipy.io.wavfile import write
    >>> samplerate = 44100; fs = 100
    >>> t = np.linspace(0., 1., samplerate)
    >>> amplitude = np.iinfo(np.int16).max
    >>> data = amplitude * np.sin(2. * np.pi * fs * t)
    >>> write("example.wav", samplerate, data.astype(np.int16))

    """
    owns_handle = not hasattr(filename, "write")
    if owns_handle:
        # pylint: disable=consider-using-with
        fid = open(filename, "wb")
    else:
        fid = filename

    fs = rate

    try:
        dkind = data.dtype.kind
        is_supported = dkind in ("i", "f") or (
            dkind == "u" and data.dtype.itemsize == 1
        )
        if not is_supported:
            raise ValueError(f"Unsupported data type '{data.dtype}'")

        # After the check above, "not an integer kind" means float data.
        is_pcm = dkind in ("i", "u")

        # fmt chunk
        format_tag = WAVE_FORMAT.PCM if is_pcm else WAVE_FORMAT.IEEE_FLOAT
        channels = 1 if data.ndim == 1 else data.shape[1]
        bit_depth = data.dtype.itemsize * 8
        sample_bytes = bit_depth // 8
        bytes_per_second = fs * sample_bytes * channels
        block_align = channels * sample_bytes

        fmt_chunk_data = struct.pack(
            "<HHIIHH",
            format_tag,
            channels,
            fs,
            bytes_per_second,
            block_align,
            bit_depth,
        )
        if not is_pcm:
            # add cbSize field for non-PCM files
            fmt_chunk_data += b"\x00\x00"

        header_parts = [
            b"RIFF",
            b"\x00\x00\x00\x00",  # placeholder, patched once the size is known
            b"WAVE",
            b"fmt ",
            struct.pack("<I", len(fmt_chunk_data)),
            fmt_chunk_data,
        ]

        # fact chunk (non-PCM files)
        if not is_pcm:
            header_parts.append(b"fact")
            header_parts.append(struct.pack("<II", 4, data.shape[0]))

        header_data = b"".join(header_parts)

        # check data size (needs to be immediately before the data chunk)
        riff_payload = (len(header_data) - 4 - 4) + (4 + 4 + data.nbytes)
        if riff_payload > 0xFFFFFFFF:
            raise ValueError("Data exceeds wave file size limit")

        fid.write(header_data)

        # data chunk
        fid.write(b"data")
        fid.write(struct.pack("<I", data.nbytes))
        byteorder = data.dtype.byteorder
        stored_big_endian = byteorder == ">" or (
            byteorder == "=" and sys.byteorder == "big"
        )
        if stored_big_endian:
            # The header is written little-endian, so the samples must be too.
            data = data.byteswap()
        _array_tofile(fid, data)

        # Determine file size and place it in correct
        # position at start of the file.
        size = fid.tell()
        fid.seek(4)
        fid.write(struct.pack("<I", size - 8))

    finally:
        if owns_handle:
            fid.close()
        else:
            fid.seek(0)
856
+
857
+
858
+ def _array_tofile(fid, data):
859
+ # ravel gives a c-contiguous buffer
860
+ fid.write(data.ravel().view("b").data)