returnn 1.20250508.181644__py3-none-any.whl → 1.20250514.101430__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of returnn might be problematic. Click here for more details.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/postprocessing.py +11 -4
- returnn/datasets/util/vocabulary.py +17 -1
- {returnn-1.20250508.181644.dist-info → returnn-1.20250514.101430.dist-info}/METADATA +1 -1
- {returnn-1.20250508.181644.dist-info → returnn-1.20250514.101430.dist-info}/RECORD +9 -9
- {returnn-1.20250508.181644.dist-info → returnn-1.20250514.101430.dist-info}/LICENSE +0 -0
- {returnn-1.20250508.181644.dist-info → returnn-1.20250514.101430.dist-info}/WHEEL +0 -0
- {returnn-1.20250508.181644.dist-info → returnn-1.20250514.101430.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
version = '1.20250508.181644'
|
|
2
|
-
long_version = '1.
|
|
1
|
+
version = '1.20250514.101430'
|
|
2
|
+
long_version = '1.20250514.101430+git.c557cc3'
|
|
@@ -308,11 +308,18 @@ class PostprocessingDataset(CachedDataset2):
|
|
|
308
308
|
last_complete_frac = complete_frac
|
|
309
309
|
for data_key, out_t in self._out_tensor_dict_template.data.items():
|
|
310
310
|
in_t = t_dict.data[data_key]
|
|
311
|
-
assert (
|
|
312
|
-
in_t.ndim == out_t.batch_ndim
|
|
313
|
-
and in_t.dtype == out_t.dtype
|
|
314
|
-
and all(d.dimension in (d_, None) for (d, d_) in zip(in_t.dims, out_t.shape))
|
|
311
|
+
assert in_t.ndim == out_t.batch_ndim, (
|
|
312
|
+
f"Dim number mismatch for {data_key}: {in_t.ndim} != {out_t.batch_ndim}. "
|
|
313
|
+
"Postprocessing data tensors must not have a batch dimension."
|
|
315
314
|
)
|
|
315
|
+
assert in_t.dtype == out_t.dtype, (
|
|
316
|
+
f"dtype mismatch for {data_key}: '{in_t.dtype}' != '{out_t.dtype}'"
|
|
317
|
+
)
|
|
318
|
+
for i, (in_dim, out_shape) in enumerate(zip(in_t.dims, out_t.shape)):
|
|
319
|
+
assert in_dim.dimension is None or in_dim.dimension == out_shape, (
|
|
320
|
+
f"Dim {i} mismatch on {data_key}: "
|
|
321
|
+
f"{in_dim.dimension} must either be `None` or equal {out_shape}"
|
|
322
|
+
)
|
|
316
323
|
yield t_dict
|
|
317
324
|
|
|
318
325
|
data_iter = self._iterate_dataset()
|
|
@@ -15,6 +15,7 @@ __all__ = [
|
|
|
15
15
|
|
|
16
16
|
from typing import Optional, Union, Type, Callable, List, Dict
|
|
17
17
|
import sys
|
|
18
|
+
import re
|
|
18
19
|
import numpy
|
|
19
20
|
|
|
20
21
|
from returnn.util.basic import NotSpecified
|
|
@@ -58,6 +59,7 @@ class Vocabulary:
|
|
|
58
59
|
num_labels: Optional[int] = None,
|
|
59
60
|
seq_postfix: Optional[List[int]] = None,
|
|
60
61
|
labels: Optional[Union[List[str], Callable[[], List[str]]]] = None,
|
|
62
|
+
single_whitespace_split: bool = False,
|
|
61
63
|
):
|
|
62
64
|
"""
|
|
63
65
|
:param vocab_file:
|
|
@@ -76,6 +78,11 @@ class Vocabulary:
|
|
|
76
78
|
:param num_labels: just for verification
|
|
77
79
|
:param seq_postfix: labels will be added to the seq in self.get_seq
|
|
78
80
|
:param labels:
|
|
81
|
+
:param single_whitespace_split:
|
|
82
|
+
Assume that the given text is encoded using ``" ".join(labels[i] for i in seq)``,
|
|
83
|
+
and this will undo that.
|
|
84
|
+
This makes a difference when there is whitespace itself in the vocab (in ``labels``).
|
|
85
|
+
If not enabled (the default), this will simply use ``str.split()``.
|
|
79
86
|
"""
|
|
80
87
|
if vocab_file and not isinstance(vocab_file, str): # sometimes it is a Path
|
|
81
88
|
vocab_file = str(vocab_file)
|
|
@@ -131,6 +138,12 @@ class Vocabulary:
|
|
|
131
138
|
self.control_symbol_ids = {name: self.to_id(label) for name, label in (control_symbols or {}).items()}
|
|
132
139
|
self.user_defined_symbol_ids = {name: self.to_id(label) for name, label in (user_defined_symbols or {}).items()}
|
|
133
140
|
self.seq_postfix = seq_postfix or []
|
|
141
|
+
# To be used with findall in get_seq.
|
|
142
|
+
self.decode_seq_token_re = (
|
|
143
|
+
re.compile("(%s|\\S+)(?: |$)" % "|".join(re.escape(v) for v in self.labels))
|
|
144
|
+
if single_whitespace_split
|
|
145
|
+
else None
|
|
146
|
+
)
|
|
134
147
|
|
|
135
148
|
def __repr__(self):
|
|
136
149
|
parts = [repr(self.vocab_file), "num_labels=%s" % self.num_labels]
|
|
@@ -317,7 +330,10 @@ class Vocabulary:
|
|
|
317
330
|
:param sentence: assumed to be seq of vocab entries separated by whitespace
|
|
318
331
|
:return: seq of label indices
|
|
319
332
|
"""
|
|
320
|
-
segments = sentence.split()
|
|
333
|
+
if self.decode_seq_token_re is not None:
|
|
334
|
+
segments = self.decode_seq_token_re.findall(sentence)
|
|
335
|
+
else:
|
|
336
|
+
segments = sentence.split()
|
|
321
337
|
return self.get_seq_indices(segments) + self.seq_postfix
|
|
322
338
|
|
|
323
339
|
def get_seq_indices(self, seq: List[str]) -> List[int]:
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
returnn/PKG-INFO,sha256=
|
|
1
|
+
returnn/PKG-INFO,sha256=AEKRzwj7-1_1NcUCAPSSEPkMoIrXK-7K5NtSOZBfJvk,5215
|
|
2
2
|
returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
|
|
3
3
|
returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
|
|
4
4
|
returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
|
|
5
5
|
returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
|
|
6
|
-
returnn/_setup_info_generated.py,sha256=
|
|
6
|
+
returnn/_setup_info_generated.py,sha256=CfrKH5EWL08ucEeXafiSxPiV-BUoBw--NGoCK_ERZnw,77
|
|
7
7
|
returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
|
|
8
8
|
returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
|
|
9
9
|
returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
|
|
@@ -26,7 +26,7 @@ returnn/datasets/meta.py,sha256=KQtidTgSh-1gNgbpJ8OhXt6v2lkhPPH5dpjfzwsr3E4,9525
|
|
|
26
26
|
returnn/datasets/multi_proc.py,sha256=aVjsLt2qjHnHOrEYCgIPCwNYE-f1fiGP6eZ8NGAr3A4,22583
|
|
27
27
|
returnn/datasets/normalization_data.py,sha256=J3njQCMvWAbIAVPepO2L_Xdau9eWYB7Zyd6STeGzTbc,14615
|
|
28
28
|
returnn/datasets/numpy_dump.py,sha256=wl8bKIKAlff2HPJPtuu5wBg3TLOf16d2wLVB4lLAwTM,5158
|
|
29
|
-
returnn/datasets/postprocessing.py,sha256=
|
|
29
|
+
returnn/datasets/postprocessing.py,sha256=6SfT58BxbHYO2QlGzOgIV04Zqkp-kl0B85168DQaB9A,24060
|
|
30
30
|
returnn/datasets/raw_wav.py,sha256=M7eTHp4CTtLQf3yPTiJY-mSJYgZNxkGV9IFN9J1dq_4,9144
|
|
31
31
|
returnn/datasets/sprint.py,sha256=JAs5dOmdteSOwA7YQcTF9KaTCtGfRjiyJUZClSr85pY,55502
|
|
32
32
|
returnn/datasets/stereo.py,sha256=PkowC91bZWihIYuIZgyGgPcNwgq5jBvyxxu1nER-VhM,17633
|
|
@@ -34,7 +34,7 @@ returnn/datasets/text_dict.py,sha256=BPE73nh6-vtSLy3SiDf4dpFl9RJorE7oO6l5y2FU3MI
|
|
|
34
34
|
returnn/datasets/util/__init__.py,sha256=rEKhSD6fyhDiQF-x7dUQMwa29JZu72SDm7mYcCcLghY,52
|
|
35
35
|
returnn/datasets/util/feature_extraction.py,sha256=axtXDb9wcNpOmyhmW3WJUj5xda29TKkKvOcGGvq7ExA,23923
|
|
36
36
|
returnn/datasets/util/strings.py,sha256=pP8pmXhArkssYqmPOLuxEG9gsko891ZxrWiai86qbLE,412
|
|
37
|
-
returnn/datasets/util/vocabulary.py,sha256=
|
|
37
|
+
returnn/datasets/util/vocabulary.py,sha256=1W13FgxPVP9XSIyhkt4I7CXK5lj99zT97R-gLa_xnAU,27964
|
|
38
38
|
returnn/engine/__init__.py,sha256=br7hpn8i_hIBi2uTQfnN3BF9g5DREYa_mQi0_Nvlu6o,228
|
|
39
39
|
returnn/engine/base.py,sha256=0n4FtB_B2H3W_9KdoLr0P7YPER-hVkbk69pwFqsqmqw,18467
|
|
40
40
|
returnn/engine/batch.py,sha256=amXW8mGspuSQjo00JdisE2eOLy5Ij1weWWzkE-lXSJM,9912
|
|
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
|
|
|
253
253
|
returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
|
|
254
254
|
returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
|
|
255
255
|
returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
|
|
256
|
-
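returnn-1.20250508.181644.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
|
|
257
|
-
returnn-1.20250508.181644.dist-info/METADATA,sha256=
|
|
258
|
-
returnn-1.20250508.181644.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
259
|
-
returnn-1.20250508.181644.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
|
|
260
|
-
returnn-1.20250508.181644.dist-info/RECORD,,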
|
|
256
|
+
returnn-1.20250514.101430.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
|
|
257
|
+
returnn-1.20250514.101430.dist-info/METADATA,sha256=AEKRzwj7-1_1NcUCAPSSEPkMoIrXK-7K5NtSOZBfJvk,5215
|
|
258
|
+
returnn-1.20250514.101430.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
259
|
+
returnn-1.20250514.101430.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
|
|
260
|
+
returnn-1.20250514.101430.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|