nkululeko 0.76.0__py3-none-any.whl → 0.77.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nkululeko/augment.py +1 -1
- nkululeko/augmenting/augmenter.py +1 -1
- nkululeko/augmenting/randomsplicer.py +1 -1
- nkululeko/augmenting/resampler.py +4 -9
- nkululeko/autopredict/ap_age.py +2 -4
- nkululeko/autopredict/ap_arousal.py +2 -4
- nkululeko/autopredict/ap_dominance.py +2 -4
- nkululeko/autopredict/ap_gender.py +2 -4
- nkululeko/autopredict/ap_mos.py +2 -4
- nkululeko/autopredict/ap_pesq.py +2 -4
- nkululeko/autopredict/ap_sdr.py +2 -4
- nkululeko/autopredict/ap_snr.py +2 -4
- nkululeko/autopredict/ap_stoi.py +2 -4
- nkululeko/autopredict/ap_valence.py +2 -4
- nkululeko/constants.py +1 -1
- nkululeko/data/dataset.py +8 -5
- nkululeko/demo.py +4 -10
- nkululeko/demo_predictor.py +1 -1
- nkululeko/experiment.py +10 -5
- nkululeko/explore.py +6 -13
- nkululeko/export.py +14 -25
- nkululeko/feat_extract/feats_analyser.py +121 -17
- nkululeko/feat_extract/feats_clap.py +4 -10
- nkululeko/feat_extract/feats_import.py +2 -4
- nkululeko/feat_extract/feats_mld.py +4 -9
- nkululeko/feat_extract/feats_mos.py +5 -13
- nkululeko/feat_extract/feats_oxbow.py +5 -12
- nkululeko/feat_extract/feats_snr.py +3 -7
- nkululeko/feat_extract/feats_squim.py +5 -13
- nkululeko/feat_extract/feats_trill.py +5 -13
- nkululeko/feat_extract/featureset.py +2 -4
- nkululeko/feat_extract/feinberg_praat.py +1 -1
- nkululeko/feature_extractor.py +1 -1
- nkululeko/file_checker.py +5 -5
- nkululeko/filter_data.py +6 -16
- nkululeko/modelrunner.py +1 -1
- nkululeko/models/model.py +1 -1
- nkululeko/models/model_cnn.py +1 -1
- nkululeko/models/model_mlp.py +1 -1
- nkululeko/models/model_mlp_regression.py +1 -1
- nkululeko/nkululeko.py +5 -13
- nkululeko/plots.py +40 -3
- nkululeko/predict.py +5 -13
- nkululeko/reporter.py +1 -1
- nkululeko/reporting/latex_writer.py +14 -9
- nkululeko/reporting/report.py +2 -1
- nkululeko/resample.py +5 -13
- nkululeko/runmanager.py +1 -1
- nkululeko/scaler.py +1 -1
- nkululeko/segment.py +1 -1
- nkululeko/segmenting/seg_silero.py +3 -5
- nkululeko/test.py +4 -10
- nkululeko/test_predictor.py +1 -1
- nkululeko/utils/stats.py +8 -0
- {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/METADATA +12 -1
- nkululeko-0.77.1.dist-info/RECORD +104 -0
- nkululeko/balancer.py +0 -1
- nkululeko/split/__init__.py +0 -3
- nkululeko/split/example_binning.py +0 -27
- nkululeko/split/example_trainDevTestSplit.py +0 -81
- nkululeko/split/example_trainTestSplit.py +0 -77
- nkululeko/split/split_utils.py +0 -528
- nkululeko-0.76.0.dist-info/RECORD +0 -110
- /nkululeko/{util.py → utils/util.py} +0 -0
- {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/LICENSE +0 -0
- {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/WHEEL +0 -0
- {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/top_level.txt +0 -0
nkululeko/split/example_trainDevTestSplit.py
DELETED
@@ -1,81 +0,0 @@
-"""
-Code copyright by Uwe Reichel
-"""
-
-# import json
-import pandas as pd
-import audb
-from split_utils import optimize_traindevtest_split
-
-# define train/dev/testset split on emodb, that is:
-# - speaker disjunct
-# - optimally stratified on emotion
-# - optimally stratified on gender
-# - optimally stratified on transcriptions
-# - that contains 10% of the speakers in both dev and test set
-# - and approximately 10% of the files in both dev and test set
-
-
-# data
-db = audb.load(
-    "emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
-)
-df_emotion = db["emotion"].get()
-df_files = db["files"].get()
-df_speaker = db["speaker"].get()
-
-df = pd.concat([df_emotion, df_files], axis=1, join="inner")
-
-
-def spk2gender(x):
-    if x in [8, 9, 13, 14, 16]:
-        return "female"
-    return "male"
-
-
-df["gender"] = df["speaker"].map(spk2gender)
-
-# seed, dev and test proportion, number of different splits
-seed = 42
-dev_size = 0.2
-test_size = 0.2
-k = 30
-
-# targets
-emotion = df["emotion"].to_numpy()
-
-# on which variable to split
-speaker = df["speaker"].to_numpy()
-
-# on which variables (targets, groupings) to stratify
-stratif_vars = {
-    "emotion": emotion,
-    "gender": df["gender"].to_numpy(),
-    "transcription": df["transcription"].to_numpy(),
-}
-
-# weights for all stratify_on variables and
-# and for dev and test proportion match. Give target
-# variable EMOTION more weight than groupings.
-weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}
-
-# find optimal dev and test indices DEV_I and TEST_I in DF
-# info: dict with goodness of split information
-train_i, dev_i, test_i, info = optimize_traindevtest_split(
-    X=df,
-    y=emotion,
-    split_on=speaker,
-    stratify_on=stratif_vars,
-    weight=weight,
-    dev_size=dev_size,
-    test_size=test_size,
-    k=k,
-    seed=seed,
-)
-
-print("dev split of DF:")
-print(df.iloc[dev_i])
-print("dev split of target variable:")
-print(emotion[dev_i])
-print("goodness of split:")
-print(info)
nkululeko/split/example_trainTestSplit.py
DELETED
@@ -1,77 +0,0 @@
-"""
-Code copyright by Uwe Reichel
-"""
-
-import pandas as pd
-import audb
-from split_utils import optimize_traintest_split
-
-# define testset on emodb, that is:
-# - speaker disjunct
-# - optimally stratified on emotion
-# - optimally stratified on gender
-# - optimally stratified on transcriptions
-# - that contains 10% of the speakers
-# - and approximately 10% of the files
-
-# data
-db = audb.load(
-    "emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
-)
-df_emotion = db["emotion"].get()
-df_files = db["files"].get()
-df_speaker = db["speaker"].get()
-df = pd.concat([df_emotion, df_files], axis=1, join="inner")
-
-
-def spk2gender(x):
-    if x in [8, 9, 13, 14, 16]:
-        return "female"
-    return "male"
-
-
-df["gender"] = df["speaker"].map(spk2gender)
-
-# seed, test proportion, number of different splits
-seed = 42
-test_size = 0.2
-k = 30
-
-# targets
-emotion = df["emotion"].to_numpy()
-
-# on which variable to split
-
-speaker = df["speaker"].to_numpy()
-
-# on which variables (targets, groupings) to stratify
-stratif_vars = {
-    "emotion": emotion,
-    "gender": df["gender"].to_numpy(),
-    "transcription": df["transcription"].to_numpy(),
-}
-
-# weights for all stratify_on variables and
-# and for test proportion match. Give target
-# variable EMOTION more weight than groupings.
-weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}
-
-# find optimal test indices TEST_I in DF
-# info: dict with goodness of split information
-train_i, test_i, info = optimize_traintest_split(
-    X=df,
-    y=emotion,
-    split_on=speaker,
-    stratify_on=stratif_vars,
-    weight=weight,
-    test_size=test_size,
-    k=k,
-    seed=seed,
-)
-
-print("test split of DF:")
-print(df.iloc[test_i])
-print("test split of target variable:")
-print(emotion[test_i])
-print("goodness of split:")
-print(info)
nkululeko/split/split_utils.py
DELETED
@@ -1,528 +0,0 @@
-"""
-Code copyright by Uwe Reichel
-"""
-
-from collections import Counter
-import numpy as np
-import pandas as pd
-import scipy.spatial as ssp
-from sklearn.model_selection import GroupShuffleSplit
-import sys
-
-
-def optimize_traindevtest_split(
-    X, y, split_on, stratify_on, weight=None, dev_size=0.1, test_size=0.1, k=30, seed=42
-):
-    """optimize group-disjunct split into training, dev, and test set, which is guided by:
-    - disjunct split of values in SPLIT_ON
-    - stratification by all keys in STRATIFY_ON (targets and groupings)
-    - test set proportion in X should be close to test_size (which is the test
-      proportion in set(split_on))
-
-    Score to be minimized: (sum_v[w(v) * max_irad(v)] + w(d) * max_d) / (sum_v[w(v)] + w(d))
-    (v: variables to be stratified on
-     w(v): their weight
-     max_irad(v): maximum information radius of reference distribution of classes in v and
-        - dev set distribution,
-        - test set distribution
-     N(v): number of stratification variables
-     max_d: maximum of absolute difference between dev and test sizes of X and set(split_on)
-     w(d): its weight
-
-    Args:
-    X: (pd.DataFrame) of features/groupings for which best split
-        is to be calculated. Of shape (N, M)
-    y: (np.array) of targets of length N
-        if type(y[0]) in ["str", "int"]: y is assumed to be categorical, so that it is additionally
-        tested that all partitions cover all classes. Else y is assumed to be numeric and no
-        coverage test is done.
-    split_on: (np.array) list of length N with grouping variable (e.g. speaker IDs),
-        on which the group-disjunct split is to be performed. Must be categorical.
-    stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings)
-        the split should be stratified on (groupings could e.g. be sex, age class, etc).
-        Dict-Values are np.array-s of length N that contain the variable values. All
-        variables must be categorical.
-    weight: (dict) weight for each variable in stratify_on. Defines their amount of
-        contribution to the optimization score. Uniform weighting by default. Additional
-        key: "size_diff" defines how the corresponding size differences should be weighted.
-    dev_size: (float) proportion in set(split_on) for dev set, e.g. 10% of speakers
-        to be held-out
-    test_size: (float) test proportion in set(split_on) for test set
-    k: (int) number of different splits to be tried out
-    seed: (int) random seed
-    Returns:
-    train_i: (np.array) train set indices in X
-    dev_i: (np.array) dev set indices in X
-    test_i: (np.array) test set indices in X
-    info: (dict) detail information about reference and achieved prob distributions
-        "dev_size_in_spliton": intended grouping dev_size
-        "dev_size_in_X": optimized dev proportion of observations in X
-        "test_size_in_spliton": intended grouping test_size
-        "test_size_in_X": optimized test proportion of observations in X
-        "p_ref_{c}": reference class distribution calculated from stratify_on[c]
-        "p_dev_{c}": dev set class distribution calculated from stratify_on[c][dev_i]
-        "p_test_{c}": test set class distribution calculated from stratify_on[c][test_i]
-    """
-
-    # data size
-    N = len(y)
-
-    # categorical target: number of classes for coverage test
-    if is_categorical(y[0]):
-        nc = len(set(y))
-    else:
-        nc = None
-
-    # adjusted dev_size after having split off the test set
-    dev_size_adj = (dev_size * N) / (N - test_size * N)
-
-    # split all into train/dev vs test
-    gss_o = GroupShuffleSplit(n_splits=k, test_size=test_size, random_state=seed)
-
-    # split train/dev into train vs dev
-    gss_i = GroupShuffleSplit(n_splits=k, test_size=dev_size_adj, random_state=seed)
-
-    # set weight defaults
-    if weight is None:
-        weight = {}
-    for c in stratify_on.keys():
-        if c not in weight:
-            weight[c] = 1
-    if "size_diff" not in weight:
-        weight["size_diff"] = 1
-
-    # stratification reference distributions calculated on stratify_on
-    p_ref = {}
-    for c in stratify_on:
-        p_ref[c] = class_prob(stratify_on[c])
-
-    # best train/dev/test indices in X; best associated score
-    train_i, dev_i, test_i, best_sco = None, None, None, np.inf
-
-    # full target coverage in all partitions
-    full_target_coverage = False
-
-    # brute-force optimization of SPLIT_ON split
-    # outer loop *_o: splitting into train/dev and test
-    # inner loop *_i: spltting into train and dev
-    for tri_o, tei_o in gss_o.split(X, y, split_on):
-        # current train/dev partition
-        X_i = X.iloc[tri_o]
-        y_i = y[tri_o]
-        split_on_i = split_on[tri_o]
-
-        for tri_i, tei_i in gss_i.split(X_i, y_i, split_on_i):
-            # all classes maintained in all partitions?
-            if nc:
-                nc_train = len(set(y[tri_o[tri_i]]))
-                nc_dev = len(set(y[tri_o[tei_i]]))
-                nc_test = len(set(y[tei_o]))
-                if min(nc_train, nc_dev, nc_test) < nc:
-                    continue
-
-            full_target_coverage = True
-
-            sco = calc_split_score(
-                test_i=tei_o,
-                stratify_on=stratify_on,
-                weight=weight,
-                p_ref=p_ref,
-                N=N,
-                test_size=test_size,
-                dev_i=tri_o[tei_i],
-                dev_size=dev_size_adj,
-            )
-
-            if sco < best_sco:
-                best_sco = sco
-                test_i = tei_o
-                train_i = tri_o[tri_i]
-                dev_i = tri_o[tei_i]
-
-    if test_i is None:
-        sys.exit(exit_message(full_target_coverage, "dev and test"))
-
-    # matching info
-    info = {
-        "score": best_sco,
-        "size_devset_in_spliton": dev_size,
-        "size_devset_in_X": np.round(len(dev_i) / N, 2),
-        "size_testset_in_spliton": test_size,
-        "size_testset_in_X": np.round(len(test_i) / N, 2),
-    }
-
-    for c in p_ref:
-        info[f"p_{c}_ref"] = p_ref[c]
-        info[f"p_{c}_dev"] = class_prob(stratify_on[c][dev_i])
-        info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i])
-
-    return train_i, dev_i, test_i, info
-
-
-def optimize_traintest_split(
-    X, y, split_on, stratify_on, weight=None, test_size=0.1, k=30, seed=42
-):
-    """optimize group-disjunct split which is guided by:
-    - disjunct split of values in SPLIT_ON
-    - stratification by all keys in STRATIFY_ON (targets and groupings)
-    - test set proportion in X should be close to test_size (which is the test
-      proportion in set(split_on))
-
-    Score to be minimized: (sum_v[w(v) * irad(v)] + w(d) * d) / (sum_v[w(v)] + w(d))
-    (v: variables to be stratified on
-     w(v): their weight
-     irad(v): information radius between reference distribution of classes in v
-        and test set distribution
-     N(v): number of stratification variables
-     d: absolute difference between test sizes of X and set(split_on)
-     w(d): its weight
-
-    Args:
-    X: (pd.DataFrame) of features/groupings for which best split
-        is to be calculated. Of shape (N, M)
-    y: (np.array) of targets of length N
-        if type(y[0]) in ["str", "int"]: y is assumed to be categorical, so that it is additionally
-        tested that all partitions cover all classes. Else y is assumed to be numeric and no
-        coverage test is done.
-    split_on: (np.array) list of length N with grouping variable (e.g. speaker IDs),
-        on which the group-disjunct split is to be performed. Must be categorical.
-    stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings)
-        the split should be stratified on (groupings could e.g. be sex, age class, etc).
-        Dict-Values are np.array-s of length N that contain the variable values. All
-        variables must be categorical.
-    weight: (dict) weight for each variable in stratify_on. Defines their amount of
-        contribution to the optimization score. Uniform weighting by default. Additional
-        key: "size_diff" defines how test size diff should be weighted.
-    test_size: (float) test proportion in set(split_on), e.g. 10% of speakers to be held-out
-    k: (int) number of different splits to be tried out
-    seed: (int) random seed
-    Returns:
-    train_i: (np.array) train set indices in X
-    test_i: (np.array) test set indices in X
-    info: (dict) detail information about reference and achieved prob distributions
-        "size_testset_in_spliton": intended test_size
-        "size_testset_in_X": optimized test proportion in X
-        "p_ref_{c}": reference class distribution calculated from stratify_on[c]
-        "p_test_{c}": test set class distribution calculated from stratify_on[c][test_i]
-    """
-
-    gss = GroupShuffleSplit(n_splits=k, test_size=test_size, random_state=seed)
-
-    # set weight defaults
-    if weight is None:
-        weight = {}
-    for c in stratify_on.keys():
-        if c not in weight:
-            weight[c] = 1
-    if "size_diff" not in weight:
-        weight["size_diff"] = 1
-
-    # stratification reference distributions calculated on stratify_on
-    p_ref = {}
-    for c in stratify_on:
-        p_ref[c] = class_prob(stratify_on[c])
-
-    # best train and test indices in X; best associated score
-    train_i, test_i, best_sco = None, None, np.inf
-
-    # data size
-    N = len(y)
-
-    # full target coverage in all partitions
-    full_target_coverage = False
-
-    # categorical target: number of classes for coverage test
-    if is_categorical(y[0]):
-        nc = len(set(y))
-    else:
-        nc = None
-
-    # brute-force optimization of SPLIT_ON split
-    for tri, tei in gss.split(X, y, split_on):
-        # all classes maintained in all partitions?
-        if nc:
-            nc_train = len(set(y[tri]))
-            nc_test = len(set(y[tei]))
-            if min(nc_train, nc_test) < nc:
-                continue
-
-        full_target_coverage = True
-
-        sco = calc_split_score(tei, stratify_on, weight, p_ref, N, test_size)
-        if sco < best_sco:
-            train_i, test_i, best_sco = tri, tei, sco
-
-    if test_i is None:
-        sys.exit(exit_message(full_target_coverage))
-
-    # matching info
-    info = {
-        "score": best_sco,
-        "size_testset_in_spliton": test_size,
-        "size_testset_in_X": np.round(len(test_i) / N, 2),
-    }
-
-    for c in p_ref:
-        info[f"p_{c}_ref"] = p_ref[c]
-        info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i])
-
-    return train_i, test_i, info
-
-
-def calc_split_score(
-    test_i, stratify_on, weight, p_ref, N, test_size, dev_i=None, dev_size=None
-):
-    """calculate split score based on class distribution IRADs and
-    differences in partition sizes of groups vs observations; smaller is better.
-    If dev_i and dev_size are not provided, the score is calculated for the train/test
-    split only. If they are provided the score is calculated for the train/dev/test split
-    Args:
-    test_i: (np.array) of test set indices
-    stratify_on: (dict) Dict-keys are variable names (targets and/or further groupings)
-        the split should be stratified on (groupings could e.g. be sex, age class, etc).
-        Dict-Values are np.array-s of length N that contain the variable values.
-    weight: (dict) weight for each variable in stratify_on. Additional
-        key: "size_diff" that weights the grouping vs observation level test set size difference
-    p_ref: (dict) reference class distributions for all variables in stratify_on
-    N: (int) size of underlying data set
-    test_size: (float) test proportion in value set of variable, the disjunct grouping
-        has been carried out
-    dev_i: (np.array) of dev test indices
-    dev_size: (float) dev proportion in value set of variable, the disjunct grouping
-        has been carried out (this value should have been adjusted after splitting off the
-        test set)
-    """
-
-    if dev_i is None:
-        do_dev = False
-    else:
-        do_dev = True
-
-    # dev and test set class distributions
-    p_test, p_dev = {}, {}
-    for c in p_ref:
-        p_test[c] = class_prob(stratify_on[c][test_i])
-        if do_dev:
-            p_dev[c] = class_prob(stratify_on[c][dev_i])
-
-    # score
-    sco, wgt = 0, 0
-
-    # IRADs (if p_test[c] or p_dec[c] do not contain
-    # all classes in p_ref[c], return INF)
-    for c in p_ref:
-        irad, full_coverage = calc_irad(p_ref[c], p_test[c])
-        if not full_coverage:
-            return np.inf
-        if do_dev:
-            irad_dev, full_coverage = calc_irad(p_ref[c], p_dev[c])
-            if not full_coverage:
-                return np.inf
-            irad = max(irad, irad_dev)
-
-        sco += weight[c] * irad
-        wgt += weight[c]
-
-    # partition size difference groups vs observations
-    size_diff = np.abs(len(test_i) / N - test_size)
-    if do_dev:
-        size_diff_dev = np.abs(len(dev_i) / N - dev_size)
-        size_diff = max(size_diff, size_diff_dev)
-
-    sco += weight["size_diff"] * size_diff
-    wgt += weight["size_diff"]
-
-    sco /= wgt
-
-    return sco
-
-
-def calc_irad(p1, p2):
-    """calculate information radius of prob dicts p1 and p2
-    Args:
-    p1, p2: (dict) of probabilities
-    Returns:
-    ir: (float) information radius
-    full_coverage: (bool) True if all elements in p1 occur in p2
-        and vice versa
-    """
-
-    p, q = [], []
-    full_coverage = True
-
-    for u in sorted(p1.keys()):
-        if u not in p2:
-            full_coverage = False
-            a = 0.0
-        else:
-            a = p2[u]
-
-        p.append(p1[u])
-        q.append(a)
-
-    if full_coverage:
-        if len(p2.keys()) > len(p1.keys()):
-            full_coverage = False
-
-    irad = ssp.distance.jensenshannon(p, q)
-
-    return irad, full_coverage
-
-
-def class_prob(y):
-    """returns class probabilities in y
-    Args:
-    y (array-like) of classes
-    Returns:
-    p (dict) assigning to each class in Y its maximum likelihood
-    """
-
-    p = {}
-    N = len(y)
-    c = Counter(y)
-    for x in c:
-        p[x] = c[x] / N
-
-    return p
-
-
-def is_categorical(x):
-    """returns True if type of x is in str or int*,
-    else False"""
-
-    if type(x) in [
-        str,
-        int,
-        np.int16,
-        np.int32,
-        np.int64,
-        np.uint8,
-        np.uint16,
-        np.uint32,
-    ]:
-        return True
-    return False
-
-
-def dummy_variable(X, columns, specs=None, squeeze_classes=False):
-    """
-    creates dummy variable from binned numeric columns that can be used
-    later for stratification etc.
-
-    Args:
-    X: (pd.DataFrame)
-    columns: (str or list) of numeric column names
-    specs: (dict or str)
-        if nested dict: keys are column names with subdict that contains the
-        arguments for binning(), i.e. n_bins and lower_boundaries
-    squeeze_classes: (boolean) further squeeze classes by sorting the digits
-        within the string.
-        Example: from binning of 3 columns, each into 2 bins, we got
-        "000", "100", "010", "001", "110", "101", "011", "111".
-        These classes are further squeezed by within-string sorting:
-        "000", "001", "011", "111"
-
-    Returns:
-    y: (list) of class strings of length X.shape[0]
-
-    """
-
-    df_bin = pd.DataFrame()
-    if specs is None:
-        specs = {}
-    if type(columns) is str:
-        columns = [columns]
-
-    # bin columns
-    for col in columns:
-        if col not in X.columns:
-            sys.exit(f"column {col} not in dataframe")
-        if col in specs:
-            kwargs = specs[col]
-        else:
-            kwargs = {"nbins": 2}
-        yc = binning(X[col].to_numpy(), **kwargs)
-        df_bin[col] = yc.astype(str)
-
-    # concatenate
-    df_bin["binvar"] = ""
-    for col in columns:
-        df_bin["binvar"] += df_bin[col]
-
-    # squeeze
-    if squeeze_classes:
-
-        def squeezing(x):
-            return "".join(sorted(x))
-
-        df_bin["binvar"] = df_bin["binvar"].apply(squeezing)
-
-    y = df_bin["binvar"].tolist()
-    return y
-
-
-def binning(y, nbins=3, lower_boundaries=None):
-    """
-    bins numeric array y either intrinsically into nbins classes
-    based on an equidistant percentile split, or extrinsically
-    by using the lower_boundaries values.
-
-    Args:
-    y: (np.array) with numeric data
-    nbins: (int) number of bins
-    lower_boundaries: (list) of lower bin boundaries.
-        If provided nbins will be ignored and y is binned
-        extrinsically. The first value of lower_boundaries
-        is always corrected not to be higher than min(y).
-    Returns:
-    yc: (np.array) with bin IDs (integers from 0 to nbins-1)
-    """
-
-    # intrinsic binning by equidistant percentiles
-    if lower_boundaries is None:
-        prct = np.linspace(0, 100, nbins + 1)
-        lower_boundaries = np.percentile(y, prct)
-        lower_boundaries = lower_boundaries[0:nbins]
-    else:
-        # make sure that entire range of y is covered
-        lower_boundaries[0] = min(lower_boundaries[0], np.min(y))

-    # binned array
-    yc = np.zeros(len(y), dtype=int)
-    for i in range(1, len(lower_boundaries)):
-        yc[y >= lower_boundaries[i]] = i
-
-    return yc
-
-
-def optimize_testset_split(
-    X, y, split_on, stratify_on, weight=None, test_size=0.1, k=30, seed=42
-):
-    """backward compatibility"""
-    return optimize_traintest_split(
-        X, y, split_on, stratify_on, weight, test_size, k, seed
-    )
-
-
-def exit_message(full_target_coverage, infx="test"):
-    if not full_target_coverage:
-        return (
-            "not all partitions contain all target classes. What you can do:\n"
-            "(1) increase your dev and/or test partition, or\n"
-            "(2) reduce the amount of target classes by merging some of them."
-        )
-
-    return (
-        f"\n:-o No {infx} set split found. Reason is, that for at least one of the\n"
-        f"stratification variables not all its values can make it into the {infx} set.\n"
-        f"This happens e.g. if the {infx} set size is chosen too small or\n"
-        "if the (multidimensional) distribution of the stratification\n"
-        "variables is sparse. What you can do:\n"
-        "(1) remove a variable from this stratification, or\n"
-        "(2) merge classes within a variable to increase the per class probabilities, or\n"
-        f"(3) increase the {infx} set size, or\n"
-        "(4) increase the number of different splits (if it was small, say < 10, before), or\n"
-        "(5) in case your target is numeric and you have added a binned target array to the\n"
-        " stratification variables: reduce the number of bins.\n"
-        "Good luck!\n"
-    )
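The scoring idea behind the removed calc_split_score/calc_irad helpers above (a weighted mix of the Jensen-Shannon distance between a reference class distribution and a candidate split's distribution, plus a size-difference term) can be illustrated with a minimal standalone sketch. The distributions, proportions, and weights below are hypothetical values, not taken from the package:

import numpy as np
from scipy.spatial.distance import jensenshannon

# hypothetical reference and candidate test-set class distributions
p_ref = {"anger": 0.4, "neutral": 0.6}
p_test = {"anger": 0.35, "neutral": 0.65}
# weights as used in the example scripts ("size_diff" weights the size mismatch)
weight = {"emotion": 2, "size_diff": 1}

# information radius: Jensen-Shannon distance over a shared class order
classes = sorted(p_ref)
irad = jensenshannon([p_ref[c] for c in classes], [p_test[c] for c in classes])

# |achieved - intended| test proportion (hypothetical numbers)
size_diff = abs(0.12 - 0.10)

# weighted average of both terms; the optimizer keeps the split with the smallest score
score = (weight["emotion"] * irad + weight["size_diff"] * size_diff) / (
    weight["emotion"] + weight["size_diff"]
)
print(np.round(score, 4))  # smaller is better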