nkululeko-0.76.0-py3-none-any.whl → nkululeko-0.77.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (67)
  1. nkululeko/augment.py +1 -1
  2. nkululeko/augmenting/augmenter.py +1 -1
  3. nkululeko/augmenting/randomsplicer.py +1 -1
  4. nkululeko/augmenting/resampler.py +4 -9
  5. nkululeko/autopredict/ap_age.py +2 -4
  6. nkululeko/autopredict/ap_arousal.py +2 -4
  7. nkululeko/autopredict/ap_dominance.py +2 -4
  8. nkululeko/autopredict/ap_gender.py +2 -4
  9. nkululeko/autopredict/ap_mos.py +2 -4
  10. nkululeko/autopredict/ap_pesq.py +2 -4
  11. nkululeko/autopredict/ap_sdr.py +2 -4
  12. nkululeko/autopredict/ap_snr.py +2 -4
  13. nkululeko/autopredict/ap_stoi.py +2 -4
  14. nkululeko/autopredict/ap_valence.py +2 -4
  15. nkululeko/constants.py +1 -1
  16. nkululeko/data/dataset.py +8 -5
  17. nkululeko/demo.py +4 -10
  18. nkululeko/demo_predictor.py +1 -1
  19. nkululeko/experiment.py +10 -5
  20. nkululeko/explore.py +6 -13
  21. nkululeko/export.py +14 -25
  22. nkululeko/feat_extract/feats_analyser.py +121 -17
  23. nkululeko/feat_extract/feats_clap.py +4 -10
  24. nkululeko/feat_extract/feats_import.py +2 -4
  25. nkululeko/feat_extract/feats_mld.py +4 -9
  26. nkululeko/feat_extract/feats_mos.py +5 -13
  27. nkululeko/feat_extract/feats_oxbow.py +5 -12
  28. nkululeko/feat_extract/feats_snr.py +3 -7
  29. nkululeko/feat_extract/feats_squim.py +5 -13
  30. nkululeko/feat_extract/feats_trill.py +5 -13
  31. nkululeko/feat_extract/featureset.py +2 -4
  32. nkululeko/feat_extract/feinberg_praat.py +1 -1
  33. nkululeko/feature_extractor.py +1 -1
  34. nkululeko/file_checker.py +5 -5
  35. nkululeko/filter_data.py +6 -16
  36. nkululeko/modelrunner.py +1 -1
  37. nkululeko/models/model.py +1 -1
  38. nkululeko/models/model_cnn.py +1 -1
  39. nkululeko/models/model_mlp.py +1 -1
  40. nkululeko/models/model_mlp_regression.py +1 -1
  41. nkululeko/nkululeko.py +5 -13
  42. nkululeko/plots.py +40 -3
  43. nkululeko/predict.py +5 -13
  44. nkululeko/reporter.py +1 -1
  45. nkululeko/reporting/latex_writer.py +14 -9
  46. nkululeko/reporting/report.py +2 -1
  47. nkululeko/resample.py +5 -13
  48. nkululeko/runmanager.py +1 -1
  49. nkululeko/scaler.py +1 -1
  50. nkululeko/segment.py +1 -1
  51. nkululeko/segmenting/seg_silero.py +3 -5
  52. nkululeko/test.py +4 -10
  53. nkululeko/test_predictor.py +1 -1
  54. nkululeko/utils/stats.py +8 -0
  55. {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/METADATA +12 -1
  56. nkululeko-0.77.1.dist-info/RECORD +104 -0
  57. nkululeko/balancer.py +0 -1
  58. nkululeko/split/__init__.py +0 -3
  59. nkululeko/split/example_binning.py +0 -27
  60. nkululeko/split/example_trainDevTestSplit.py +0 -81
  61. nkululeko/split/example_trainTestSplit.py +0 -77
  62. nkululeko/split/split_utils.py +0 -528
  63. nkululeko-0.76.0.dist-info/RECORD +0 -110
  64. /nkululeko/{util.py → utils/util.py} +0 -0
  65. {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/LICENSE +0 -0
  66. {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/WHEEL +0 -0
  67. {nkululeko-0.76.0.dist-info → nkululeko-0.77.1.dist-info}/top_level.txt +0 -0
nkululeko/split/example_trainDevTestSplit.py
@@ -1,81 +0,0 @@
- """
- Code copyright by Uwe Reichel
- """
-
- # import json
- import pandas as pd
- import audb
- from split_utils import optimize_traindevtest_split
-
- # define a train/dev/test set split on emodb, that is:
- # - speaker disjunct
- # - optimally stratified on emotion
- # - optimally stratified on gender
- # - optimally stratified on transcriptions
- # - that contains 20% of the speakers in both dev and test set
- # - and approximately 20% of the files in both dev and test set
-
-
- # data
- db = audb.load(
-     "emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
- )
- df_emotion = db["emotion"].get()
- df_files = db["files"].get()
- df_speaker = db["speaker"].get()
-
- df = pd.concat([df_emotion, df_files], axis=1, join="inner")
-
-
- def spk2gender(x):
-     if x in [8, 9, 13, 14, 16]:
-         return "female"
-     return "male"
-
-
- df["gender"] = df["speaker"].map(spk2gender)
-
- # seed, dev and test proportion, number of different splits
- seed = 42
- dev_size = 0.2
- test_size = 0.2
- k = 30
-
- # targets
- emotion = df["emotion"].to_numpy()
-
- # on which variable to split
- speaker = df["speaker"].to_numpy()
-
- # on which variables (targets, groupings) to stratify
- stratif_vars = {
-     "emotion": emotion,
-     "gender": df["gender"].to_numpy(),
-     "transcription": df["transcription"].to_numpy(),
- }
-
- # weights for all stratify_on variables
- # and for the dev and test proportion match. Give the target
- # variable EMOTION more weight than the groupings.
- weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}
-
- # find optimal dev and test indices DEV_I and TEST_I in DF
- # info: dict with goodness-of-split information
- train_i, dev_i, test_i, info = optimize_traindevtest_split(
-     X=df,
-     y=emotion,
-     split_on=speaker,
-     stratify_on=stratif_vars,
-     weight=weight,
-     dev_size=dev_size,
-     test_size=test_size,
-     k=k,
-     seed=seed,
- )
-
- print("dev split of DF:")
- print(df.iloc[dev_i])
- print("dev split of target variable:")
- print(emotion[dev_i])
- print("goodness of split:")
- print(info)
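
The removed script above wraps scikit-learn's GroupShuffleSplit in a stratification search. For orientation only, here is a minimal editorial sketch of that underlying primitive on synthetic data (the speaker IDs and labels are invented); it yields a speaker-disjunct split but none of the stratification or size matching that the removed optimize_traindevtest_split adds:

# Minimal sketch (editorial, synthetic data): a speaker-disjunct split with
# scikit-learn's GroupShuffleSplit, the primitive the removed optimizer wraps.
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

rng = np.random.default_rng(42)
n = 100
speaker = rng.integers(0, 10, size=n)                      # 10 hypothetical speakers
emotion = rng.choice(["anger", "joy", "neutral"], size=n)  # hypothetical labels

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_i, test_i = next(gss.split(np.zeros((n, 1)), emotion, groups=speaker))

# speaker-disjunct by construction: no speaker appears in both partitions
assert set(speaker[train_i]).isdisjoint(speaker[test_i])
print(f"train files: {len(train_i)}, test files: {len(test_i)}")

Everything beyond disjointness (the emotion, gender, and transcription stratification, the dev set handling, and the size matching) is what the removed optimizer contributed on top of this primitive.
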
nkululeko/split/example_trainTestSplit.py
@@ -1,77 +0,0 @@
- """
- Code copyright by Uwe Reichel
- """
-
- import pandas as pd
- import audb
- from split_utils import optimize_traintest_split
-
- # define a test set on emodb, that is:
- # - speaker disjunct
- # - optimally stratified on emotion
- # - optimally stratified on gender
- # - optimally stratified on transcriptions
- # - that contains 20% of the speakers
- # - and approximately 20% of the files
-
- # data
- db = audb.load(
-     "emodb", version="1.3.0", format="wav", sampling_rate=16000, mixdown=True
- )
- df_emotion = db["emotion"].get()
- df_files = db["files"].get()
- df_speaker = db["speaker"].get()
- df = pd.concat([df_emotion, df_files], axis=1, join="inner")
-
-
- def spk2gender(x):
-     if x in [8, 9, 13, 14, 16]:
-         return "female"
-     return "male"
-
-
- df["gender"] = df["speaker"].map(spk2gender)
-
- # seed, test proportion, number of different splits
- seed = 42
- test_size = 0.2
- k = 30
-
- # targets
- emotion = df["emotion"].to_numpy()
-
- # on which variable to split
-
- speaker = df["speaker"].to_numpy()
-
- # on which variables (targets, groupings) to stratify
- stratif_vars = {
-     "emotion": emotion,
-     "gender": df["gender"].to_numpy(),
-     "transcription": df["transcription"].to_numpy(),
- }
-
- # weights for all stratify_on variables
- # and for the test proportion match. Give the target
- # variable EMOTION more weight than the groupings.
- weight = {"emotion": 2, "gender": 1, "transcription": 1, "size_diff": 1}
-
- # find optimal test indices TEST_I in DF
- # info: dict with goodness-of-split information
- train_i, test_i, info = optimize_traintest_split(
-     X=df,
-     y=emotion,
-     split_on=speaker,
-     stratify_on=stratif_vars,
-     weight=weight,
-     test_size=test_size,
-     k=k,
-     seed=seed,
- )
-
- print("test split of DF:")
- print(df.iloc[test_i])
- print("test split of target variable:")
- print(emotion[test_i])
- print("goodness of split:")
- print(info)
nkululeko/split/split_utils.py
@@ -1,528 +0,0 @@
- """
- Code copyright by Uwe Reichel
- """
-
- from collections import Counter
- import numpy as np
- import pandas as pd
- import scipy.spatial as ssp
- from sklearn.model_selection import GroupShuffleSplit
- import sys
-
-
- def optimize_traindevtest_split(
-     X, y, split_on, stratify_on, weight=None, dev_size=0.1, test_size=0.1, k=30, seed=42
- ):
-     """optimize a group-disjunct split into train, dev, and test set, guided by:
-     - disjunct split of the values in SPLIT_ON
-     - stratification by all keys in STRATIFY_ON (targets and groupings)
-     - dev and test set proportions in X that are close to dev_size and
-       test_size (the proportions in set(split_on))
-
-     Score to be minimized:
-         (sum_v[w(v) * max_irad(v)] + w(d) * max_d) / (sum_v[w(v)] + w(d))
-     where:
-         v: variables to be stratified on
-         w(v): their weights
-         max_irad(v): maximum information radius between the reference class
-             distribution of v and (a) the dev set distribution and (b) the
-             test set distribution
-         max_d: maximum of the absolute dev and test size differences between
-             X and set(split_on)
-         w(d): its weight
-
-     Args:
-         X: (pd.DataFrame) of features/groupings for which the best split is
-             to be calculated; of shape (N, M)
-         y: (np.array) of targets of length N. If y[0] is of type str or int,
-             y is assumed to be categorical, and it is additionally tested
-             that all partitions cover all classes. Else y is assumed to be
-             numeric and no coverage test is done.
-         split_on: (np.array) of length N with the grouping variable (e.g.
-             speaker IDs) on which the group-disjunct split is to be
-             performed. Must be categorical.
-         stratify_on: (dict) keys are variable names (targets and/or further
-             groupings, e.g. sex or age class) the split should be stratified
-             on; values are np.arrays of length N with the variable values.
-             All variables must be categorical.
-         weight: (dict) weight for each variable in stratify_on, defining its
-             contribution to the optimization score; uniform weighting by
-             default. The additional key "size_diff" defines how the dev/test
-             size differences are weighted.
-         dev_size: (float) dev proportion in set(split_on), e.g. 10% of the
-             speakers to be held out
-         test_size: (float) test proportion in set(split_on)
-         k: (int) number of different splits to be tried out
-         seed: (int) random seed
-     Returns:
-         train_i: (np.array) train set indices in X
-         dev_i: (np.array) dev set indices in X
-         test_i: (np.array) test set indices in X
-         info: (dict) details about the reference and achieved distributions:
-             "size_devset_in_spliton": intended grouping dev_size
-             "size_devset_in_X": optimized dev proportion of observations in X
-             "size_testset_in_spliton": intended grouping test_size
-             "size_testset_in_X": optimized test proportion of observations in X
-             "p_{c}_ref": reference class distribution from stratify_on[c]
-             "p_{c}_dev": dev set class distribution from stratify_on[c][dev_i]
-             "p_{c}_test": test set class distribution from stratify_on[c][test_i]
-     """
-
-     # data size
-     N = len(y)
-
-     # categorical target: number of classes for the coverage test
-     if is_categorical(y[0]):
-         nc = len(set(y))
-     else:
-         nc = None
-
-     # adjusted dev_size after having split off the test set
-     dev_size_adj = (dev_size * N) / (N - test_size * N)
-
-     # split all into train/dev vs test
-     gss_o = GroupShuffleSplit(n_splits=k, test_size=test_size, random_state=seed)
-
-     # split train/dev into train vs dev
-     gss_i = GroupShuffleSplit(n_splits=k, test_size=dev_size_adj, random_state=seed)
-
-     # set weight defaults
-     if weight is None:
-         weight = {}
-     for c in stratify_on.keys():
-         if c not in weight:
-             weight[c] = 1
-     if "size_diff" not in weight:
-         weight["size_diff"] = 1
-
-     # stratification reference distributions calculated on stratify_on
-     p_ref = {}
-     for c in stratify_on:
-         p_ref[c] = class_prob(stratify_on[c])
-
-     # best train/dev/test indices in X; best associated score
-     train_i, dev_i, test_i, best_sco = None, None, None, np.inf
-
-     # full target coverage in all partitions
-     full_target_coverage = False
-
-     # brute-force optimization of the SPLIT_ON split
-     # outer loop *_o: splitting into train/dev and test
-     # inner loop *_i: splitting into train and dev
-     for tri_o, tei_o in gss_o.split(X, y, split_on):
-         # current train/dev partition
-         X_i = X.iloc[tri_o]
-         y_i = y[tri_o]
-         split_on_i = split_on[tri_o]
-
-         for tri_i, tei_i in gss_i.split(X_i, y_i, split_on_i):
-             # all classes maintained in all partitions?
-             if nc:
-                 nc_train = len(set(y[tri_o[tri_i]]))
-                 nc_dev = len(set(y[tri_o[tei_i]]))
-                 nc_test = len(set(y[tei_o]))
-                 if min(nc_train, nc_dev, nc_test) < nc:
-                     continue
-
-             full_target_coverage = True
-
-             sco = calc_split_score(
-                 test_i=tei_o,
-                 stratify_on=stratify_on,
-                 weight=weight,
-                 p_ref=p_ref,
-                 N=N,
-                 test_size=test_size,
-                 dev_i=tri_o[tei_i],
-                 dev_size=dev_size_adj,
-             )
-
-             if sco < best_sco:
-                 best_sco = sco
-                 test_i = tei_o
-                 train_i = tri_o[tri_i]
-                 dev_i = tri_o[tei_i]
-
-     if test_i is None:
-         sys.exit(exit_message(full_target_coverage, "dev and test"))
-
-     # matching info
-     info = {
-         "score": best_sco,
-         "size_devset_in_spliton": dev_size,
-         "size_devset_in_X": np.round(len(dev_i) / N, 2),
-         "size_testset_in_spliton": test_size,
-         "size_testset_in_X": np.round(len(test_i) / N, 2),
-     }
-
-     for c in p_ref:
-         info[f"p_{c}_ref"] = p_ref[c]
-         info[f"p_{c}_dev"] = class_prob(stratify_on[c][dev_i])
-         info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i])
-
-     return train_i, dev_i, test_i, info
-
-
- def optimize_traintest_split(
-     X, y, split_on, stratify_on, weight=None, test_size=0.1, k=30, seed=42
- ):
-     """optimize a group-disjunct train/test split, guided by:
-     - disjunct split of the values in SPLIT_ON
-     - stratification by all keys in STRATIFY_ON (targets and groupings)
-     - test set proportion in X that is close to test_size (the test
-       proportion in set(split_on))
-
-     Score to be minimized:
-         (sum_v[w(v) * irad(v)] + w(d) * d) / (sum_v[w(v)] + w(d))
-     where:
-         v: variables to be stratified on
-         w(v): their weights
-         irad(v): information radius between the reference class distribution
-             of v and the test set distribution
-         d: absolute test size difference between X and set(split_on)
-         w(d): its weight
-
-     Args:
-         X: (pd.DataFrame) of features/groupings for which the best split is
-             to be calculated; of shape (N, M)
-         y: (np.array) of targets of length N. If y[0] is of type str or int,
-             y is assumed to be categorical, and it is additionally tested
-             that all partitions cover all classes. Else y is assumed to be
-             numeric and no coverage test is done.
-         split_on: (np.array) of length N with the grouping variable (e.g.
-             speaker IDs) on which the group-disjunct split is to be
-             performed. Must be categorical.
-         stratify_on: (dict) keys are variable names (targets and/or further
-             groupings, e.g. sex or age class) the split should be stratified
-             on; values are np.arrays of length N with the variable values.
-             All variables must be categorical.
-         weight: (dict) weight for each variable in stratify_on, defining its
-             contribution to the optimization score; uniform weighting by
-             default. The additional key "size_diff" defines how the test size
-             difference is weighted.
-         test_size: (float) test proportion in set(split_on), e.g. 10% of the
-             speakers to be held out
-         k: (int) number of different splits to be tried out
-         seed: (int) random seed
-     Returns:
-         train_i: (np.array) train set indices in X
-         test_i: (np.array) test set indices in X
-         info: (dict) details about the reference and achieved distributions:
-             "size_testset_in_spliton": intended test_size
-             "size_testset_in_X": optimized test proportion in X
-             "p_{c}_ref": reference class distribution from stratify_on[c]
-             "p_{c}_test": test set class distribution from stratify_on[c][test_i]
-     """
-
-     gss = GroupShuffleSplit(n_splits=k, test_size=test_size, random_state=seed)
-
-     # set weight defaults
-     if weight is None:
-         weight = {}
-     for c in stratify_on.keys():
-         if c not in weight:
-             weight[c] = 1
-     if "size_diff" not in weight:
-         weight["size_diff"] = 1
-
-     # stratification reference distributions calculated on stratify_on
-     p_ref = {}
-     for c in stratify_on:
-         p_ref[c] = class_prob(stratify_on[c])
-
-     # best train and test indices in X; best associated score
-     train_i, test_i, best_sco = None, None, np.inf
-
-     # data size
-     N = len(y)
-
-     # full target coverage in all partitions
-     full_target_coverage = False
-
-     # categorical target: number of classes for the coverage test
-     if is_categorical(y[0]):
-         nc = len(set(y))
-     else:
-         nc = None
-
-     # brute-force optimization of the SPLIT_ON split
-     for tri, tei in gss.split(X, y, split_on):
-         # all classes maintained in all partitions?
-         if nc:
-             nc_train = len(set(y[tri]))
-             nc_test = len(set(y[tei]))
-             if min(nc_train, nc_test) < nc:
-                 continue
-
-         full_target_coverage = True
-
-         sco = calc_split_score(tei, stratify_on, weight, p_ref, N, test_size)
-         if sco < best_sco:
-             train_i, test_i, best_sco = tri, tei, sco
-
-     if test_i is None:
-         sys.exit(exit_message(full_target_coverage))
-
-     # matching info
-     info = {
-         "score": best_sco,
-         "size_testset_in_spliton": test_size,
-         "size_testset_in_X": np.round(len(test_i) / N, 2),
-     }
-
-     for c in p_ref:
-         info[f"p_{c}_ref"] = p_ref[c]
-         info[f"p_{c}_test"] = class_prob(stratify_on[c][test_i])
-
-     return train_i, test_i, info
-
-
- def calc_split_score(
-     test_i, stratify_on, weight, p_ref, N, test_size, dev_i=None, dev_size=None
- ):
-     """calculate the split score based on the class distribution IRADs and on
-     the differences between group-level and observation-level partition
-     sizes; smaller is better. If dev_i and dev_size are not provided, the
-     score is calculated for a train/test split only; otherwise it is
-     calculated for a train/dev/test split.
-     Args:
-         test_i: (np.array) of test set indices
-         stratify_on: (dict) keys are variable names (targets and/or further
-             groupings, e.g. sex or age class) the split should be stratified
-             on; values are np.arrays of length N with the variable values.
-         weight: (dict) weight for each variable in stratify_on. The
-             additional key "size_diff" weights the group-level vs
-             observation-level test set size difference.
-         p_ref: (dict) reference class distributions for all variables in
-             stratify_on
-         N: (int) size of the underlying data set
-         test_size: (float) test proportion in the value set of the variable
-             on which the disjunct grouping has been carried out
-         dev_i: (np.array) of dev set indices
-         dev_size: (float) dev proportion in the value set of the variable on
-             which the disjunct grouping has been carried out (this value
-             should have been adjusted after splitting off the test set)
-     """
-
-     if dev_i is None:
-         do_dev = False
-     else:
-         do_dev = True
-
-     # dev and test set class distributions
-     p_test, p_dev = {}, {}
-     for c in p_ref:
-         p_test[c] = class_prob(stratify_on[c][test_i])
-         if do_dev:
-             p_dev[c] = class_prob(stratify_on[c][dev_i])
-
-     # score
-     sco, wgt = 0, 0
-
-     # IRADs (if p_test[c] or p_dev[c] do not contain
-     # all classes in p_ref[c], return INF)
-     for c in p_ref:
-         irad, full_coverage = calc_irad(p_ref[c], p_test[c])
-         if not full_coverage:
-             return np.inf
-         if do_dev:
-             irad_dev, full_coverage = calc_irad(p_ref[c], p_dev[c])
-             if not full_coverage:
-                 return np.inf
-             irad = max(irad, irad_dev)
-
-         sco += weight[c] * irad
-         wgt += weight[c]
-
-     # partition size difference, groups vs observations
-     size_diff = np.abs(len(test_i) / N - test_size)
-     if do_dev:
-         size_diff_dev = np.abs(len(dev_i) / N - dev_size)
-         size_diff = max(size_diff, size_diff_dev)
-
-     sco += weight["size_diff"] * size_diff
-     wgt += weight["size_diff"]
-
-     sco /= wgt
-
-     return sco
-
-
- def calc_irad(p1, p2):
-     """calculate the information radius of the prob dicts p1 and p2
-     Args:
-         p1, p2: (dict) of probabilities
-     Returns:
-         irad: (float) information radius
-         full_coverage: (bool) True if all elements in p1 occur in p2
-             and vice versa
-     """
-
-     p, q = [], []
-     full_coverage = True
-
-     for u in sorted(p1.keys()):
-         if u not in p2:
-             full_coverage = False
-             a = 0.0
-         else:
-             a = p2[u]
-
-         p.append(p1[u])
-         q.append(a)
-
-     if full_coverage:
-         if len(p2.keys()) > len(p1.keys()):
-             full_coverage = False
-
-     irad = ssp.distance.jensenshannon(p, q)
-
-     return irad, full_coverage
-
-
- def class_prob(y):
-     """returns the class probabilities in y
-     Args:
-         y: (array-like) of classes
-     Returns:
-         p: (dict) assigning to each class in y its maximum likelihood estimate
-     """
-
-     p = {}
-     N = len(y)
-     c = Counter(y)
-     for x in c:
-         p[x] = c[x] / N
-
-     return p
-
-
- def is_categorical(x):
-     """returns True if the type of x is str or an int type,
-     else False"""
-
-     if type(x) in [
-         str,
-         int,
-         np.int16,
-         np.int32,
-         np.int64,
-         np.uint8,
-         np.uint16,
-         np.uint32,
-     ]:
-         return True
-     return False
-
-
- def dummy_variable(X, columns, specs=None, squeeze_classes=False):
-     """
-     creates a dummy variable from binned numeric columns that can later be
-     used for stratification etc.
-
-     Args:
-         X: (pd.DataFrame)
-         columns: (str or list) of numeric column names
-         specs: (dict) nested dict whose keys are column names, each with a
-             subdict that contains the arguments for binning(), i.e. nbins
-             and lower_boundaries
-         squeeze_classes: (boolean) further squeeze the classes by sorting
-             the digits within the strings.
-             Example: from binning 3 columns, each into 2 bins, we get
-             "000", "100", "010", "001", "110", "101", "011", "111".
-             These classes are further squeezed by within-string sorting:
-             "000", "001", "011", "111"
-
-     Returns:
-         y: (list) of class strings of length X.shape[0]
-     """
-
-     df_bin = pd.DataFrame()
-     if specs is None:
-         specs = {}
-     if type(columns) is str:
-         columns = [columns]
-
-     # bin columns
-     for col in columns:
-         if col not in X.columns:
-             sys.exit(f"column {col} not in dataframe")
-         if col in specs:
-             kwargs = specs[col]
-         else:
-             kwargs = {"nbins": 2}
-         yc = binning(X[col].to_numpy(), **kwargs)
-         df_bin[col] = yc.astype(str)
-
-     # concatenate
-     df_bin["binvar"] = ""
-     for col in columns:
-         df_bin["binvar"] += df_bin[col]
-
-     # squeeze
-     if squeeze_classes:
-
-         def squeezing(x):
-             return "".join(sorted(x))
-
-         df_bin["binvar"] = df_bin["binvar"].apply(squeezing)
-
-     y = df_bin["binvar"].tolist()
-     return y
-
-
- def binning(y, nbins=3, lower_boundaries=None):
-     """
-     bins the numeric array y either intrinsically into nbins classes based
-     on an equidistant percentile split, or extrinsically by using the
-     lower_boundaries values.
-
-     Args:
-         y: (np.array) with numeric data
-         nbins: (int) number of bins
-         lower_boundaries: (list) of lower bin boundaries.
-             If provided, nbins is ignored and y is binned extrinsically.
-             The first value of lower_boundaries is always corrected not to
-             be higher than min(y).
-     Returns:
-         yc: (np.array) with bin IDs (integers from 0 to nbins - 1)
-     """
-
-     # intrinsic binning by equidistant percentiles
-     if lower_boundaries is None:
-         prct = np.linspace(0, 100, nbins + 1)
-         lower_boundaries = np.percentile(y, prct)
-         lower_boundaries = lower_boundaries[0:nbins]
-     else:
-         # make sure that the entire range of y is covered
-         lower_boundaries[0] = min(lower_boundaries[0], np.min(y))
-
-     # binned array
-     yc = np.zeros(len(y), dtype=int)
-     for i in range(1, len(lower_boundaries)):
-         yc[y >= lower_boundaries[i]] = i
-
-     return yc
-
-
- def optimize_testset_split(
-     X, y, split_on, stratify_on, weight=None, test_size=0.1, k=30, seed=42
- ):
-     """kept for backward compatibility"""
-     return optimize_traintest_split(
-         X, y, split_on, stratify_on, weight, test_size, k, seed
-     )
-
-
- def exit_message(full_target_coverage, infx="test"):
-     if not full_target_coverage:
-         return (
-             "not all partitions contain all target classes. What you can do:\n"
-             "(1) increase your dev and/or test partition, or\n"
-             "(2) reduce the number of target classes by merging some of them."
-         )
-
-     return (
-         f"\n:-o No {infx} set split found. The reason is that for at least one of the\n"
-         f"stratification variables not all of its values can make it into the {infx} set.\n"
-         f"This happens e.g. if the {infx} set size is chosen too small or\n"
-         "if the (multidimensional) distribution of the stratification\n"
-         "variables is sparse. What you can do:\n"
-         "(1) remove a variable from the stratification, or\n"
-         "(2) merge classes within a variable to increase the per-class probabilities, or\n"
-         f"(3) increase the {infx} set size, or\n"
-         "(4) increase the number of different splits (if it was small, say < 10, before), or\n"
-         "(5) in case your target is numeric and you have added a binned target array to the\n"
-         "    stratification variables: reduce the number of bins.\n"
-         "Good luck!\n"
-     )
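
To make the docstring score formula above concrete, here is a small editorial sketch on toy numbers (all counts and weights are invented): it recomputes one per-variable information radius with scipy's jensenshannon and combines it with a size-difference term along the lines of the removed calc_split_score for a train/test split.

# Editorial sketch, toy numbers: the core of the removed scoring in
# split_utils.py. The "information radius" is the Jensen-Shannon distance
# between the reference class distribution and a candidate partition's
# distribution; the score is the weighted mean of the per-variable IRADs and
# the size mismatch, per the docstring formula
#     score = (sum_v w(v) * irad(v) + w(d) * d) / (sum_v w(v) + w(d))
from collections import Counter
from scipy.spatial.distance import jensenshannon

def class_prob(y):
    # maximum-likelihood class probabilities, as in the removed class_prob()
    n = len(y)
    return {c: k / n for c, k in Counter(y).items()}

p_ref = class_prob(["anger"] * 50 + ["joy"] * 30 + ["neutral"] * 20)
p_test = class_prob(["anger"] * 8 + ["joy"] * 6 + ["neutral"] * 4)

classes = sorted(p_ref)
irad = jensenshannon([p_ref[c] for c in classes],
                     [p_test.get(c, 0.0) for c in classes])

w_emotion, w_size = 2, 1        # weights, as in the removed example scripts
size_diff = abs(18 / 100 - 0.2) # achieved vs intended test proportion
score = (w_emotion * irad + w_size * size_diff) / (w_emotion + w_size)
print(f"irad={irad:.3f}, score={score:.3f}")

The removed optimizer evaluates this score for k candidate group-disjunct splits and keeps the one with the smallest value, returning np.inf for any candidate whose partitions miss a class.
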