cutpointpy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cutpointpy/__init__.py +0 -0
- cutpointpy/core.py +457 -0
- cutpointpy/utils.py +143 -0
- cutpointpy-1.0.0.dist-info/METADATA +48 -0
- cutpointpy-1.0.0.dist-info/RECORD +10 -0
- cutpointpy-1.0.0.dist-info/WHEEL +5 -0
- cutpointpy-1.0.0.dist-info/licenses/LICENSE +674 -0
- cutpointpy-1.0.0.dist-info/scm_file_list.json +14 -0
- cutpointpy-1.0.0.dist-info/scm_version.json +8 -0
- cutpointpy-1.0.0.dist-info/top_level.txt +1 -0
cutpointpy/__init__.py
ADDED
|
File without changes
|
cutpointpy/core.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from itertools import product
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sklearn.model_selection import StratifiedShuffleSplit
|
|
5
|
+
|
|
6
|
+
from cutpointpy.utils import auc as area_under_curve,\
|
|
7
|
+
check_same_length, cm, cm_performance_metrics
|
|
8
|
+
|
|
9
|
+
class CutpointCalculator():
|
|
10
|
+
"""
|
|
11
|
+
Performs optimal cut-point calculation and bootstrapping.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
def __init__(self, target='youdenj', polarity=True,
|
|
15
|
+
interpolation=None, num_points=100):
|
|
16
|
+
"""
|
|
17
|
+
Parameters
|
|
18
|
+
----------
|
|
19
|
+
target : str
|
|
20
|
+
The target function to maximise or minimise.
|
|
21
|
+
Possible values:
|
|
22
|
+
'youdenj' -> see `Youdenj` class documentation.
|
|
23
|
+
'eucdist' -> see `Eucdist` class documentation.
|
|
24
|
+
polarity : bool
|
|
25
|
+
The direction of the inequality. If True (False) the
|
|
26
|
+
datapoints with feature value greater (less) than or equal
|
|
27
|
+
to the threshold are flagged as positive and the other ones
|
|
28
|
+
as negative.
|
|
29
|
+
interpolation : str [optional]
|
|
30
|
+
Interpolation method for generating the set of thresholds
|
|
31
|
+
to test. Possible values:
|
|
32
|
+
None -> No interpolation is used. The optimal
|
|
33
|
+
cut-point is chosen among the feature
|
|
34
|
+
values.
|
|
35
|
+
'linear' -> Thresholds are generated by linear
|
|
36
|
+
interpolation between the min and max value
|
|
37
|
+
of `features`.
|
|
38
|
+
num_points : unsigned int [optional]
|
|
39
|
+
Number of thresholds generated. Has no effect if `interpolation`
|
|
40
|
+
is None.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
match target:
|
|
44
|
+
case 'eucdist':
|
|
45
|
+
self.optimal_cpcalculator = Eucdist()
|
|
46
|
+
case 'youdenj':
|
|
47
|
+
self.optimal_cpcalculator = Youdenj()
|
|
48
|
+
case _:
|
|
49
|
+
raise ValueError(f'Target `{target}` not recognised.')
|
|
50
|
+
|
|
51
|
+
self.above = polarity
|
|
52
|
+
self.interpolation = interpolation
|
|
53
|
+
self.num_points = num_points
|
|
54
|
+
|
|
55
|
+
def find(self, features, labels):
|
|
56
|
+
"""
|
|
57
|
+
Determine the optimal cut-point value of a predictor variable
|
|
58
|
+
(feature) for a binary classification task.
|
|
59
|
+
|
|
60
|
+
Parameters
|
|
61
|
+
----------
|
|
62
|
+
features : array-like of numeric (n_samples)
|
|
63
|
+
Value of the predictor variable for each datapoint.
|
|
64
|
+
It is converted to an ndarray of float internally.
|
|
65
|
+
labels : array-like of numeric (n_samples)
|
|
66
|
+
Class label of each datapoint, where 0 indicates negative
|
|
67
|
+
and any other value positive. It is converted to an ndarray of
|
|
68
|
+
bool internally.
|
|
69
|
+
|
|
70
|
+
Returns
|
|
71
|
+
-------
|
|
72
|
+
cutpoint : float
|
|
73
|
+
The optimal cut-point value.
|
|
74
|
+
cutpoint_idx : int
|
|
75
|
+
The index corresponding to the optimal cut-point value
|
|
76
|
+
among the thresholds tested.
|
|
77
|
+
thresholds : ndarray of numeric (N,1)
|
|
78
|
+
The thresholds tested. N = len(features) if
|
|
79
|
+
`self.interpolation` is None, otherwise N = num_points.
|
|
80
|
+
acc : ndarray of numeric (N,1)
|
|
81
|
+
Accuracy as a function of the thresholds.
|
|
82
|
+
se : ndarray of numeric (N,1)
|
|
83
|
+
Sensitivity as a function of the thresholds.
|
|
84
|
+
sp : ndarray of numeric (N,1)
|
|
85
|
+
Specificity as a function of the thresholds.
|
|
86
|
+
auc : float
|
|
87
|
+
The area under the receiver-operating characteristic (ROC)
|
|
88
|
+
curve.
|
|
89
|
+
|
|
90
|
+
References
|
|
91
|
+
----------
|
|
92
|
+
1. Hassanzad and Haijan-Tilaki (2024)
|
|
93
|
+
doi:10.1186/s12874-024-02198-2
|
|
94
|
+
"""
|
|
95
|
+
|
|
96
|
+
if not check_same_length(features, labels):
|
|
97
|
+
raise ValueError('There should be as many features as '
|
|
98
|
+
'labels')
|
|
99
|
+
|
|
100
|
+
#Labels and features: cast and reshape
|
|
101
|
+
features, labels = self._cast_and_reshape(features, labels)
|
|
102
|
+
|
|
103
|
+
#Sort datapoints in ascending order by feature value
|
|
104
|
+
sorted_idxs = np.argsort(features, axis=0)
|
|
105
|
+
features, labels =\
|
|
106
|
+
[np.take_along_axis(x, sorted_idxs, axis=0) for x in
|
|
107
|
+
[features, labels]]
|
|
108
|
+
|
|
109
|
+
#Define the set of thresholds to test
|
|
110
|
+
if self.interpolation:
|
|
111
|
+
|
|
112
|
+
lingrid = np.linspace(start=np.min(features[:, 0]),
|
|
113
|
+
stop=np.max(features[:, 0]),
|
|
114
|
+
num=self.num_points)
|
|
115
|
+
|
|
116
|
+
match self.interpolation:
|
|
117
|
+
case 'linear':
|
|
118
|
+
thresholds = np.interp(
|
|
119
|
+
x=lingrid, xp=features[:, 0], fp=features[:, 0])
|
|
120
|
+
case _:
|
|
121
|
+
raise ValueError(f'Interpolation "{interpolation}" '
|
|
122
|
+
f'not recognised')
|
|
123
|
+
|
|
124
|
+
thresholds.shape = (thresholds.size, 1)
|
|
125
|
+
else:
|
|
126
|
+
thresholds = features
|
|
127
|
+
|
|
128
|
+
acc, se, sp, cutpoint, cutpoint_idx, auc =\
|
|
129
|
+
self._test_cutoff_values(
|
|
130
|
+
features=features,
|
|
131
|
+
labels=labels,
|
|
132
|
+
thresholds=thresholds,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
return cutpoint, cutpoint_idx, thresholds, acc, se, sp, auc
|
|
136
|
+
|
|
137
|
+
def bootstrap(self, features, labels, method='sss', train_ratio=0.7,
|
|
138
|
+
num_reps=30, random_state=0):
|
|
139
|
+
"""
|
|
140
|
+
Compute/validate optimal cut-point through bootstrapping.
|
|
141
|
+
|
|
142
|
+
Parameters
|
|
143
|
+
----------
|
|
144
|
+
features : array-like of numeric (n_samples)
|
|
145
|
+
Value of the predictor variable for each datapoint. It is
|
|
146
|
+
converted to ndarray of float internally.
|
|
147
|
+
labels : array-like of numeric (n_samples)
|
|
148
|
+
Class label of each datapoint, where 0 indicates negative
|
|
149
|
+
and any other value positive. It is converted to ndarray of
|
|
150
|
+
bool internally.
|
|
151
|
+
method : str
|
|
152
|
+
The strategy for generating the bootstratp repetitions-i.e.,
|
|
153
|
+
the subdivisions of the original data into train (in-bag
|
|
154
|
+
data) and test set (out-of-bag data).
|
|
155
|
+
Possible values:
|
|
156
|
+
`sss` -> Stratified shuffle split
|
|
157
|
+
train_ratio : float [0.0, 1.0]
|
|
158
|
+
The proportion of the original data used to generate the
|
|
159
|
+
train set at each repetition.
|
|
160
|
+
num_reps : int
|
|
161
|
+
Number of bootstrap repetitions (i.e., number of
|
|
162
|
+
subdivisions into train and test set).
|
|
163
|
+
random_state : int
|
|
164
|
+
Controls the randomness of the repetitions produced. Pass
|
|
165
|
+
an int for reproducible output across multiple function
|
|
166
|
+
calls, None for non-reproducible output.
|
|
167
|
+
|
|
168
|
+
Returns
|
|
169
|
+
-------
|
|
170
|
+
cutpoints : ndarray of float (num_reps, 1)
|
|
171
|
+
The optimal cut-point value for each repetition estimated
|
|
172
|
+
on the train set.
|
|
173
|
+
cutpoints_idxs : ndarray of float (num_reps, 1)
|
|
174
|
+
For each repetition the index of the `thresholds` array that
|
|
175
|
+
corresponds to the optimal cut-point.
|
|
176
|
+
thresholds : ndarray of float (num_reps, N)
|
|
177
|
+
The values of the thresholds tested on the train set at each
|
|
178
|
+
repetition. Element [i,j] of the matrix represents the j-th
|
|
179
|
+
threshold value for repetition i.
|
|
180
|
+
N = floor(len(`features`) * `train_ratio`) if
|
|
181
|
+
`self.interpolation` is None, otherwise N =
|
|
182
|
+
`self.num_points`.
|
|
183
|
+
accs : ndarray of float (num_reps, N)
|
|
184
|
+
Accuracy on the train set for each repetition and threshold
|
|
185
|
+
value tested.
|
|
186
|
+
ses : ndarray of float (num_reps, N)
|
|
187
|
+
Sensitivity on the train set for each repetition and
|
|
188
|
+
threshold value tested.
|
|
189
|
+
sps : ndarray of float (num_reps, N)
|
|
190
|
+
Specificity on the train set for each repetition and
|
|
191
|
+
threshold value tested.
|
|
192
|
+
aucs_train : ndarray of float (num_reps, 1)
|
|
193
|
+
The area under the curve estimated on the train set for each
|
|
194
|
+
repetition .
|
|
195
|
+
aucs_test : ndarray of float (num_reps, 1)
|
|
196
|
+
The area under the curve estimated on the test set for each
|
|
197
|
+
repetition .
|
|
198
|
+
performance_train : ndarray of float (num_reps, 3)
|
|
199
|
+
For each repetition, in column-wise order, respectively
|
|
200
|
+
accuracy, sensitivity and specificity yielded by the optimal
|
|
201
|
+
cut-point value when applied to the train set.
|
|
202
|
+
performance_test : ndarray of float (num_reps, 3)
|
|
203
|
+
For each repetition, in column-wise order, respectively
|
|
204
|
+
accuracy, sensitivity and specificity yielded by the optimal
|
|
205
|
+
cut-point value when applied to the test set.
|
|
206
|
+
performance_whole : ndarray of float (num_reps, 3)
|
|
207
|
+
For each repetition, in column-wise order, respectively
|
|
208
|
+
accuracy, sensitivity and specificity yielded by the optimal
|
|
209
|
+
cut-point value when applied to the whole dataset.
|
|
210
|
+
"""
|
|
211
|
+
|
|
212
|
+
features, labels = self._cast_and_reshape(features, labels)
|
|
213
|
+
|
|
214
|
+
#===============================================================
|
|
215
|
+
#================ Initialise the output ========================
|
|
216
|
+
#===============================================================
|
|
217
|
+
cutpoints, cutpoints_idxs, aucs_train, aucs_test =\
|
|
218
|
+
[np.zeros(shape=(num_reps,1), dtype=float) for _ in range(4)]
|
|
219
|
+
performance_train, performance_test, performance_whole =\
|
|
220
|
+
[np.zeros(shape=(num_reps,3), dtype=float) for _ in range(3)]
|
|
221
|
+
|
|
222
|
+
if self.interpolation:
|
|
223
|
+
num_thresholds = self.num_points
|
|
224
|
+
else:
|
|
225
|
+
num_thresholds = np.floor(
|
|
226
|
+
len(features) * train_ratio,
|
|
227
|
+
dtype=int, casting='unsafe'
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
thresholds, accs, ses, sps = \
|
|
231
|
+
[np.zeros(shape=(num_reps, num_thresholds), dtype=float)
|
|
232
|
+
for _ in range(4)]
|
|
233
|
+
#===============================================================
|
|
234
|
+
#===============================================================
|
|
235
|
+
#===============================================================
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
match method:
|
|
239
|
+
case 'sss':
|
|
240
|
+
splitter = StratifiedShuffleSplit(
|
|
241
|
+
n_splits=num_reps,
|
|
242
|
+
train_size=train_ratio,
|
|
243
|
+
random_state=random_state
|
|
244
|
+
)
|
|
245
|
+
case _:
|
|
246
|
+
raise ValueError(f'Unrecognised method `{method}`.')
|
|
247
|
+
|
|
248
|
+
for split_idx, (train_idxs, test_idxs) in enumerate(
|
|
249
|
+
splitter.split(X=features, y=labels)):
|
|
250
|
+
|
|
251
|
+
train_features, test_features, train_labels, test_labels = \
|
|
252
|
+
[container[idxs] for container, idxs in
|
|
253
|
+
product((features, labels), (train_idxs, test_idxs))]
|
|
254
|
+
|
|
255
|
+
#===========================================================
|
|
256
|
+
#==== Return values computed on the train (in bag) data ====
|
|
257
|
+
#===========================================================
|
|
258
|
+
cutpoint, cutpoint_idx, thresholds_, acc, se, sp, auc =\
|
|
259
|
+
self.find(train_features, train_labels)
|
|
260
|
+
|
|
261
|
+
for out_, in_ in zip((cutpoints, cutpoints_idxs),
|
|
262
|
+
(cutpoint, cutpoint_idx)):
|
|
263
|
+
out_[split_idx] = in_
|
|
264
|
+
|
|
265
|
+
for out_, in_ in zip((thresholds, accs, ses, sps),
|
|
266
|
+
(thresholds_, acc, se, sp)):
|
|
267
|
+
out_[split_idx,:] = in_.flat
|
|
268
|
+
|
|
269
|
+
for i, in_ in enumerate([acc, se, sp]):
|
|
270
|
+
performance_train[split_idx, i] = in_[cutpoint_idx,0]
|
|
271
|
+
|
|
272
|
+
aucs_train[split_idx,0] = auc
|
|
273
|
+
#===========================================================
|
|
274
|
+
#===========================================================
|
|
275
|
+
#===========================================================
|
|
276
|
+
|
|
277
|
+
#Compute auc on the test set
|
|
278
|
+
_, _, _, _, _, _, aucs_test[split_idx,0] =\
|
|
279
|
+
self.find(test_features, test_labels)
|
|
280
|
+
|
|
281
|
+
#Compute performance parameters on the test set using the
|
|
282
|
+
#cut-point value estimated on the train set
|
|
283
|
+
acc, se, sp, _, _, _ = self._test_cutoff_values(
|
|
284
|
+
features=test_features,
|
|
285
|
+
labels=test_labels,
|
|
286
|
+
thresholds = np.array(cutpoint, ndmin=2),
|
|
287
|
+
)
|
|
288
|
+
for i, in_ in enumerate([acc, se, sp]):
|
|
289
|
+
performance_test[split_idx, i] = in_[0,0]
|
|
290
|
+
|
|
291
|
+
#Compute performance parameters on the whole dataset using
|
|
292
|
+
#the cut-point value estimated on the train set
|
|
293
|
+
acc, se, sp, _, _, _ = self._test_cutoff_values(
|
|
294
|
+
features=features,
|
|
295
|
+
labels=labels,
|
|
296
|
+
thresholds = np.array(cutpoint, ndmin=2),
|
|
297
|
+
)
|
|
298
|
+
for i, in_ in enumerate([acc, se, sp]):
|
|
299
|
+
performance_whole[split_idx, i] = in_[0,0]
|
|
300
|
+
|
|
301
|
+
return cutpoints, cutpoints_idxs, thresholds, accs, ses, sps,\
|
|
302
|
+
aucs_train, aucs_test, performance_train,\
|
|
303
|
+
performance_test, performance_whole
|
|
304
|
+
|
|
305
|
+
def _test_cutoff_values(self, features, labels, thresholds):
|
|
306
|
+
"""
|
|
307
|
+
Classification performance of a set of cut-off values
|
|
308
|
+
(thresholds) when applied to a predictor variable (feature) for
|
|
309
|
+
a binary classification task.
|
|
310
|
+
|
|
311
|
+
Parameters
|
|
312
|
+
----------
|
|
313
|
+
features : ndarray of float (n_features, 1)
|
|
314
|
+
Value of the predictor variable for each datapoint.
|
|
315
|
+
labels : ndarray of bool (n_features, 1)
|
|
316
|
+
Class label of each datapoint, where True indicates the
|
|
317
|
+
positive class.
|
|
318
|
+
thresholds : ndarray of float (n_thresholds, 1)
|
|
319
|
+
The cut-off values to be tested, sorted in ascending order.
|
|
320
|
+
|
|
321
|
+
Returns
|
|
322
|
+
-------
|
|
323
|
+
acc, se, sp : ndarrays of float, each of shape (n_thresholds, 1)
|
|
324
|
+
Accuracy, sensitivity and pecificity as a function of the
|
|
325
|
+
cut-off value.
|
|
326
|
+
cutpoint : float
|
|
327
|
+
The optimal cut-off value.
|
|
328
|
+
cutpoint_idx : int [0, (n_thresholds - 1)]
|
|
329
|
+
Index of the optimal cut-off value.
|
|
330
|
+
auc : float
|
|
331
|
+
The area under the curve.
|
|
332
|
+
"""
|
|
333
|
+
|
|
334
|
+
#Reshape features and thresholds for vectorisation.
|
|
335
|
+
#Prepend 'r_' to indicate the reshaped versions
|
|
336
|
+
r_features = np.tile(features.T, reps=(thresholds.size, 1))
|
|
337
|
+
r_thresholds = np.tile(thresholds, reps=(1, features.size))
|
|
338
|
+
|
|
339
|
+
#Predict labels as a function of threshold
|
|
340
|
+
if self.above:
|
|
341
|
+
predicted = (r_features >= r_thresholds)
|
|
342
|
+
else:
|
|
343
|
+
predicted = (r_features <= r_thresholds)
|
|
344
|
+
|
|
345
|
+
#Compute sensitivity and specificity as a function of threshold
|
|
346
|
+
confmat = cm(predicted=predicted,
|
|
347
|
+
target=np.tile(labels.T, reps=(thresholds.size, 1)))
|
|
348
|
+
acc, se, sp = cm_performance_metrics(confmat)
|
|
349
|
+
|
|
350
|
+
#Compute optimal cutpoint
|
|
351
|
+
cutpoint, cutpoint_idx = self.optimal_cpcalculator.find(
|
|
352
|
+
thresholds=thresholds, se=se, sp=sp
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
#Compute AUC
|
|
356
|
+
auc = area_under_curve(se=se.T, sp=sp.T).flatten()[0]
|
|
357
|
+
|
|
358
|
+
return acc, se, sp, cutpoint, cutpoint_idx, auc
|
|
359
|
+
|
|
360
|
+
@staticmethod
|
|
361
|
+
def _cast_and_reshape(features, labels):
|
|
362
|
+
"""
|
|
363
|
+
Convert and reshape features and labels to ndarray.
|
|
364
|
+
|
|
365
|
+
Parameters
|
|
366
|
+
----------
|
|
367
|
+
features : array-like of numeric (n_samples)
|
|
368
|
+
Value of the predictor variable for each datapoint.
|
|
369
|
+
labels : array-like of numeric (n_samples)
|
|
370
|
+
Class label of each datapoint, where 0 indicates negative
|
|
371
|
+
and any other value positive.
|
|
372
|
+
|
|
373
|
+
Returns
|
|
374
|
+
-------
|
|
375
|
+
features : ndarray of float (n_samples, 1)
|
|
376
|
+
Cast and reshaped features.
|
|
377
|
+
labels : ndarray of bool (n_samples, 1)
|
|
378
|
+
Cast and reshaped labels.
|
|
379
|
+
"""
|
|
380
|
+
|
|
381
|
+
check_same_length(features, labels)
|
|
382
|
+
|
|
383
|
+
features = np.array(features, dtype=float, ndmin=2)
|
|
384
|
+
labels = np.array(labels, dtype=bool, ndmin=2)
|
|
385
|
+
|
|
386
|
+
for item in [features, labels]:
|
|
387
|
+
item.shape = (features.size, 1)
|
|
388
|
+
|
|
389
|
+
return features, labels
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
class OptimalCutpointCalculator(ABC):
    """
    Abstract strategy wrapping the target function that is maximised or
    minimised to select the optimal cut-point.
    """

    @staticmethod
    @abstractmethod
    def find(thresholds, se, sp):
        """
        Select the optimal cut-point from sensitivity and specificity
        expressed as a function of the thresholds tested.

        Parameters
        ----------
        thresholds : ndarray of float (N,1)
            The set of thresholds to test, sorted from smallest to
            largest.
        se : ndarray of float (N,1)
            Sensitivity as a function of the threshold.
        sp : ndarray of float (N,1)
            Specificity as a function of the threshold.

        Returns
        -------
        cutpoint : numeric
            The optimal cut-point value.
        cutpoint_idx : int
            The index corresponding to optimal cut-point value.
        """
|
|
423
|
+
|
|
424
|
+
class Youdenj(OptimalCutpointCalculator):
    """
    Maximises Youden’s J - i.e.: Sensitivity + Specificity - 1.
    """

    @staticmethod
    def find(thresholds, se, sp):
        """
        Note
        ----
        If the maximum occurs on multiple values of threshold the first
        (smallest) occurrence is returned.
        """
        youden_j = (se + sp) - 1
        # argmax reports the first maximiser along the threshold axis
        best_idx = np.argmax(youden_j, axis=0).ravel()[0]
        best_threshold = thresholds[best_idx, 0]
        return best_threshold, best_idx
|
|
440
|
+
|
|
441
|
+
class Eucdist(OptimalCutpointCalculator):
    """
    Minimises the Euclidean distance between the (0,1) point and the ROC
    curve in the FPR (1 - sp) vs. TPR (se) space.
    """

    @staticmethod
    def find(thresholds, se, sp):
        """
        Note
        ----
        If the minimum occurs on multiple values of threshold the last
        (largest) occurrence is returned.
        """
        target_values = np.sqrt((1 - se)**2 + (1 - sp)**2)

        # np.argmin reports the FIRST minimiser, so the search runs on
        # the reversed array and the index is mapped back; this yields
        # the last occurrence, as documented above.
        reversed_idx = np.argmin(target_values[::-1], axis=0).flatten()[0]
        idx = target_values.shape[0] - 1 - reversed_idx

        return thresholds[idx, 0], idx
|
cutpointpy/utils.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def check_same_length(*args):
    """
    Tell whether all the given sized containers share one length.

    Parameters
    ----------
    args :
        The iterables to be compared for equal length.

    Returns
    -------
    same_length : bool
        True if all `args` have the same length, False otherwise.

    Raises
    ------
    ValueError
        If fewer than two iterables are given.
    """
    if len(args) < 2:
        raise ValueError('At least two iterables should be given.')

    # Lengths are produced lazily; the first one is the reference
    # against which the remaining ones are compared.
    lengths = map(len, args)
    reference = next(lengths)

    return all(current == reference for current in lengths)
|
|
24
|
+
|
|
25
|
+
def cm(predicted, target):
    """
    Confusion matrix for a binary outcome.

    Parameters
    ----------
    predicted : ndarray of bool (n_tests, n_samples)
        The predicted labels.
    target : ndarray of bool (n_tests, n_samples)
        The target labels (ground truth).

    Returns
    -------
    cm : ndarray of unsigned int (n_tests, 4)
        The confusion matrices. Each row represents one matrix; columns
        0 to 3 respectively report the number of true positives, false
        negatives, false positives and true negatives.

    Raises
    ------
    ValueError
        If `predicted` and `target` do not have the same shape.

    Notes
    -----
    Vectorised function - computes n_tests confusion matrices at once.
    Convention for labels: True denotes the positive class.
    """

    if predicted.shape != target.shape:
        raise ValueError('Predicted and target values must have the '
                         'same shape')

    # Each mask is computed once and reused for the four counts
    predicted_pos = np.equal(predicted, True)
    predicted_neg = np.equal(predicted, False)
    target_pos = np.equal(target, True)
    target_neg = np.equal(target, False)

    # A distinct local name avoids shadowing this function
    confmat = np.zeros(shape=(predicted.shape[0], 4), dtype=np.uint)

    confmat[:, 0] = np.sum(predicted_pos & target_pos, axis=1)  # TP
    confmat[:, 1] = np.sum(predicted_neg & target_pos, axis=1)  # FN
    confmat[:, 2] = np.sum(predicted_pos & target_neg, axis=1)  # FP
    confmat[:, 3] = np.sum(predicted_neg & target_neg, axis=1)  # TN

    return confmat
|
|
65
|
+
|
|
66
|
+
def cm_performance_metrics(cm):
    """
    Performance metrics from the confusion matrix.

    Parameters
    ----------
    cm : array-like of numeric (n_matrices, 4)
        The confusion matrices. Each row represents one matrix; columns
        0 to 3 respectively report the number of true positives, false
        negatives, false positives and true negatives.

    Returns
    -------
    acc : ndarray of float (n_matrices, 1)
        Accuracy.
    se : ndarray of float (n_matrices, 1)
        Sensitivity.
    sp : ndarray of float (n_matrices, 1)
        Specificity.

    Notes
    -----
    Returned values are for each confusion matrix. Accuracy, sensitivity
    and specificity range between 0.0 and 1.0. A metric whose
    denominator is zero (e.g. sensitivity when there are no positive
    samples) is reported as NaN.
    """

    counts = np.asarray(cm, dtype=float)

    def _ratio(numerator, denominator):
        # Entries with a zero denominator keep the NaN they start with
        result = np.full(denominator.shape, np.nan, dtype=float)
        np.divide(numerator, denominator, out=result,
                  where=(denominator != 0))
        return result.reshape(-1, 1)

    true_pos = counts[:, 0]
    false_neg = counts[:, 1]
    false_pos = counts[:, 2]
    true_neg = counts[:, 3]

    acc = _ratio(true_pos + true_neg, np.sum(counts, axis=1))
    se = _ratio(true_pos, true_pos + false_neg)
    sp = _ratio(true_neg, true_neg + false_pos)

    return [acc, se, sp]
|
|
101
|
+
|
|
102
|
+
def auc(se, sp):
    """
    Compute the area under the Receiver-operating characteristic (ROC)
    curve.

    Parameters
    ----------
    se : ndarray of numeric (n_tests, n_thresholds)
        Sensitivity as a function of the thresholds.
    sp : ndarray of numeric (n_tests, n_thresholds)
        Specificity as a function of the thresholds.

    Returns
    -------
    auc : ndarray of float (n_tests, 1)
        The areas under the curve.

    Raises
    ------
    ValueError
        If `se` and `sp` do not have the same shape.

    Notes
    -----
    1. Vectorised function, computes n_tests AUCs at once.
    2. It is assumed that in each row `se` and `sp` are matched by
       threshold value. That is, for each row of `se` and `sp`
       the same column index corresponds to the same threshold value.
    3. Area calculation is based on the trapezoidal rule.
    4. Points sharing the same false positive rate are ordered by
       increasing true positive rate.
    """

    se = np.asarray(se)
    sp = np.asarray(sp)

    if se.shape != sp.shape:
        raise ValueError('`se` and `sp` must have the same shape')

    # Compute true positive rate (TPR) and false positive rate (FPR)
    tpr = se
    fpr = 1 - sp

    # Sort by FPR in increasing order, breaking ties by TPR. Sorting on
    # FPR alone leaves the order of tied points unspecified, and that
    # order changes the heights fed to the trapezoidal rule.
    sorted_idxs = np.lexsort((tpr, fpr), axis=-1)
    tpr = np.take_along_axis(tpr, sorted_idxs, axis=-1)
    fpr = np.take_along_axis(fpr, sorted_idxs, axis=-1)

    # `np.trapezoid` is the current name; fall back to `np.trapz` on
    # NumPy versions that do not provide it.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Compute AUC
    return np.array(trapezoid(y=tpr, x=fpr), ndmin=2).T
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cutpointpy
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Optimal cut-point calculation of a predictor variable for binary classification tasks
|
|
5
|
+
Author-email: Francesco Bianconi <bianco@ieee.org>
|
|
6
|
+
License-Expression: GPL-3.0-or-later
|
|
7
|
+
Project-URL: home, https://github.com/bianconif/cutpointpy
|
|
8
|
+
Project-URL: repository, https://github.com/bianconif/cutpointpy
|
|
9
|
+
Keywords: binary-classification,cut-point,roc-analysis,sensitivity,specificity
|
|
10
|
+
Requires-Python: >=3.14
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: scikit-learn
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# cutpointpy
|
|
18
|
+
____________
|
|
19
|
+
A Python package for estimating the optimal cut-point of a predictor variable (feature) for a binary classification task. It is loosely inspired by [`cutpointr`](https://cran.r-project.org/web/packages/cutpointr/index.html), an optimal cut-point calculation package for [R](https://www.r-project.org/).
|
|
20
|
+
|
|
21
|
+
Main usage:
|
|
22
|
+
|
|
23
|
+
- Optimal cut-point estimation
|
|
24
|
+
- Stability analysis of the estimated cut-points through bootstrapping
|
|
25
|
+
- Receiver-operating characteristic curve (ROC) analysis
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
`pip install cutpointpy`
|
|
29
|
+
|
|
30
|
+
## Structure
|
|
31
|
+
- `cutpointpy.core`: contains the main class (`CutpointCalculator`) with functions `find()` and `bootstrap()` respectively for optimal cut-point estimation and stability analysis/validation through bootstrapping.
|
|
32
|
+
- `cutpointpy.utils`: contains ancillary functions including methods for computing performance parameters on binary classification tasks (e.g. confusion matrices, accuracy, sensitivity, specificity and AUC).
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
We recommend the following [marimo](https://marimo.io/) notebooks to get started with `cutpointpy`.
|
|
36
|
+
- Optimal cut-point estimation without bootstrapping
|
|
37
|
+
* [cutpointfind__glucose_cutoff_for_diabetes.py](https://molab.marimo.io/notebooks/nb_P81opF6FjJpcwDeycTAVDb)
|
|
38
|
+
* [cutpointfind__ibmi_cutoff_for_diabetes.py](https://molab.marimo.io/notebooks/nb_D5qnT3WHLxrNVpFtwxBHpc)
|
|
39
|
+
- Optimal cut-point estimation with bootstrapping
|
|
40
|
+
* [cutpointboot__glucose_cutoff_for_diabetes.py](https://molab.marimo.io/notebooks/nb_ZxGQFfmRsEBb5LRq8hqrXo)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
## References
|
|
44
|
+
1. Baratloo, A., Hosseini, M., Negida, A., El Ashal, G. [Part 1: simple definition and calculation of accuracy, sensitivity and specificity](https://pmc.ncbi.nlm.nih.gov/articles/PMC4614595/) (2015) Emergency 3(2):48-49
|
|
45
|
+
2. Hassanzad M., Hajian-Tilaki K. [Methods of determining optimal cut-point of diagnostic biomarkers with application of clinical data in ROC analysis: an update review](https://doi.org/10.1186/s12874-024-02198-2) (2024) BMC Medical Research Methodology, 24(1), art. no. 84
|
|
46
|
+
|
|
47
|
+
## Contacts
|
|
48
|
+
[Francesco Bianconi](https://www.bianconif.net) - [bianco@ieee.org](mailto:bianco@ieee.org).
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
cutpointpy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
cutpointpy/core.py,sha256=ezQJ8kNqm9d4oihMCcLcasY5dYTq8mYl5fgDYpxfcU4,19069
|
|
3
|
+
cutpointpy/utils.py,sha256=cxTrtTNczIlR0bN7k00_NUyk8WzVX1iM6Sdn80afRhc,4388
|
|
4
|
+
cutpointpy-1.0.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
5
|
+
cutpointpy-1.0.0.dist-info/METADATA,sha256=F_8i-E2Jbuv40y-If6dT4GuF76wGX8BQiixXGrqMJJg,2793
|
|
6
|
+
cutpointpy-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
cutpointpy-1.0.0.dist-info/scm_file_list.json,sha256=_HvuVUKfgFObfnhdLVfPaIk0jTiA5d3Ltm6-PexJj3s,309
|
|
8
|
+
cutpointpy-1.0.0.dist-info/scm_version.json,sha256=TxlYJP0Qs8Unv1JgWlv37F6xrY7Vxxg4YztqiSm6_z4,170
|
|
9
|
+
cutpointpy-1.0.0.dist-info/top_level.txt,sha256=kHAUPkGYJmDf_OsRAsBsIOC6VYr2nDvSPEk5fjPZbLw,11
|
|
10
|
+
cutpointpy-1.0.0.dist-info/RECORD,,
|