atlas-ftag-tools 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/METADATA +4 -3
- {atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/RECORD +14 -12
- {atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/WHEEL +1 -1
- {atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/entry_points.txt +1 -1
- ftag/__init__.py +6 -5
- ftag/flavours.yaml +47 -4
- ftag/fraction_optimization.py +184 -0
- ftag/labels.py +10 -2
- ftag/mock.py +58 -17
- ftag/utils/__init__.py +24 -0
- ftag/utils/logging.py +123 -0
- ftag/utils/metrics.py +431 -0
- ftag/working_points.py +547 -0
- ftag/wps/__init__.py +0 -0
- ftag/wps/discriminant.py +0 -131
- ftag/wps/working_points.py +0 -316
- {atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/top_level.txt +0 -0
ftag/utils/metrics.py
ADDED
@@ -0,0 +1,431 @@
+"""Tools for metrics module."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+from scipy.ndimage import gaussian_filter1d
+
+from ftag.utils import logger
+
+if TYPE_CHECKING:  # pragma: no cover
+    from ftag.labels import Label, LabelContainer
+
+
+def save_divide(
+    numerator: np.ndarray | float,
+    denominator: np.ndarray | float,
+    default: float = 1.0,
+):
+    """Save divide for denominator equal to 0.
+
+    Division using numpy divide function returning default value in cases where
+    denominator is 0.
+
+    Parameters
+    ----------
+    numerator: np.ndarray | float,
+        Numerator in the ratio calculation.
+    denominator: np.ndarray | float,
+        Denominator in the ratio calculation.
+    default: float
+        Default value which is returned if denominator is 0.
+
+    Returns
+    -------
+    ratio: np.ndarray | float
+        Result of the division
+    """
+    logger.debug("Calculating save division.")
+    logger.debug("numerator: %s", numerator)
+    logger.debug("denominator: %s", denominator)
+    logger.debug("default: %s", default)
+
+    if isinstance(numerator, (int, float, np.number)) and isinstance(
+        denominator, (int, float, np.number)
+    ):
+        output_shape = 1
+    else:
+        try:
+            output_shape = denominator.shape
+        except AttributeError:
+            output_shape = numerator.shape
+
+    ratio = np.divide(
+        numerator,
+        denominator,
+        out=np.ones(
+            output_shape,
+            dtype=float,
+        )
+        * default,
+        where=(denominator != 0),
+    )
+    if output_shape == 1:
+        return float(ratio)
+    return ratio
+
+
+def weighted_percentile(
+    arr: np.ndarray,
+    percentile: np.ndarray,
+    weights: np.ndarray = None,
+):
+    """Calculate weighted percentile.
+
+    Implementation according to https://stackoverflow.com/a/29677616/11509698
+    (https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method)
+
+    Parameters
+    ----------
+    arr : np.ndarray
+        Data array
+    percentile : np.ndarray
+        Percentile array
+    weights : np.ndarray
+        Weights array, by default None
+
+    Returns
+    -------
+    np.ndarray
+        Weighted percentile array
+    """
+    logger.debug("Calculating weighted percentile.")
+    logger.debug("arr: %s", arr)
+    logger.debug("percentile: %s", percentile)
+    logger.debug("weights: %s", weights)
+
+    # Set weights to one if no weights are given
+    if weights is None:
+        weights = np.ones_like(arr)
+
+    # Set dtype to float64 if the weights are too large
+    dtype = np.float64 if np.sum(weights) > 1000000 else np.float32
+
+    # Get an array sorting and sort the array and the weights
+    ix = np.argsort(arr)
+    arr = arr[ix]
+    weights = weights[ix]
+
+    # Return the cumulative sum
+    cdf = np.cumsum(weights, dtype=dtype) - 0.5 * weights
+    cdf -= cdf[0]
+    cdf /= cdf[-1]
+
+    # Return the linear interpolation
+    return np.interp(percentile, cdf, arr)
+
+
+def calculate_efficiency(
+    sig_disc: np.ndarray,
+    bkg_disc: np.ndarray,
+    target_eff: float | list | np.ndarray,
+    return_cuts: bool = False,
+    sig_weights: np.ndarray = None,
+    bkg_weights: np.ndarray = None,
+):
+    """Calculate efficiency.
+
+    Parameters
+    ----------
+    sig_disc : np.ndarray
+        Signal discriminant
+    bkg_disc : np.ndarray
+        Background discriminant
+    target_eff : float or list or np.ndarray
+        Working point which is used for discriminant calculation
+    return_cuts : bool
+        Specifies if the cut values corresponding to the provided WPs are returned.
+        If target_eff is a float, only one cut value will be returned. If target_eff
+        is an array, target_eff is an array as well.
+    sig_weights : np.ndarray
+        Weights for signal events
+    bkg_weights : np.ndarray
+        Weights for background events
+
+    Returns
+    -------
+    eff : float or np.ndarray
+        Efficiency.
+        Return float if target_eff is a float, else np.ndarray
+    cutvalue : float or np.ndarray
+        Cutvalue if return_cuts is True.
+        Return float if target_eff is a float, else np.ndarray
+    """
+    logger.debug("Calculating efficiency.")
+    logger.debug("sig_disc: %s", sig_disc)
+    logger.debug("bkg_disc: %s", bkg_disc)
+    logger.debug("target_eff: %s", target_eff)
+    logger.debug("return_cuts: %s", return_cuts)
+    logger.debug("sig_weights: %s", sig_weights)
+    logger.debug("bkg_weights: %s", bkg_weights)
+
+    # float | np.ndarray for both target_eff and the returned values
+    return_float = False
+    if isinstance(target_eff, float):
+        return_float = True
+
+    # Flatten the target efficiencies
+    target_eff = np.asarray([target_eff]).flatten()
+
+    # Get the cutvalue for the given target efficiency
+    cutvalue = weighted_percentile(arr=sig_disc, percentile=1.0 - target_eff, weights=sig_weights)
+
+    # Sort the cutvalues to get the correct order
+    sorted_args = np.argsort(1 - target_eff)
+
+    # Get the histogram for the backgrounds
+    hist, _ = np.histogram(bkg_disc, (-np.inf, *cutvalue[sorted_args], np.inf), weights=bkg_weights)
+
+    # Calculate the efficiencies for the calculated cut values
+    eff = hist[::-1].cumsum()[-2::-1] / hist.sum()
+    eff = eff[sorted_args]
+
+    # Ensure that a float is returned if float was given
+    if return_float:
+        eff = eff[0]
+        cutvalue = cutvalue[0]
+
+    # Also return the cuts if wanted
+    if return_cuts:
+        return eff, cutvalue
+
+    return eff
+
+
+def calculate_rejection(
+    sig_disc: np.ndarray,
+    bkg_disc: np.ndarray,
+    target_eff,
+    return_cuts: bool = False,
+    sig_weights: np.ndarray = None,
+    bkg_weights: np.ndarray = None,
+    smooth: bool = False,
+):
+    """Calculate rejection.
+
+    Parameters
+    ----------
+    sig_disc : np.ndarray
+        Signal discriminant
+    bkg_disc : np.ndarray
+        Background discriminant
+    target_eff : float or list
+        Working point which is used for discriminant calculation
+    return_cuts : bool
+        Specifies if the cut values corresponding to the provided WPs are returned.
+        If target_eff is a float, only one cut value will be returned. If target_eff
+        is an array, target_eff is an array as well.
+    sig_weights : np.ndarray
+        Weights for signal events, by default None
+    bkg_weights : np.ndarray
+        Weights for background events, by default None
+
+    Returns
+    -------
+    rej : float or np.ndarray
+        Rejection.
+        If target_eff is a float, a float is returned if it's a list a np.ndarray
+    cut_value : float or np.ndarray
+        Cutvalue if return_cuts is True.
+        If target_eff is a float, a float is returned if it's a list a np.ndarray
+    """
+    logger.debug("Calculating rejection.")
+    logger.debug("sig_disc: %s", sig_disc)
+    logger.debug("bkg_disc: %s", bkg_disc)
+    logger.debug("target_eff: %s", target_eff)
+    logger.debug("return_cuts: %s", return_cuts)
+    logger.debug("sig_weights: %s", sig_weights)
+    logger.debug("bkg_weights: %s", bkg_weights)
+    logger.debug("smooth: %s", smooth)
+
+    # Calculate the efficiency
+    eff = calculate_efficiency(
+        sig_disc=sig_disc,
+        bkg_disc=bkg_disc,
+        target_eff=target_eff,
+        return_cuts=return_cuts,
+        sig_weights=sig_weights,
+        bkg_weights=bkg_weights,
+    )
+
+    # Invert the efficiency to get a rejection
+    rej = save_divide(1, eff[0] if return_cuts else eff, np.inf)
+
+    # Smooth out the rejection if wanted
+    if smooth:
+        rej = gaussian_filter1d(rej, sigma=1, radius=2, mode="nearest")
+
+    # Return also the cut values if wanted
+    if return_cuts:
+        return rej, eff[1]
+
+    return rej
+
+
+def calculate_efficiency_error(
+    arr: np.ndarray,
+    n_counts: int,
+    suppress_zero_divison_error: bool = False,
+    norm: bool = False,
+) -> np.ndarray:
+    """Calculate statistical efficiency uncertainty.
+
+    Parameters
+    ----------
+    arr : numpy.array
+        Efficiency values
+    n_counts : int
+        Number of used statistics to calculate efficiency
+    suppress_zero_divison_error : bool
+        Not raising Error for zero division
+    norm : bool, optional
+        If True, normed (relative) error is being calculated, by default False
+
+    Returns
+    -------
+    numpy.array
+        Efficiency uncertainties
+
+    Raises
+    ------
+    ValueError
+        If n_counts <=0
+
+    Notes
+    -----
+    This method uses binomial errors as described in section 2.2 of
+    https://inspirehep.net/files/57287ac8e45a976ab423f3dd456af694
+    """
+    logger.debug("Calculating efficiency error.")
+    logger.debug("arr: %s", arr)
+    logger.debug("n_counts: %i", n_counts)
+    logger.debug("suppress_zero_divison_error: %s", suppress_zero_divison_error)
+    logger.debug("norm: %s", norm)
+    if np.any(n_counts <= 0) and not suppress_zero_divison_error:
+        raise ValueError(f"You passed as argument `N` {n_counts} but it has to be larger 0.")
+    if norm:
+        return np.sqrt(arr * (1 - arr) / n_counts) / arr
+    return np.sqrt(arr * (1 - arr) / n_counts)
+
+
+def calculate_rejection_error(
+    arr: np.ndarray,
+    n_counts: int,
+    norm: bool = False,
+) -> np.ndarray:
+    """Calculate the rejection uncertainties.
+
+    Parameters
+    ----------
+    arr : numpy.array
+        Rejection values
+    n_counts : int
+        Number of used statistics to calculate rejection
+    norm : bool, optional
+        If True, normed (relative) error is being calculated, by default False
+
+    Returns
+    -------
+    numpy.array
+        Rejection uncertainties
+
+    Raises
+    ------
+    ValueError
+        If n_counts <=0
+    ValueError
+        If any rejection value is 0
+
+    Notes
+    -----
+    Special case of `eff_err()`
+    """
+    logger.debug("Calculating rejection error.")
+    logger.debug("arr: %s", arr)
+    logger.debug("n_counts: %i", n_counts)
+    logger.debug("norm: %s", norm)
+    if np.any(n_counts <= 0):
+        raise ValueError(f"You passed as argument `n_counts` {n_counts} but it has to be larger 0.")
+    if np.any(arr == 0):
+        raise ValueError("One rejection value is 0, cannot calculate error.")
+    if norm:
+        return np.power(arr, 2) * calculate_efficiency_error(1 / arr, n_counts) / arr
+    return np.power(arr, 2) * calculate_efficiency_error(1 / arr, n_counts)
+
+
+def get_discriminant(
+    jets: np.ndarray,
+    tagger: str,
+    signal: Label,
+    flavours: LabelContainer,
+    fraction_values: dict[str, float],
+    epsilon: float = 1e-10,
+) -> np.ndarray:
+    """Calculate the tagging discriminant for a given tagger.
+
+    Calculated as the logarithm of the ratio of a specified signal probability
+    to a weighted sum ofbackground probabilities.
+
+    Parameters
+    ----------
+    jets : np.ndarray
+        Structured array of jets containing tagger outputs
+    tagger : str
+        Name of the tagger
+    signal : Label
+        Signal flavour (bjets/cjets or hbb/hcc)
+    fraction_values : dict
+        Dict with the fraction values for the background classes for the given tagger
+    epsilon : float, optional
+        Small number to avoid division by zero, by default 1e-10
+
+    Returns
+    -------
+    np.ndarray
+        Array of discriminant values.
+
+    Raises
+    ------
+    ValueError
+        If the signal flavour is not recognised.
+    """
+    # Init the denominator
+    denominator = 0.0
+
+    # Loop over background flavours
+    for flav in flavours:
+        # Skip signal flavour for denominator
+        if flav == signal:
+            continue
+
+        # Get the probability name of the tagger/flavour combo + fraction value
+        prob_name = f"{tagger}_{flav.px}"
+        fraction_value = fraction_values[flav.frac_str]
+
+        # If fraction_value for the given flavour is zero, skip it
+        if fraction_value == 0:
+            continue
+
+        # Check that the probability value for the flavour is available
+        if fraction_value > 0 and prob_name not in jets.dtype.names:
+            raise ValueError(
+                f"Nonzero fraction value for {flav.name}, but '{prob_name}' "
+                "not found in input array."
+            )
+
+        # Update denominator
+        denominator += jets[prob_name] * fraction_value if prob_name in jets.dtype.names else 0
+
+    # Calculate numerator
+    signal_field = f"{tagger}_{signal.px}"
+
+    # Check that the probability of the signal is available
+    if signal_field not in jets.dtype.names:
+        raise ValueError(
+            f"No signal probability value(s) found for tagger {tagger}. "
+            f"Missing variable: {signal_field}"
+        )
+
+    return np.log((jets[signal_field] + epsilon) / (denominator + epsilon))
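
For orientation, here is a minimal usage sketch (not part of the package diff) that exercises the new efficiency/rejection helpers with toy discriminant values. It assumes the functions are importable from ftag.utils.metrics, matching the added file layout; the inputs and numbers are purely illustrative.

    # Illustrative sketch only -- toy inputs, not taken from the package.
    import numpy as np

    from ftag.utils.metrics import calculate_efficiency, calculate_rejection

    rng = np.random.default_rng(42)
    sig_disc = rng.normal(2.0, 1.0, 10_000)  # toy signal discriminant values
    bkg_disc = rng.normal(0.0, 1.0, 10_000)  # toy background discriminant values

    # Background efficiency and discriminant cut at a 70% signal-efficiency working point
    bkg_eff, cut = calculate_efficiency(sig_disc, bkg_disc, target_eff=0.7, return_cuts=True)

    # Background rejection (1 / background efficiency) for a scan of working points
    rejections = calculate_rejection(sig_disc, bkg_disc, target_eff=[0.60, 0.70, 0.85])

    print(f"70% WP: background efficiency = {bkg_eff:.3f}, cut value = {cut:.3f}")
    print("rejections:", rejections)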