atlas-ftag-tools 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ftag/utils/metrics.py ADDED
@@ -0,0 +1,431 @@
1
+ """Tools for metrics module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+ from scipy.ndimage import gaussian_filter1d
9
+
10
+ from ftag.utils import logger
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from ftag.labels import Label, LabelContainer
14
+
15
+
16
def save_divide(
    numerator: np.ndarray | float,
    denominator: np.ndarray | float,
    default: float = 1.0,
):
    """Safe divide returning a default where the denominator is 0.

    Division using the numpy divide function. Entries whose denominator is 0
    are not divided; they keep the ``default`` value instead.

    Parameters
    ----------
    numerator: np.ndarray | float,
        Numerator in the ratio calculation.
    denominator: np.ndarray | float,
        Denominator in the ratio calculation.
    default: float
        Default value which is returned if denominator is 0.

    Returns
    -------
    ratio: np.ndarray | float
        Result of the division. A python float is returned if both inputs are
        scalars (int, float or numpy number), otherwise an array with the
        broadcast shape of numerator and denominator.
    """
    logger.debug("Calculating save division.")
    logger.debug("numerator: %s", numerator)
    logger.debug("denominator: %s", denominator)
    logger.debug("default: %s", default)

    scalar_types = (int, float, np.number)
    scalar_inputs = isinstance(numerator, scalar_types) and isinstance(denominator, scalar_types)

    # Work on arrays so that lists are accepted and the zero mask below is
    # always evaluated element-wise.
    num = np.asarray(numerator)
    den = np.asarray(denominator)

    # Pre-fill the output with the default: np.divide leaves the entries that
    # are masked out by `where` untouched. The broadcast shape is used so that
    # inputs of different (but compatible) shapes are supported.
    out = np.full(np.broadcast(num, den).shape, default, dtype=float)
    ratio = np.divide(num, den, out=out, where=(den != 0))

    if scalar_inputs:
        # `ratio` is 0-dimensional here, so the conversion is well-defined
        return float(ratio)
    return ratio
68
+
69
+
70
def weighted_percentile(
    arr: np.ndarray,
    percentile: np.ndarray,
    weights: np.ndarray | None = None,
):
    """Calculate weighted percentile.

    Implementation according to https://stackoverflow.com/a/29677616/11509698
    (https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method)

    Parameters
    ----------
    arr : np.ndarray
        Data array
    percentile : np.ndarray
        Percentile array, given as fractions on the normalised cumulative
        weight scale (0 maps to the smallest value, 1 to the largest)
    weights : np.ndarray | None
        Weights array, by default None (all entries are weighted equally)

    Returns
    -------
    np.ndarray
        Weighted percentile array
    """
    logger.debug("Calculating weighted percentile.")
    logger.debug("arr: %s", arr)
    logger.debug("percentile: %s", percentile)
    logger.debug("weights: %s", weights)

    # Accept array-likes; the sorting below relies on numpy fancy indexing
    arr = np.asarray(arr)

    # Set weights to one if no weights are given
    weights = np.ones_like(arr) if weights is None else np.asarray(weights)

    # A single value is every percentile of itself. Handled explicitly because
    # the normalisation below would divide by zero and return NaN.
    if arr.size == 1:
        only_value = arr.reshape(-1)[0]
        return np.interp(percentile, [0.0, 1.0], [only_value, only_value])

    # Set dtype to float64 if the weights are too large
    dtype = np.float64 if np.sum(weights) > 1000000 else np.float32

    # Get an array sorting and sort the array and the weights
    ix = np.argsort(arr)
    arr = arr[ix]
    weights = weights[ix]

    # Cumulative weights evaluated at the centre of each entry, then shifted
    # and scaled so that the first entry sits at 0 and the last one at 1
    cdf = np.cumsum(weights, dtype=dtype) - 0.5 * weights
    cdf -= cdf[0]
    cdf /= cdf[-1]

    # Return the linear interpolation
    return np.interp(percentile, cdf, arr)
118
+
119
+
120
def calculate_efficiency(
    sig_disc: np.ndarray,
    bkg_disc: np.ndarray,
    target_eff: float | list | np.ndarray,
    return_cuts: bool = False,
    sig_weights: np.ndarray | None = None,
    bkg_weights: np.ndarray | None = None,
):
    """Calculate efficiency.

    The cut values are derived from the signal discriminant such that the
    requested signal efficiencies are met. The returned efficiency is the
    (weighted) fraction of the background discriminant at or above these cuts.

    Parameters
    ----------
    sig_disc : np.ndarray
        Signal discriminant
    bkg_disc : np.ndarray
        Background discriminant
    target_eff : float or list or np.ndarray
        Working point which is used for discriminant calculation
    return_cuts : bool
        Specifies if the cut values corresponding to the provided WPs are returned.
        If target_eff is a float, only one cut value will be returned. If target_eff
        is an array, the cut values are returned as an array as well.
    sig_weights : np.ndarray
        Weights for signal events
    bkg_weights : np.ndarray
        Weights for background events

    Returns
    -------
    eff : float or np.ndarray
        Efficiency, in the same order as the entries of target_eff.
        Return float if target_eff is a float, else np.ndarray
    cutvalue : float or np.ndarray
        Cutvalue if return_cuts is True.
        Return float if target_eff is a float, else np.ndarray
    """
    logger.debug("Calculating efficiency.")
    logger.debug("sig_disc: %s", sig_disc)
    logger.debug("bkg_disc: %s", bkg_disc)
    logger.debug("target_eff: %s", target_eff)
    logger.debug("return_cuts: %s", return_cuts)
    logger.debug("sig_weights: %s", sig_weights)
    logger.debug("bkg_weights: %s", bkg_weights)

    # float | np.ndarray for both target_eff and the returned values
    return_float = isinstance(target_eff, float)

    # Flatten the target efficiencies
    target_eff = np.asarray([target_eff]).flatten()

    # Get the cutvalue for the given target efficiency
    cutvalue = weighted_percentile(arr=sig_disc, percentile=1.0 - target_eff, weights=sig_weights)

    # The histogram edges must be ascending, so order the cuts accordingly
    sorted_args = np.argsort(1 - target_eff)

    # Get the histogram for the backgrounds
    hist, _ = np.histogram(bkg_disc, (-np.inf, *cutvalue[sorted_args], np.inf), weights=bkg_weights)

    # Fraction of background at or above each (ascending) cut value
    eff_sorted = hist[::-1].cumsum()[-2::-1] / hist.sum()

    # Undo the sorting: entry i of eff_sorted belongs to target sorted_args[i].
    # Indexing with sorted_args a second time would only be correct if the
    # permutation happened to be its own inverse (identity or reversal).
    eff = np.empty_like(eff_sorted)
    eff[sorted_args] = eff_sorted

    # Ensure that a float is returned if float was given
    if return_float:
        eff = eff[0]
        cutvalue = cutvalue[0]

    # Also return the cuts if wanted
    if return_cuts:
        return eff, cutvalue

    return eff
195
+
196
+
197
def calculate_rejection(
    sig_disc: np.ndarray,
    bkg_disc: np.ndarray,
    target_eff,
    return_cuts: bool = False,
    sig_weights: np.ndarray | None = None,
    bkg_weights: np.ndarray | None = None,
    smooth: bool = False,
):
    """Calculate rejection.

    The rejection is the inverse of the background efficiency obtained from
    `calculate_efficiency`. A background efficiency of 0 gives a rejection of
    infinity.

    Parameters
    ----------
    sig_disc : np.ndarray
        Signal discriminant
    bkg_disc : np.ndarray
        Background discriminant
    target_eff : float or list
        Working point which is used for discriminant calculation
    return_cuts : bool
        Specifies if the cut values corresponding to the provided WPs are returned.
        If target_eff is a float, only one cut value will be returned. If target_eff
        is an array, the cut values are returned as an array as well.
    sig_weights : np.ndarray
        Weights for signal events, by default None
    bkg_weights : np.ndarray
        Weights for background events, by default None
    smooth : bool
        If True, the rejection values are smoothed with a gaussian filter
        (sigma=1, radius=2) along the working points, by default False.
        A single (scalar) rejection value is returned unsmoothed.

    Returns
    -------
    rej : float or np.ndarray
        Rejection.
        If target_eff is a float, a float is returned if it's a list a np.ndarray
    cut_value : float or np.ndarray
        Cutvalue if return_cuts is True.
        If target_eff is a float, a float is returned if it's a list a np.ndarray
    """
    logger.debug("Calculating rejection.")
    logger.debug("sig_disc: %s", sig_disc)
    logger.debug("bkg_disc: %s", bkg_disc)
    logger.debug("target_eff: %s", target_eff)
    logger.debug("return_cuts: %s", return_cuts)
    logger.debug("sig_weights: %s", sig_weights)
    logger.debug("bkg_weights: %s", bkg_weights)
    logger.debug("smooth: %s", smooth)

    # Calculate the efficiency (and the cut values, if requested)
    result = calculate_efficiency(
        sig_disc=sig_disc,
        bkg_disc=bkg_disc,
        target_eff=target_eff,
        return_cuts=return_cuts,
        sig_weights=sig_weights,
        bkg_weights=bkg_weights,
    )
    if return_cuts:
        eff, cutvalue = result
    else:
        eff, cutvalue = result, None

    # Invert the efficiency to get a rejection
    rej = save_divide(1, eff, np.inf)

    # Smooth out the rejection if wanted. The filter runs along an array
    # axis, so a scalar rejection (float target_eff) is left untouched.
    if smooth and np.ndim(rej) > 0:
        rej = gaussian_filter1d(rej, sigma=1, radius=2, mode="nearest")

    # Return also the cut values if wanted
    if return_cuts:
        return rej, cutvalue

    return rej
265
+
266
+
267
def calculate_efficiency_error(
    arr: np.ndarray,
    n_counts: int,
    suppress_zero_divison_error: bool = False,
    norm: bool = False,
) -> np.ndarray:
    """Calculate statistical efficiency uncertainty.

    Parameters
    ----------
    arr : numpy.array
        Efficiency values
    n_counts : int
        Number of used statistics to calculate efficiency
    suppress_zero_divison_error : bool
        Not raising Error for zero division
    norm : bool, optional
        If True, normed (relative) error is being calculated, by default False

    Returns
    -------
    numpy.array
        Efficiency uncertainties

    Raises
    ------
    ValueError
        If n_counts <=0 and suppress_zero_divison_error is False

    Notes
    -----
    This method uses binomial errors as described in section 2.2 of
    https://inspirehep.net/files/57287ac8e45a976ab423f3dd456af694
    """
    logger.debug("Calculating efficiency error.")
    logger.debug("arr: %s", arr)
    # %s instead of %i: the check below also accepts arrays of counts, which
    # cannot be formatted as a single integer
    logger.debug("n_counts: %s", n_counts)
    logger.debug("suppress_zero_divison_error: %s", suppress_zero_divison_error)
    logger.debug("norm: %s", norm)

    # Accept array-likes for both the efficiencies and the counts
    arr = np.asarray(arr)
    counts = np.asarray(n_counts)

    if np.any(counts <= 0) and not suppress_zero_divison_error:
        raise ValueError(f"You passed as argument `n_counts` {n_counts} but it has to be larger 0.")

    # Binomial uncertainty
    error = np.sqrt(arr * (1 - arr) / counts)
    if norm:
        return error / arr
    return error
311
+
312
+
313
def calculate_rejection_error(
    arr: np.ndarray,
    n_counts: int,
    norm: bool = False,
) -> np.ndarray:
    """Calculate the rejection uncertainties.

    Parameters
    ----------
    arr : numpy.array
        Rejection values
    n_counts : int
        Number of used statistics to calculate rejection
    norm : bool, optional
        If True, normed (relative) error is being calculated, by default False

    Returns
    -------
    numpy.array
        Rejection uncertainties

    Raises
    ------
    ValueError
        If n_counts <=0
    ValueError
        If any rejection value is 0

    Notes
    -----
    Special case of `calculate_efficiency_error()`: the uncertainty of the
    efficiency ``1 / arr`` is scaled by ``arr**2``.
    """
    logger.debug("Calculating rejection error.")
    logger.debug("arr: %s", arr)
    # %s instead of %i: the check below also accepts arrays of counts, which
    # cannot be formatted as a single integer
    logger.debug("n_counts: %s", n_counts)
    logger.debug("norm: %s", norm)

    # Accept array-likes for the rejection values
    arr = np.asarray(arr)

    if np.any(np.asarray(n_counts) <= 0):
        raise ValueError(f"You passed as argument `n_counts` {n_counts} but it has to be larger 0.")
    if np.any(arr == 0):
        raise ValueError("One rejection value is 0, cannot calculate error.")

    # Uncertainty of the efficiency 1 / rej, scaled by rej^2
    rej_error = np.power(arr, 2) * calculate_efficiency_error(1 / arr, n_counts)
    if norm:
        return rej_error / arr
    return rej_error
356
+
357
+
358
def get_discriminant(
    jets: np.ndarray,
    tagger: str,
    signal: Label,
    flavours: LabelContainer,
    fraction_values: dict[str, float],
    epsilon: float = 1e-10,
) -> np.ndarray:
    """Calculate the tagging discriminant for a given tagger.

    Calculated as the logarithm of the ratio of a specified signal probability
    to a weighted sum of background probabilities.

    Parameters
    ----------
    jets : np.ndarray
        Structured array of jets containing tagger outputs
    tagger : str
        Name of the tagger
    signal : Label
        Signal flavour (bjets/cjets or hbb/hcc)
    flavours : LabelContainer
        Flavours to loop over. Every flavour which is not equal to the signal
        is treated as background and enters the denominator.
    fraction_values : dict
        Dict with the fraction values for the background classes for the given
        tagger, keyed by the ``frac_str`` of each flavour
    epsilon : float, optional
        Small number to avoid division by zero, by default 1e-10

    Returns
    -------
    np.ndarray
        Array of discriminant values.

    Raises
    ------
    ValueError
        If a background flavour has a nonzero fraction value but its
        probability is not found in the input array.
    ValueError
        If the signal probability is not found in the input array.
    """
    # Field names available in the structured input array
    available = jets.dtype.names

    # Init the denominator
    denominator = 0.0

    # Loop over background flavours
    for flav in flavours:
        # Skip signal flavour for denominator
        if flav == signal:
            continue

        # If fraction_value for the given flavour is zero, skip it. Its
        # probability does not need to be present in the input in that case.
        fraction_value = fraction_values[flav.frac_str]
        if fraction_value == 0:
            continue

        # Check that the probability value for the flavour is available
        prob_name = f"{tagger}_{flav.px}"
        if prob_name not in available:
            raise ValueError(
                f"Nonzero fraction value for {flav.name}, but '{prob_name}' "
                "not found in input array."
            )

        # Update denominator
        denominator += jets[prob_name] * fraction_value

    # Calculate numerator
    signal_field = f"{tagger}_{signal.px}"

    # Check that the probability of the signal is available
    if signal_field not in available:
        raise ValueError(
            f"No signal probability value(s) found for tagger {tagger}. "
            f"Missing variable: {signal_field}"
        )

    return np.log((jets[signal_field] + epsilon) / (denominator + epsilon))