umami-preprocessing 0.2.0__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/PKG-INFO +3 -3
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/pyproject.toml +2 -2
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/umami_preprocessing.egg-info/PKG-INFO +3 -3
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/umami_preprocessing.egg-info/requires.txt +2 -2
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/__init__.py +2 -1
- umami_preprocessing-0.2.3/upp/classes/components.py +522 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/classes/preprocessing_config.py +1 -1
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/main.py +12 -3
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/stages/hist.py +51 -10
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/stages/interpolation.py +24 -13
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/stages/merging.py +81 -24
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/stages/normalisation.py +144 -17
- umami_preprocessing-0.2.3/upp/stages/resampling.py +513 -0
- umami_preprocessing-0.2.0/upp/classes/components.py +0 -226
- umami_preprocessing-0.2.0/upp/stages/resampling.py +0 -283
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/README.md +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/setup.cfg +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/umami_preprocessing.egg-info/SOURCES.txt +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/umami_preprocessing.egg-info/dependency_links.txt +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/umami_preprocessing.egg-info/entry_points.txt +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/umami_preprocessing.egg-info/top_level.txt +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/classes/__init__.py +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/classes/region.py +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/classes/resampling_config.py +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/classes/variable_config.py +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/logger.py +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/stages/__init__.py +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/stages/plot.py +0 -0
- {umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/upp/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: umami-preprocessing
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Preprocessing for jet tagging
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
|
|
@@ -10,8 +10,8 @@ Requires-Dist: pyyaml-include==1.3
|
|
|
10
10
|
Requires-Dist: PyYAML==6.0.1
|
|
11
11
|
Requires-Dist: rich==12.6.0
|
|
12
12
|
Requires-Dist: scipy==1.10.1
|
|
13
|
-
Requires-Dist: puma-hep==0.4.
|
|
14
|
-
Requires-Dist: atlas-ftag-tools==0.2.
|
|
13
|
+
Requires-Dist: puma-hep==0.4.2
|
|
14
|
+
Requires-Dist: atlas-ftag-tools==0.2.8
|
|
15
15
|
Requires-Dist: dotmap==1.3.30
|
|
16
16
|
Provides-Extra: dev
|
|
17
17
|
Requires-Dist: ruff==0.1.6; extra == "dev"
|
{umami_preprocessing-0.2.0 → umami_preprocessing-0.2.3}/umami_preprocessing.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: umami-preprocessing
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Preprocessing for jet tagging
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
|
|
@@ -10,8 +10,8 @@ Requires-Dist: pyyaml-include==1.3
|
|
|
10
10
|
Requires-Dist: PyYAML==6.0.1
|
|
11
11
|
Requires-Dist: rich==12.6.0
|
|
12
12
|
Requires-Dist: scipy==1.10.1
|
|
13
|
-
Requires-Dist: puma-hep==0.4.
|
|
14
|
-
Requires-Dist: atlas-ftag-tools==0.2.
|
|
13
|
+
Requires-Dist: puma-hep==0.4.2
|
|
14
|
+
Requires-Dist: atlas-ftag-tools==0.2.8
|
|
15
15
|
Requires-Dist: dotmap==1.3.30
|
|
16
16
|
Provides-Extra: dev
|
|
17
17
|
Requires-Dist: ruff==0.1.6; extra == "dev"
|
|
@@ -0,0 +1,522 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging as log
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from ftag import Cuts, Label, Sample
|
|
10
|
+
from ftag.hdf5 import H5Reader, H5Writer
|
|
11
|
+
|
|
12
|
+
from upp.classes.region import Region
|
|
13
|
+
from upp.stages.hist import Hist
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
16
|
+
from upp.classes.preprocessing_config import PreprocessingConfig
|
|
17
|
+
from upp.classes.variable_config import VariableConfig
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class Component:
    """
    Component class for the different components/flavours.

    It stores the needed information about the component and
    allows for certain features in terms of resampling.

    Parameters
    ----------
    region : Region
        Region instance of the region for which this instance is setup
    sample : Sample
        Sample instance of the sample for which this instance is setup
    flavour : Label
        Flavour for which this instance is setup
    global_cuts : Cuts
        Global cuts that should be applied for this component
    dirname : Path
        Directory of where this component is/will be stored
    num_jets : int
        Number of jets that are to be used from this component
    num_jets_estimate_available : int
        Number of jets passed to the reader when estimating the available
        jets. Values <= 0 are forwarded to the reader as None.
    equal_jets : bool
        If the same number of jets should be used from the different samples
    """

    region: Region
    sample: Sample
    flavour: Label
    global_cuts: Cuts
    dirname: Path
    num_jets: int
    num_jets_estimate_available: int
    equal_jets: bool

    def __post_init__(self):
        """Post init setup of internal variables."""
        # The histogram file lives in a "hists" folder two levels above dirname
        self.hist = Hist(self.dirname.parent.parent / "hists" / f"hist_{self.name}.h5")
        # -1 marks the unique jet count as "not yet read from file"
        self._unique_jets = -1
        self._complete = None
        self._ups_ratio = None
        self._ups_max = None
        self.sampling_fraction = None

    def setup_reader(
        self,
        batch_size: int,
        jets_name: str = "jets",
        fname: Path | str | list[Path | str] | None = None,
        **kwargs,
    ) -> None:
        """Set up the reader of the jets to load them from file.

        Parameters
        ----------
        batch_size : int
            Batch size that is used for loading from file
        jets_name : str, optional
            Name of the group in which the jets are stored, by default "jets"
        fname : Path | str | list[Path | str], optional
            Filename of the file(s) from which the jets are loaded. If None,
            the path of the sample is used, by default None
        **kwargs
            Further keyword arguments that are forwarded to H5Reader
        """
        if fname is None:
            fname = self.sample.path

        self.reader = H5Reader(
            fname=fname,
            batch_size=batch_size,
            jets_name=jets_name,
            equal_jets=self.equal_jets,
            **kwargs,
        )
        log.debug(f"Setup component reader at: {fname}")

    def setup_writer(self, variables: VariableConfig, jets_name: str = "jets") -> None:
        """Set up the writer of the jets to file.

        The reader must already be set up (see setup_reader), because the
        dtypes and shapes of the output are taken from it.

        Parameters
        ----------
        variables : VariableConfig
            Instance of VariableConfig in which the variables are stored.
        jets_name : str, optional
            Name of the group in which the jets are stored, by default "jets"
        """
        dtypes = self.reader.dtypes(variables.combined())
        shapes = self.reader.shapes(self.num_jets, variables.keys())
        self.writer = H5Writer(self.out_path, dtypes, shapes, jets_name=jets_name)
        log.debug(f"Setup component writer at: {self.out_path}")

    @property
    def name(self) -> str:
        """Return the name of this component.

        Returns
        -------
        str
            Name of the component, built as "<region>_<sample>_<flavour>"
        """
        return f"{self.region.name}_{self.sample.name}_{self.flavour.name}"

    @property
    def cuts(self) -> Cuts:
        """Return all cuts that are applied for this component.

        Returns
        -------
        Cuts
            Sum of the global, the flavour and the region cuts
        """
        return self.global_cuts + self.flavour.cuts + self.region.cuts

    @property
    def out_path(self) -> Path:
        """Return the output file path.

        Returns
        -------
        Path
            Output file path
        """
        return self.dirname / f"{self.name}.h5"

    def is_target(self, target_str: str) -> bool:
        """Check if the component is the target component for resampling.

        Parameters
        ----------
        target_str : str
            Target string to check against.

        Returns
        -------
        bool
            True if the flavour name of this component equals target_str.
        """
        return self.flavour.name == target_str

    def get_jets(self, variables: list, num_jets: int, cuts: Cuts | None = None) -> dict:
        """Load jets from file.

        Parameters
        ----------
        variables : list
            Variables that are to be loaded
        num_jets : int
            Number of jets that are to be loaded
        cuts : Cuts | None, optional
            Cuts instance of the cuts that should be applied on the jets, by default None

        Returns
        -------
        dict
            The entry of the jets group from what the reader loaded
        """
        jn = self.reader.jets_name
        return self.reader.load({jn: variables}, num_jets, cuts)[jn]

    def check_num_jets(
        self,
        num_req: int,
        sampling_fraction: float | None = None,
        cuts: Cuts | None = None,
        silent: bool = False,
        raise_error: bool = True,
    ) -> None:
        """Check the number of available jets.

        If more jets are requested than available, throw an Error.

        Parameters
        ----------
        num_req : int
            Number of requested jets
        sampling_fraction : float | None, optional
            Fraction of the estimated jets that may be used. If None (or 0),
            all estimated jets count as available, by default None
        cuts : Cuts | None, optional
            Cuts instance of the cuts that are to be applied on the jets, by default None
        silent : bool, optional
            Decide, if the debug and info log statements are printed, by default False
        raise_error : bool, optional
            Decide if the error should be raised if not enough jets are available,
            by default True

        Raises
        ------
        ValueError
            If more jets are requested than available
        """
        # Estimate how many jets are available after the cuts and sampling fraction
        num_est = (
            None if self.num_jets_estimate_available <= 0 else self.num_jets_estimate_available
        )
        total = self.reader.estimate_available_jets(cuts, num_est)
        available = total
        if sampling_fraction:
            available = int(total * sampling_fraction)

        # Fail early to avoid running out of jets midway through preprocessing
        if available < num_req and raise_error:
            raise ValueError(
                f"{num_req:,} jets requested, but only {total:,} are estimated to be"
                f" in {self}. With a sampling fraction of {sampling_fraction}, at most"
                f" {available:,} of these are available. You can either reduce the"
                " number of requested jets or increase the sampling fraction."
            )

        if not silent:
            log.debug(f"Sampling fraction {sampling_fraction}")
            log.info(
                f"Estimated {available:,} {self} jets available - {num_req:,} requested"
                f" ({self.reader.num_jets:,} in {self.sample})"
            )

    def get_auto_sampling_fraction(
        self,
        num_jets: int,
        cuts: Cuts | None = None,
        silent: bool = False,
    ) -> float:
        """Estimate the optimal/auto sampling fraction.

        Parameters
        ----------
        num_jets : int
            Number of jets that are requested
        cuts : Cuts | None, optional
            Cuts instance of the cuts that should be applied on the jets, by default None
        silent : bool, optional
            Decide, if the debug and info log statements are printed, by default False

        Returns
        -------
        float
            Automatically estimated sampling fraction, rounded to three digits

        Raises
        ------
        ValueError
            If no jets are estimated to be available, so that no fraction
            can be computed
        """
        num_est = (
            None if self.num_jets_estimate_available <= 0 else self.num_jets_estimate_available
        )
        total = self.reader.estimate_available_jets(cuts, num_est)
        # Guard the division below: a zero estimate would otherwise surface
        # as a bare ZeroDivisionError without any context
        if total <= 0:
            raise ValueError(
                f"No jets are estimated to be available in {self}, cannot compute"
                " an automatic sampling fraction."
            )
        auto_sampling_frac = round(1.1 * num_jets / total, 3)  # 1.1 is a tolerance factor
        if not silent:
            log.debug(f"optimal sampling fraction {auto_sampling_frac:.3f}")
        return auto_sampling_frac

    def __str__(self) -> str:
        """Return internal name of the component instance.

        Returns
        -------
        str
            Internal name of the component instance
        """
        return self.name

    @property
    def unique_jets(self) -> int:
        """Return the number of unique jets for this component.

        The value is read once from the "unique_jets" attribute of all
        underlying readers and cached afterwards.

        Returns
        -------
        int
            Number of unique jets for this component
        """
        if self._unique_jets == -1:
            self._unique_jets = sum(r.get_attr("unique_jets") for r in self.reader.readers)

        return self._unique_jets
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
class Components:
    """Components class to store and manage multiple Component instances."""

    def __init__(self, components: Components | list):
        """Init Components instance.

        Parameters
        ----------
        components : Components | list
            Collection of all Component instances that are to be managed.
        """
        self.components = components

    @classmethod
    def from_config(cls, config: PreprocessingConfig) -> Components:
        """Create Components instance from PreprocessingConfig instance.

        One Component is created per (region, sample, flavour) combination
        listed under the "components" key of the config.

        Parameters
        ----------
        config : PreprocessingConfig
            PreprocessingConfig instance with the loaded config file.

        Returns
        -------
        Components
            Components instance created from the PreprocessingConfig

        Raises
        ------
        AssertionError
            If "equal_jets" is set on the component instead of on its sample
        ValueError
            If a resampling method is configured and the flavour ratios are
            inconsistent between the regions
        """
        component_list = []
        for component in config.config["components"]:
            # Ensure equal_jets flag is correctly set
            assert (
                "equal_jets" not in component
            ), "equal_jets flag should be set in the sample config"

            # Get the region cuts (no region cuts are applied when config.is_test is set)
            region_cuts = (
                Cuts.empty() if config.is_test else Cuts.from_list(component["region"]["cuts"])
            )

            # Get the region and apply the region cuts
            region = Region(component["region"]["name"], region_cuts + config.global_cuts)

            # Load the pattern and the equal_jets settings
            pattern = component["sample"]["pattern"]
            equal_jets = component["sample"].get("equal_jets", True)
            if isinstance(pattern, list):
                pattern = tuple(pattern)

            # Create the Sample instance for the pattern
            sample = Sample(
                pattern=pattern,
                ntuple_dir=config.ntuple_dir,
                name=component["sample"]["name"],
            )

            # Create the Component instances for the different flavours
            for name in component["flavours"]:
                num_jets = component["num_jets"]
                # val/test default to a tenth of the training jets if not configured
                if config.split == "val":
                    num_jets = component.get("num_jets_val", num_jets // 10)
                elif config.split == "test":
                    num_jets = component.get("num_jets_test", num_jets // 10)
                component_list.append(
                    Component(
                        region=region,
                        sample=sample,
                        flavour=config.flavour_cont[name],
                        global_cuts=config.global_cuts,
                        dirname=config.components_dir,
                        num_jets=num_jets,
                        num_jets_estimate_available=config.num_jets_estimate_available,  # type: ignore
                        equal_jets=equal_jets,
                    )
                )
        components = cls(component_list)

        # Check the flavour ratios
        if config.sampl_cfg.method is not None:
            components.check_flavour_ratios()

        return components

    def check_flavour_ratios(self) -> None:
        """Check if the flavour ratios match between all regions.

        The first region is used as reference. Nothing is checked when no
        components are stored.

        Raises
        ------
        ValueError
            If inconsistent flavour ratios are found
        """
        ratios = {}
        flavours = self.flavours
        for region, components in self.groupby_region():
            ratios[region] = {
                f.name: components[f].num_jets / components.num_jets for f in flavours
            }

        # Without any region there is nothing to compare
        if not ratios:
            return

        ratio_items = iter(ratios.items())
        ref_region, ref = next(ratio_items)
        for region, ratio in ratio_items:
            if not np.allclose(list(ratio.values()), list(ref.values())):
                raise ValueError(
                    f"Found inconsistent flavour ratios: \n - {ref_region}: {ref} \n -"
                    f" {region}: {ratio}"
                )

    @property
    def regions(self) -> list[Region]:
        """Return the regions used.

        Returns
        -------
        list[Region]
            Unique regions of the stored components, in order of first appearance
        """
        return list(dict.fromkeys(c.region for c in self))

    @property
    def samples(self) -> list[Sample]:
        """Return the samples used.

        Returns
        -------
        list[Sample]
            Unique samples of the stored components, in order of first appearance
        """
        return list(dict.fromkeys(c.sample for c in self))

    @property
    def flavours(self) -> list[Label]:
        """Return the flavours used.

        Returns
        -------
        list[Label]
            Unique flavours of the stored components, in order of first appearance
        """
        return list(dict.fromkeys(c.flavour for c in self))

    @property
    def cuts(self) -> Cuts:
        """Return the cuts that are applied.

        Returns
        -------
        Cuts
            Sum of the cuts of all stored components
        """
        return sum((c.cuts for c in self), Cuts.from_list([]))

    @property
    def num_jets(self) -> int:
        """Return the number of jets to be used.

        Returns
        -------
        int
            Sum of num_jets over all stored components
        """
        return sum(c.num_jets for c in self)

    @property
    def unique_jets(self) -> int:
        """Return the number of unique jets available.

        Returns
        -------
        int
            Sum of unique_jets over all stored components
        """
        return sum(c.unique_jets for c in self)

    @property
    def out_dir(self) -> Path:
        """Return the common output directory of all stored components.

        Returns
        -------
        Path
            Parent directory of the output paths of the components

        Raises
        ------
        AssertionError
            If the components do not share exactly one output directory
        """
        out_dir = {c.out_path.parent for c in self}
        assert len(out_dir) == 1
        return next(iter(out_dir))

    @property
    def jet_counts(self) -> dict:
        """Return the jet counts per component and in total.

        Returns
        -------
        dict
            Dict mapping each component name (and "total") to a dict with
            the keys "num_jets" and "unique_jets"
        """
        num_dict = {
            c.name: {"num_jets": int(c.num_jets), "unique_jets": int(c.unique_jets)} for c in self
        }
        num_dict["total"] = {
            "num_jets": int(self.num_jets),
            "unique_jets": int(self.unique_jets),
        }
        return num_dict

    @property
    def dsids(self) -> list[str]:
        """Return the DSIDs used.

        Returns
        -------
        list[str]
            List of used DSIDs without duplicates (order is not defined)
        """
        return list({dsid for c in self for dsid in c.sample.dsid})

    def groupby_region(self) -> list[tuple]:
        """Return the components grouped by region.

        Returns
        -------
        list[tuple]
            List of tuples in the form of (Region, Components)
        """
        return [(r, Components([c for c in self if c.region == r])) for r in self.regions]

    def groupby_sample(self) -> list[tuple]:
        """Return the components grouped by sample.

        Returns
        -------
        list[tuple]
            List of tuples in the form of (Sample, Components)
        """
        return [(s, Components([c for c in self if c.sample == s])) for s in self.samples]

    def __iter__(self):
        """Iterate over the stored Component instances."""
        yield from self.components

    def __getitem__(self, index):
        """Return a component by position or by flavour.

        Parameters
        ----------
        index : int | str | Label
            Position of the component, or flavour whose position in
            self.flavours is used to look up the component

        Returns
        -------
        Component
            The selected component

        Raises
        ------
        TypeError
            If index is neither an int, a str nor a Label
        """
        if isinstance(index, int):
            return self.components[index]
        if isinstance(index, (str, Label)):
            # NOTE(review): this maps the flavour position onto the component
            # list, which presumably assumes one component per flavour (e.g.
            # after grouping by region) - confirm against callers
            return self.components[self.flavours.index(index)]
        raise TypeError(
            f"Components indices must be int, str or Label, not {type(index).__name__}"
        )

    def __len__(self):
        """Return the number of stored components."""
        return len(self.components)
|
|
@@ -37,7 +37,7 @@ class PreprocessingConfig:
|
|
|
37
37
|
"""
|
|
38
38
|
Global options for the preprocessing.
|
|
39
39
|
|
|
40
|
-
These
|
|
40
|
+
These options are specified in the config file
|
|
41
41
|
under the `global:` key. They are passed as kwargs to PreprocessingConfig.
|
|
42
42
|
The config file is also copied to the output directory.
|
|
43
43
|
|
|
@@ -41,10 +41,16 @@ def parse_args(args):
|
|
|
41
41
|
parser.add_argument("--no-plot", dest="plot", action="store_false")
|
|
42
42
|
splits = ["train", "val", "test", "all"]
|
|
43
43
|
parser.add_argument("--split", default="train", choices=splits, help="Which file to produce")
|
|
44
|
+
parser.add_argument(
|
|
45
|
+
"--component", default=None, help="Component which is processed during --prep"
|
|
46
|
+
)
|
|
47
|
+
parser.add_argument(
|
|
48
|
+
"--region", default=None, help="Region which is processed during --resample"
|
|
49
|
+
)
|
|
44
50
|
|
|
45
51
|
args = parser.parse_args(args)
|
|
46
52
|
d = vars(args)
|
|
47
|
-
ignore = ["config", "split"]
|
|
53
|
+
ignore = ["config", "split", "component", "region"]
|
|
48
54
|
if not any(v for a, v in d.items() if a not in ignore):
|
|
49
55
|
for v in d:
|
|
50
56
|
if v not in ignore and d[v] is None:
|
|
@@ -65,12 +71,15 @@ def run_pp(args) -> None:
|
|
|
65
71
|
|
|
66
72
|
# create virtual datasets and pdf files
|
|
67
73
|
if args.prep and args.split == "train":
|
|
68
|
-
create_histograms(
|
|
74
|
+
create_histograms(
|
|
75
|
+
config=config,
|
|
76
|
+
component_to_run=args.component,
|
|
77
|
+
)
|
|
69
78
|
|
|
70
79
|
# run the resampling
|
|
71
80
|
if args.resample:
|
|
72
81
|
resampling = Resampling(config)
|
|
73
|
-
resampling.run()
|
|
82
|
+
resampling.run(region=args.region)
|
|
74
83
|
|
|
75
84
|
# run the merging
|
|
76
85
|
if args.merge:
|