umami-preprocessing 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {umami_preprocessing-0.3.0/umami_preprocessing.egg-info → umami_preprocessing-0.3.1}/PKG-INFO +15 -17
  2. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/README.md +2 -1
  3. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/pyproject.toml +18 -7
  4. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1/umami_preprocessing.egg-info}/PKG-INFO +15 -17
  5. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/umami_preprocessing.egg-info/SOURCES.txt +1 -0
  6. umami_preprocessing-0.3.1/umami_preprocessing.egg-info/requires.txt +8 -0
  7. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/__init__.py +1 -1
  8. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/classes/__init__.py +2 -0
  9. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/classes/components.py +15 -6
  10. umami_preprocessing-0.3.1/upp/classes/plotting_config.py +121 -0
  11. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/classes/preprocessing_config.py +47 -3
  12. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/classes/resampling_config.py +4 -3
  13. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/main.py +13 -7
  14. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/hist.py +5 -1
  15. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/merging.py +21 -12
  16. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/normalisation.py +6 -2
  17. umami_preprocessing-0.3.1/upp/stages/plot.py +762 -0
  18. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/resampling.py +68 -36
  19. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/reweight.py +44 -23
  20. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/rw_merge.py +10 -15
  21. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/split_containers.py +1 -1
  22. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/utils/check_input_samples.py +1 -0
  23. umami_preprocessing-0.3.0/umami_preprocessing.egg-info/requires.txt +0 -20
  24. umami_preprocessing-0.3.0/upp/stages/plot.py +0 -209
  25. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/LICENSE +0 -0
  26. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/MANIFEST.in +0 -0
  27. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/setup.cfg +0 -0
  28. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/umami_preprocessing.egg-info/dependency_links.txt +0 -0
  29. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/umami_preprocessing.egg-info/entry_points.txt +0 -0
  30. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/umami_preprocessing.egg-info/top_level.txt +0 -0
  31. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/classes/region.py +0 -0
  32. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/classes/reweight_config.py +0 -0
  33. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/classes/variable_config.py +0 -0
  34. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/__init__.py +7 -7
  35. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/stages/interpolation.py +0 -0
  36. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/utils/__init__.py +1 -1
  37. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/utils/logger.py +0 -0
  38. {umami_preprocessing-0.3.0 → umami_preprocessing-0.3.1}/upp/utils/tools.py +0 -0
@@ -1,36 +1,34 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: umami-preprocessing
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: ATLAS Flavour Tagging Preprocessing - Umami PreProcessing (UPP)
5
5
  Author: Alexander Froch
6
- License: MIT
6
+ License-Expression: Apache-2.0
7
7
  Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
8
8
  Project-URL: Issue Tracker, https://github.com/umami-hep/umami-preprocessing/issues
9
- Requires-Python: <3.12,>=3.10
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Classifier: Topic :: Scientific/Engineering :: Physics
17
+ Requires-Python: <3.15,>=3.11
10
18
  Description-Content-Type: text/markdown
11
19
  License-File: LICENSE
12
- Requires-Dist: atlas-ftag-tools==0.3.1
20
+ Requires-Dist: atlas-ftag-tools==0.3.3
13
21
  Requires-Dist: dotmap>=1.3.30
14
22
  Requires-Dist: numpy>=2.2.6
15
- Requires-Dist: puma-hep==0.5.1
23
+ Requires-Dist: puma-hep==0.5.3
16
24
  Requires-Dist: pyyaml-include==1.3
17
25
  Requires-Dist: PyYAML>=6.0.2
18
26
  Requires-Dist: rich>=14.1.0
19
27
  Requires-Dist: scipy>=1.15.3
20
- Provides-Extra: dev
21
- Requires-Dist: coverage>=7.10.6; extra == "dev"
22
- Requires-Dist: ipykernel>=6.30.1; extra == "dev"
23
- Requires-Dist: mypy>=1.18.1; extra == "dev"
24
- Requires-Dist: pre-commit>=4.3.0; extra == "dev"
25
- Requires-Dist: pydoclint>=0.7.3; extra == "dev"
26
- Requires-Dist: pytest_notebook>=0.10.0; extra == "dev"
27
- Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
28
- Requires-Dist: pytest-randomly>=4.0.1; extra == "dev"
29
- Requires-Dist: pytest>=8.4.2; extra == "dev"
30
- Requires-Dist: ruff>=0.13.0; extra == "dev"
31
28
  Dynamic: license-file
32
29
 
33
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
30
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
31
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
34
32
  [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)
35
33
  [![PyPI version](https://badge.fury.io/py/umami-preprocessing.svg)](https://badge.fury.io/py/umami-preprocessing)
36
34
  [![docs](https://img.shields.io/badge/info-documentation-informational)](https://umami-hep.github.io/umami-preprocessing//)
@@ -1,4 +1,5 @@
1
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
1
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
2
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
2
3
  [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)
3
4
  [![PyPI version](https://badge.fury.io/py/umami-preprocessing.svg)](https://badge.fury.io/py/umami-preprocessing)
4
5
  [![docs](https://img.shields.io/badge/info-documentation-informational)](https://umami-hep.github.io/umami-preprocessing//)
@@ -3,29 +3,39 @@ name = "umami-preprocessing"
3
3
  description = "ATLAS Flavour Tagging Preprocessing - Umami PreProcessing (UPP)"
4
4
  authors = [{name="Alexander Froch"}]
5
5
  dynamic = ["version"]
6
- license = {text = "MIT"}
6
+ license = "Apache-2.0"
7
+ license-files = ["LICENSE"]
7
8
  readme = "README.md"
8
- requires-python = ">=3.10,<3.12"
9
+ requires-python = ">=3.11,<3.15"
10
+ classifiers = [
11
+ "Development Status :: 4 - Beta",
12
+ "Intended Audience :: Science/Research",
13
+ "Programming Language :: Python :: 3",
14
+ "Programming Language :: Python :: 3.11",
15
+ "Programming Language :: Python :: 3.12",
16
+ "Programming Language :: Python :: 3.13",
17
+ "Programming Language :: Python :: 3.14",
18
+ "Topic :: Scientific/Engineering :: Physics",
19
+ ]
9
20
 
10
21
  dependencies = [
11
- "atlas-ftag-tools==0.3.1",
22
+ "atlas-ftag-tools==0.3.3",
12
23
  "dotmap>=1.3.30",
13
24
  "numpy>=2.2.6",
14
- "puma-hep==0.5.1",
25
+ "puma-hep==0.5.3",
15
26
  "pyyaml-include==1.3",
16
27
  "PyYAML>=6.0.2",
17
28
  "rich>=14.1.0",
18
29
  "scipy>=1.15.3",
19
30
  ]
20
31
 
21
- [project.optional-dependencies]
32
+ [dependency-groups]
22
33
  dev = [
23
34
  "coverage>=7.10.6",
24
35
  "ipykernel>=6.30.1",
25
36
  "mypy>=1.18.1",
26
37
  "pre-commit>=4.3.0",
27
38
  "pydoclint>=0.7.3",
28
- "pytest_notebook>=0.10.0",
29
39
  "pytest-cov>=7.0.0",
30
40
  "pytest-randomly>=4.0.1",
31
41
  "pytest>=8.4.2",
@@ -48,10 +58,11 @@ include-package-data = true
48
58
  version = {attr = "upp.__version__"}
49
59
 
50
60
  [build-system]
51
- requires = ["setuptools>=62"]
61
+ requires = ["setuptools>=77"]
52
62
  build-backend = "setuptools.build_meta"
53
63
 
54
64
  [tool.ruff]
65
+ target-version = "py311"
55
66
  lint.select = ["I", "E", "W", "F", "B", "UP", "ARG", "SIM", "TID", "RUF", "D2", "D3", "D4"]
56
67
  lint.ignore = ["RUF005"]
57
68
  line-length = 100
@@ -1,36 +1,34 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: umami-preprocessing
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: ATLAS Flavour Tagging Preprocessing - Umami PreProcessing (UPP)
5
5
  Author: Alexander Froch
6
- License: MIT
6
+ License-Expression: Apache-2.0
7
7
  Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
8
8
  Project-URL: Issue Tracker, https://github.com/umami-hep/umami-preprocessing/issues
9
- Requires-Python: <3.12,>=3.10
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Classifier: Topic :: Scientific/Engineering :: Physics
17
+ Requires-Python: <3.15,>=3.11
10
18
  Description-Content-Type: text/markdown
11
19
  License-File: LICENSE
12
- Requires-Dist: atlas-ftag-tools==0.3.1
20
+ Requires-Dist: atlas-ftag-tools==0.3.3
13
21
  Requires-Dist: dotmap>=1.3.30
14
22
  Requires-Dist: numpy>=2.2.6
15
- Requires-Dist: puma-hep==0.5.1
23
+ Requires-Dist: puma-hep==0.5.3
16
24
  Requires-Dist: pyyaml-include==1.3
17
25
  Requires-Dist: PyYAML>=6.0.2
18
26
  Requires-Dist: rich>=14.1.0
19
27
  Requires-Dist: scipy>=1.15.3
20
- Provides-Extra: dev
21
- Requires-Dist: coverage>=7.10.6; extra == "dev"
22
- Requires-Dist: ipykernel>=6.30.1; extra == "dev"
23
- Requires-Dist: mypy>=1.18.1; extra == "dev"
24
- Requires-Dist: pre-commit>=4.3.0; extra == "dev"
25
- Requires-Dist: pydoclint>=0.7.3; extra == "dev"
26
- Requires-Dist: pytest_notebook>=0.10.0; extra == "dev"
27
- Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
28
- Requires-Dist: pytest-randomly>=4.0.1; extra == "dev"
29
- Requires-Dist: pytest>=8.4.2; extra == "dev"
30
- Requires-Dist: ruff>=0.13.0; extra == "dev"
31
28
  Dynamic: license-file
32
29
 
33
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
30
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
31
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
34
32
  [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)
35
33
  [![PyPI version](https://badge.fury.io/py/umami-preprocessing.svg)](https://badge.fury.io/py/umami-preprocessing)
36
34
  [![docs](https://img.shields.io/badge/info-documentation-informational)](https://umami-hep.github.io/umami-preprocessing//)
@@ -12,6 +12,7 @@ upp/__init__.py
12
12
  upp/main.py
13
13
  upp/classes/__init__.py
14
14
  upp/classes/components.py
15
+ upp/classes/plotting_config.py
15
16
  upp/classes/preprocessing_config.py
16
17
  upp/classes/region.py
17
18
  upp/classes/resampling_config.py
@@ -0,0 +1,8 @@
1
+ atlas-ftag-tools==0.3.3
2
+ dotmap>=1.3.30
3
+ numpy>=2.2.6
4
+ puma-hep==0.5.3
5
+ pyyaml-include==1.3
6
+ PyYAML>=6.0.2
7
+ rich>=14.1.0
8
+ scipy>=1.15.3
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "v0.3.0"
5
+ __version__ = "v0.3.1"
6
6
 
7
7
  from . import classes, stages, utils
8
8
  from .main import run_pp
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from upp.classes.components import Component, Components
6
+ from upp.classes.plotting_config import PlottingConfig
6
7
  from upp.classes.preprocessing_config import PreprocessingConfig
7
8
  from upp.classes.region import Region
8
9
  from upp.classes.resampling_config import ResamplingConfig
@@ -11,6 +12,7 @@ from upp.classes.variable_config import VariableConfig
11
12
  __all__ = [
12
13
  "Component",
13
14
  "Components",
15
+ "PlottingConfig",
14
16
  "PreprocessingConfig",
15
17
  "Region",
16
18
  "ResamplingConfig",
@@ -86,6 +86,9 @@ class Component:
86
86
  if fname is None:
87
87
  fname = self.sample.path
88
88
 
89
+ if "vds_dir" not in kwargs and self.sample.vds_dir is not None:
90
+ kwargs["vds_dir"] = self.sample.vds_dir
91
+
89
92
  self.reader = H5Reader(
90
93
  fname=fname,
91
94
  batch_size=batch_size,
@@ -106,7 +109,8 @@ class Component:
106
109
  Name of the group in which the jets are stored, by default "jets"
107
110
  """
108
111
  dtypes = self.reader.dtypes(variables.combined())
109
- shapes = self.reader.shapes(self.num_jets, variables.keys())
112
+ # num_jets == -1 ("write all") -> 0 leading dim so the writer grows dynamically
113
+ shapes = self.reader.shapes(max(self.num_jets, 0), variables.keys())
110
114
  self.writer = H5Writer(self.out_path, dtypes, shapes, jets_name=jets_name)
111
115
  log.debug(f"Setup component writer at: {self.out_path}")
112
116
 
@@ -209,6 +213,10 @@ class Component:
209
213
  ValueError
210
214
  If more jets are requested than available
211
215
  """
216
+ # num_req < 0 means "use all available jets" - nothing to check
217
+ if num_req < 0:
218
+ return
219
+
212
220
  # Check if num_jets jets are available after the cuts and sampling fraction
213
221
  num_est = (
214
222
  None if self.num_jets_estimate_available <= 0 else self.num_jets_estimate_available
@@ -313,9 +321,9 @@ class Components:
313
321
  component_list = []
314
322
  for component in config.config["components"]:
315
323
  # Ensure equal_jets flag is correctly set
316
- assert (
317
- "equal_jets" not in component
318
- ), "equal_jets flag should be set in the sample config"
324
+ assert "equal_jets" not in component, (
325
+ "equal_jets flag should be set in the sample config"
326
+ )
319
327
 
320
328
  # Get the region cuts
321
329
  region_cuts = (
@@ -337,6 +345,7 @@ class Components:
337
345
  ntuple_dir=config.ntuple_dir,
338
346
  name=component["sample"]["name"],
339
347
  skip_checks=config.skip_checks,
348
+ vds_dir=config.vds_dir,
340
349
  )
341
350
 
342
351
  # Create the Component instances for the different flavours
@@ -360,8 +369,8 @@ class Components:
360
369
  )
361
370
  components = cls(component_list)
362
371
 
363
- # Check the flavour ratios
364
- if config.sampl_cfg and config.sampl_cfg.method is not None:
372
+ # Check the flavour ratios (not meaningful when resampling is skipped)
373
+ if not config.skip_resampling:
365
374
  components.check_flavour_ratios()
366
375
 
367
376
  return components
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
+ def _default_variable_labels() -> dict[str, str]:
7
+ return {
8
+ "pt": "Jet $p_\\mathrm{T}$ [GeV]",
9
+ "eta": "Jet $|\\eta|$",
10
+ "mass": "Jet Mass [GeV]",
11
+ }
12
+
13
+
14
+ def _default_sample_labels() -> dict[str, str]:
15
+ return {
16
+ "ttbar": "$t\\bar{t}$",
17
+ "zprime": "$Z'$",
18
+ }
19
+
20
+
21
+ @dataclass
22
+ class PlottingConfig:
23
+ r"""
24
+ Options for the preprocessing resampling distribution plots.
25
+
26
+ These options are specified in the config file under the `plotting:` key.
27
+ Any omitted option uses the default defined by this class.
28
+
29
+ Attributes
30
+ ----------
31
+ num_jets_plotting : int | None, optional
32
+ Number of jets loaded for plotting. If not set, use the global
33
+ `num_jets_estimate_plotting` value. By default None.
34
+ variable_labels : dict[str, str], optional
35
+ Display labels for plotted variables. Keys are matched case-insensitively
36
+ against variable names, with the longest matching key taking precedence.
37
+ User-provided labels are merged with the default pT, eta, and mass labels.
38
+ sample_labels : dict[str, str], optional
39
+ Display labels for input samples. User-provided labels are merged with the
40
+ default ttbar and zprime labels.
41
+ ylabel : str, optional
42
+ Label for the y-axis. The `{jets_name}` placeholder is replaced with the
43
+ configured jet dataset name. By default "Normalised Number of {jets_name}".
44
+ atlas_first_tag : str, optional
45
+ First ATLAS plot label. By default "Simulation Internal".
46
+ atlas_second_tag : str, optional
47
+ Second ATLAS plot label. By default "$\\sqrt{s} = 13/13.6\\,\\mathrm{TeV}$".
48
+ show_num_jets : bool, optional
49
+ Whether the number of jets is shown in the ATLAS second tag. By default True.
50
+ output_formats : list[str], optional
51
+ File formats in which each plot is saved. By default `["pdf", "png"]`.
52
+ linestyles : list[str], optional
53
+ Linestyles used to distinguish input samples. By default
54
+ `["-", "--", "-.", ":"]`.
55
+ bins : int, optional
56
+ Number of histogram bins. By default 50.
57
+ norm : bool, optional
58
+ Normalise each histogram before plotting. By default True.
59
+ underoverflow : bool, optional
60
+ Include underflow and overflow values in the edge bins. By default True.
61
+ y_scale : float, optional
62
+ Scale factor applied to the automatically determined y-axis range.
63
+ By default 1.5.
64
+ figsize : list[float], optional
65
+ Figure width and height. By default `[6, 4]`.
66
+ logy : bool, optional
67
+ Use a logarithmic y-axis. By default True.
68
+ legend_location : str, optional
69
+ Location of the flavour legend. By default "upper right".
70
+ linestyle_legend_location : str, optional
71
+ Location of the sample-linestyle legend. By default "upper center".
72
+ linestyle_legend_anchor : list[float], optional
73
+ Anchor position of the sample-linestyle legend. By default `[0.55, 1]`.
74
+ output_directory : str, optional
75
+ Plot directory relative to the preprocessing output directory.
76
+ By default "plots".
77
+ """
78
+
79
+ num_jets_plotting: int | None = None
80
+ variable_labels: dict[str, str] = field(default_factory=_default_variable_labels)
81
+ sample_labels: dict[str, str] = field(default_factory=_default_sample_labels)
82
+ ylabel: str = "Normalised Number of {jets_name}"
83
+ atlas_first_tag: str = "Simulation Internal"
84
+ atlas_second_tag: str = "$\\sqrt{s} = 13/13.6\\,\\mathrm{TeV}$"
85
+ show_num_jets: bool = True
86
+ output_formats: list[str] = field(default_factory=lambda: ["pdf", "png"])
87
+ linestyles: list[str] = field(default_factory=lambda: ["-", "--", "-.", ":"])
88
+ bins: int = 50
89
+ norm: bool = True
90
+ underoverflow: bool = True
91
+ y_scale: float = 1.5
92
+ figsize: list[float] = field(default_factory=lambda: [6, 4])
93
+ logy: bool = True
94
+ legend_location: str = "upper right"
95
+ linestyle_legend_location: str = "upper center"
96
+ linestyle_legend_anchor: list[float] = field(default_factory=lambda: [0.55, 1])
97
+ output_directory: str = "plots"
98
+
99
+ def __post_init__(self) -> None:
100
+ self.variable_labels = {**_default_variable_labels(), **self.variable_labels}
101
+ self.sample_labels = {**_default_sample_labels(), **self.sample_labels}
102
+ if self.num_jets_plotting is not None and self.num_jets_plotting <= 0:
103
+ raise ValueError("plotting.num_jets_plotting must be a positive integer or None")
104
+ if not self.output_formats:
105
+ raise ValueError("plotting.output_formats must contain at least one format")
106
+ if not self.linestyles:
107
+ raise ValueError("plotting.linestyles must contain at least one linestyle")
108
+
109
+ def variable_label(self, variable: str) -> str:
110
+ """Return the configured display label for a variable."""
111
+ variable_lower = variable.lower()
112
+ for name, label in sorted(
113
+ self.variable_labels.items(), key=lambda item: len(item[0]), reverse=True
114
+ ):
115
+ if name.lower() in variable_lower:
116
+ return label
117
+ return variable
118
+
119
+ def sample_label(self, sample: str) -> str:
120
+ """Return the configured display label for a sample."""
121
+ return self.sample_labels.get(sample, sample)
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  import dataclasses
4
4
  import functools
5
5
  import logging as log
6
+ import subprocess
6
7
  from copy import copy
7
8
  from dataclasses import dataclass
8
9
  from pathlib import Path
@@ -19,6 +20,7 @@ from yamlinclude import YamlIncludeConstructor
19
20
 
20
21
  from upp import __version__
21
22
  from upp.classes.components import Components
23
+ from upp.classes.plotting_config import PlottingConfig
22
24
  from upp.classes.resampling_config import ResamplingConfig
23
25
  from upp.classes.reweight_config import ReweightConfig
24
26
  from upp.classes.variable_config import VariableConfig
@@ -119,6 +121,9 @@ class PreprocessingConfig:
119
121
  Skip checks for the input files. This is used for grid submission
120
122
  skip_config_copy : bool, optional
121
123
  Decide, if the config copying is skipped or not. By default False
124
+ vds_dir : Path | None, optional
125
+ Directory name for creation of virtual datasets. By default None.
126
+ If None is given, the virtual datasets are created next to the input ntuples.
122
127
  """
123
128
 
124
129
  config_path: Path
@@ -142,6 +147,7 @@ class PreprocessingConfig:
142
147
  num_jets_per_output_file: int | None = None
143
148
  skip_checks: bool = False
144
149
  skip_config_copy: bool = False
150
+ vds_dir: Path | None = None
145
151
 
146
152
  def __post_init__(self):
147
153
  # postprocess paths
@@ -158,6 +164,9 @@ class PreprocessingConfig:
158
164
  for field in dataclasses.fields(self):
159
165
  if field.type == "Path" and field.name != "out_fname" and field.name != "base_dir":
160
166
  setattr(self, field.name, self.get_path(Path(getattr(self, field.name))))
167
+ # vds_dir is optional (Path | None), so the loop above skips it; resolve it here
168
+ if self.vds_dir is not None:
169
+ self.vds_dir = self.get_path(Path(self.vds_dir))
161
170
  if not self.ntuple_dir.exists() and not self.skip_checks:
162
171
  raise FileNotFoundError(f"Path {self.ntuple_dir} does not exist")
163
172
  self.components_dir = self.components_dir / self.split
@@ -202,7 +211,7 @@ class PreprocessingConfig:
202
211
  self.variables = VariableConfig(
203
212
  self.config["variables"], self.jets_name, self.is_test, selectors
204
213
  )
205
- if self.sampl_cfg is not None:
214
+ if self.sampl_cfg is not None and self.sampl_cfg.variables:
206
215
  self.variables = self.variables.add_jet_vars(
207
216
  list(self.config["resampling"]["variables"].keys()), "labels"
208
217
  )
@@ -217,8 +226,19 @@ class PreprocessingConfig:
217
226
  if "reweighting" in self.config
218
227
  else None
219
228
  )
229
+ self.plotting = PlottingConfig(**self.config.get("plotting", {}))
230
+ if self.plotting.num_jets_plotting is None:
231
+ self.plotting.num_jets_plotting = self.num_jets_estimate_plotting
232
+
220
233
  # reproducibility
221
- self.git_hash = get_git_hash(Path(__file__).parent)
234
+ try:
235
+ self.git_hash = get_git_hash(Path(__file__).parent)
236
+ except (OSError, subprocess.CalledProcessError):
237
+ log.warning(
238
+ "Could not determine the git hash (is git installed and on PATH?); "
239
+ "using the UPP version for reproducibility metadata instead."
240
+ )
241
+ self.git_hash = None
222
242
  if self.git_hash is None:
223
243
  self.git_hash = __version__
224
244
  self.config["upp_hash"] = self.git_hash
@@ -255,12 +275,36 @@ class PreprocessingConfig:
255
275
  def is_test(self):
256
276
  return self.split == "test"
257
277
 
278
+ @property
279
+ def skip_resampling(self) -> bool:
280
+ """Return whether resampling is disabled (no block, or method none).
281
+
282
+ Returns
283
+ -------
284
+ bool
285
+ ``True`` if resampling should be skipped.
286
+ """
287
+ return self.sampl_cfg is None or self.sampl_cfg.method in (None, "none")
288
+
289
+ @property
290
+ def resampling_method(self) -> str:
291
+ """Resampling method recorded in the output metadata ("none" if skipped).
292
+
293
+ Returns
294
+ -------
295
+ str
296
+ The resampling method (e.g. ``"pdf"``/``"countup"``), or ``"none"``.
297
+ """
298
+ if self.skip_resampling:
299
+ return "none"
300
+ return self.sampl_cfg.method
301
+
258
302
  @functools.cached_property
259
303
  def global_cuts(self):
260
304
  cuts_list = self.config["global_cuts"].get("common", [])
261
305
  cuts_list += self.config["global_cuts"][self.split]
262
306
  if not self.is_test and self.config.get("resampling", None) is not None:
263
- for resampling_var, cfg in self.config["resampling"]["variables"].items():
307
+ for resampling_var, cfg in self.config["resampling"].get("variables", {}).items():
264
308
  cuts_list.append([resampling_var, ">", cfg["bins"][0][0]])
265
309
  cuts_list.append([resampling_var, "<", cfg["bins"][-1][1]])
266
310
  return Cuts.from_list(cuts_list)
@@ -1,14 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
3
+ from dataclasses import dataclass, field
4
4
 
5
5
  import numpy as np
6
6
 
7
7
 
8
8
  @dataclass
9
9
  class ResamplingConfig:
10
- variables: dict
11
- target: str
10
+ # variables/target are only needed for pdf/countup resampling; optional when skipping
11
+ variables: dict = field(default_factory=dict)
12
+ target: str | None = None
12
13
  sampling_fraction: float = 1.0
13
14
  method: str | None = None
14
15
  upscale_pdf: int | None = None
@@ -1,12 +1,14 @@
1
1
  """
2
- Preprocessing pipeline for jet taggging.
2
+ Preprocessing pipeline for jet tagging.
3
3
 
4
4
  By default all stages for the training split are run.
5
5
  To run with only specific stages enabled, include the flag for the required stages.
6
6
  To run without certain stages, include the corresponding negative flag.
7
7
 
8
- Note that all stages are required to run the pipeline. If you want to disable resampling,
9
- you need to set method: none in your config file.
8
+ To disable resampling, omit the `resampling` block or set `method: none`. The jets passing
9
+ the cuts are then written directly, capped at each component's `num_jets` (use `num_jets: -1`
10
+ to keep all of them). The `--no-resample` flag only skips the resampling *stage* (e.g. to
11
+ re-run later stages); it does not disable resampling.
10
12
  """
11
13
 
12
14
  from __future__ import annotations
@@ -129,7 +131,11 @@ def parse_args(args: Any) -> argparse.Namespace:
129
131
  "--reweight", "--rw", action="store_true", default=False, help="Run the reweighting stage"
130
132
  )
131
133
  parser.add_argument(
132
- "--rw-merge", "--rwm", action="store_true", default=False, help="Run the reweighting stage"
134
+ "--rw-merge",
135
+ "--rwm",
136
+ action="store_true",
137
+ default=False,
138
+ help="Run the reweighting merge stage",
133
139
  )
134
140
  parser.add_argument(
135
141
  "--rw-merge-idx",
@@ -137,7 +143,7 @@ def parse_args(args: Any) -> argparse.Namespace:
137
143
  type=str,
138
144
  default=None,
139
145
  help=(
140
- "Commar seperated pair of indices representing the range of output "
146
+ "Comma-separated pair of indices representing the range of output "
141
147
  "files to create, e.g '0,10' will create files 0 to 9"
142
148
  ),
143
149
  )
@@ -149,7 +155,7 @@ def parse_args(args: Any) -> argparse.Namespace:
149
155
  parser.add_argument(
150
156
  "--skip-sample-check",
151
157
  action="store_true",
152
- help="Skip the inital input sample check",
158
+ help="Skip the initial input sample check",
153
159
  )
154
160
  parser.add_argument(
155
161
  "--grid", action="store_true", help="Use when running the split stage on the grid. "
@@ -231,7 +237,7 @@ def run_pp(args: argparse.Namespace) -> None:
231
237
  verbose=True,
232
238
  )
233
239
 
234
- if args.split == "train":
240
+ if args.split == "train" and not config.skip_resampling:
235
241
  create_histograms(
236
242
  config=config,
237
243
  component_to_run=args.component,
@@ -41,8 +41,9 @@ def bin_jets(array: dict, bins: list) -> tuple[np.ndarray, np.ndarray]:
41
41
  bin in which this observation falls. The representation depends on the
42
42
  `expand_binnumbers` argument. See `Notes` for details.
43
43
  """
44
+ sample = s2u(array).astype(np.float64, copy=False)
44
45
  hist, _, out_bins = binned_statistic_dd(
45
- sample=s2u(array),
46
+ sample=sample,
46
47
  values=None,
47
48
  statistic="count",
48
49
  bins=bins,
@@ -145,6 +146,9 @@ def create_histograms(
145
146
  """
146
147
  # Setup the logger and load the variables used for resampling
147
148
  setup_logger()
149
+ if config.skip_resampling:
150
+ log.info("Resampling is disabled - skipping histogram/PDF creation.")
151
+ return
148
152
  sampl_vars = config.sampl_cfg.vars
149
153
 
150
154
  title = " Writing PDFs "