umami-preprocessing 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/PKG-INFO +10 -10
  2. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/pyproject.toml +8 -8
  3. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/umami_preprocessing.egg-info/PKG-INFO +10 -10
  4. umami_preprocessing-0.2.4/umami_preprocessing.egg-info/requires.txt +15 -0
  5. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/__init__.py +1 -1
  6. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/classes/preprocessing_config.py +34 -16
  7. umami_preprocessing-0.2.4/upp/logger.py +76 -0
  8. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/main.py +15 -6
  9. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/stages/hist.py +47 -6
  10. umami_preprocessing-0.2.4/upp/stages/merging.py +307 -0
  11. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/stages/normalisation.py +9 -2
  12. umami_preprocessing-0.2.4/upp/stages/plot.py +198 -0
  13. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/stages/resampling.py +192 -74
  14. umami_preprocessing-0.2.2/umami_preprocessing.egg-info/requires.txt +0 -15
  15. umami_preprocessing-0.2.2/upp/logger.py +0 -39
  16. umami_preprocessing-0.2.2/upp/stages/merging.py +0 -176
  17. umami_preprocessing-0.2.2/upp/stages/plot.py +0 -325
  18. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/README.md +0 -0
  19. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/setup.cfg +0 -0
  20. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/umami_preprocessing.egg-info/SOURCES.txt +0 -0
  21. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/umami_preprocessing.egg-info/dependency_links.txt +0 -0
  22. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/umami_preprocessing.egg-info/entry_points.txt +0 -0
  23. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/umami_preprocessing.egg-info/top_level.txt +0 -0
  24. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/classes/__init__.py +0 -0
  25. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/classes/components.py +0 -0
  26. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/classes/region.py +0 -0
  27. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/classes/resampling_config.py +0 -0
  28. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/classes/variable_config.py +0 -0
  29. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/stages/__init__.py +0 -0
  30. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/stages/interpolation.py +0 -0
  31. {umami_preprocessing-0.2.2 → umami_preprocessing-0.2.4}/upp/utils.py +0 -0
@@ -1,25 +1,25 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: umami-preprocessing
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Preprocessing for jet tagging
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
7
7
  Requires-Python: <3.12,>=3.8
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pyyaml-include==1.3
10
- Requires-Dist: PyYAML==6.0.1
10
+ Requires-Dist: PyYAML>=6.0.1
11
11
  Requires-Dist: rich==12.6.0
12
- Requires-Dist: scipy==1.10.1
13
- Requires-Dist: puma-hep==0.4.2
14
- Requires-Dist: atlas-ftag-tools==0.2.8
12
+ Requires-Dist: scipy>=1.15.2
13
+ Requires-Dist: puma-hep==0.4.5
14
+ Requires-Dist: atlas-ftag-tools==0.2.10
15
15
  Requires-Dist: dotmap==1.3.30
16
16
  Provides-Extra: dev
17
- Requires-Dist: ruff==0.1.6; extra == "dev"
18
- Requires-Dist: mypy==1.5.1; extra == "dev"
17
+ Requires-Dist: ruff==0.6.2; extra == "dev"
18
+ Requires-Dist: mypy==1.11.2; extra == "dev"
19
19
  Requires-Dist: pre-commit==3.5.0; extra == "dev"
20
- Requires-Dist: pytest>=7.0.1; extra == "dev"
20
+ Requires-Dist: pytest>=7.2.2; extra == "dev"
21
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
21
22
  Requires-Dist: pytest-mock==3.11.1; extra == "dev"
22
- Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
23
23
 
24
24
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
25
25
  [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)
@@ -8,22 +8,22 @@ requires-python = "<3.12,>=3.8"
8
8
 
9
9
  dependencies = [
10
10
  "pyyaml-include==1.3",
11
- "PyYAML==6.0.1",
11
+ "PyYAML>=6.0.1",
12
12
  "rich==12.6.0",
13
- "scipy==1.10.1",
14
- "puma-hep==0.4.2",
15
- "atlas-ftag-tools==0.2.8",
13
+ "scipy>=1.15.2",
14
+ "puma-hep==0.4.5",
15
+ "atlas-ftag-tools==0.2.10",
16
16
  "dotmap==1.3.30"
17
17
  ]
18
18
 
19
19
  [project.optional-dependencies]
20
20
  dev = [
21
- "ruff==0.1.6",
22
- "mypy==1.5.1",
21
+ "ruff==0.6.2",
22
+ "mypy==1.11.2",
23
23
  "pre-commit==3.5.0",
24
- "pytest>=7.0.1",
24
+ "pytest>=7.2.2",
25
+ "pytest-cov>=4.0.0",
25
26
  "pytest-mock==3.11.1",
26
- "pytest-cov>=3.0.0",
27
27
  ]
28
28
 
29
29
  [project.urls]
@@ -1,25 +1,25 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: umami-preprocessing
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Preprocessing for jet tagging
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
7
7
  Requires-Python: <3.12,>=3.8
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pyyaml-include==1.3
10
- Requires-Dist: PyYAML==6.0.1
10
+ Requires-Dist: PyYAML>=6.0.1
11
11
  Requires-Dist: rich==12.6.0
12
- Requires-Dist: scipy==1.10.1
13
- Requires-Dist: puma-hep==0.4.2
14
- Requires-Dist: atlas-ftag-tools==0.2.8
12
+ Requires-Dist: scipy>=1.15.2
13
+ Requires-Dist: puma-hep==0.4.5
14
+ Requires-Dist: atlas-ftag-tools==0.2.10
15
15
  Requires-Dist: dotmap==1.3.30
16
16
  Provides-Extra: dev
17
- Requires-Dist: ruff==0.1.6; extra == "dev"
18
- Requires-Dist: mypy==1.5.1; extra == "dev"
17
+ Requires-Dist: ruff==0.6.2; extra == "dev"
18
+ Requires-Dist: mypy==1.11.2; extra == "dev"
19
19
  Requires-Dist: pre-commit==3.5.0; extra == "dev"
20
- Requires-Dist: pytest>=7.0.1; extra == "dev"
20
+ Requires-Dist: pytest>=7.2.2; extra == "dev"
21
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
21
22
  Requires-Dist: pytest-mock==3.11.1; extra == "dev"
22
- Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
23
23
 
24
24
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
25
25
  [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)
@@ -0,0 +1,15 @@
1
+ pyyaml-include==1.3
2
+ PyYAML>=6.0.1
3
+ rich==12.6.0
4
+ scipy>=1.15.2
5
+ puma-hep==0.4.5
6
+ atlas-ftag-tools==0.2.10
7
+ dotmap==1.3.30
8
+
9
+ [dev]
10
+ ruff==0.6.2
11
+ mypy==1.11.2
12
+ pre-commit==3.5.0
13
+ pytest>=7.2.2
14
+ pytest-cov>=4.0.0
15
+ pytest-mock==3.11.1
@@ -2,4 +2,4 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "v0.2.2"
5
+ __version__ = "v0.2.4"
@@ -53,42 +53,56 @@ class PreprocessingConfig:
53
53
 
54
54
  Parameters
55
55
  ----------
56
+ config_path : Path
57
+ Path to the config yaml file that is used. Does not need to be set in config.
58
+ split : Split
59
+ For which part the preprocessing is run. Either train, val or test. This needs
60
+ to be set as a command line argument when running the program. Does not need
61
+ to be set in config.
62
+ config : dict
63
+ Dict with the loaded config. Does not need to be set in config.
56
64
  base_dir : Path
57
65
  Base directory for all other paths.
58
- ntuple_dir : Path
66
+ ntuple_dir : Path, optional
59
67
  Directory containing the input h5 ntuples. If a relative path is given, it is
60
- interpreted as relative to base_dir.
61
- components_dir : Path
68
+ interpreted as relative to base_dir. By default Path("ntuples")
69
+ components_dir : Path, optional
62
70
  Directory for intermediate component files. If a relative path is given, it is
63
- interpreted as relative to base_dir.
64
- out_dir : Path
71
+ interpreted as relative to base_dir. By default Path("components")
72
+ out_dir : Path, optional
65
73
  Directory for output files. If a relative path is given, it is interpreted as
66
- relative to base_dir.
67
- out_fname : Path
68
- Filename stem for the output files.
69
- batch_size : int
74
+ relative to base_dir. By default Path("output")
75
+ out_fname : Path, optional
76
+ Filename stem for the output files. By default Path("pp_output.h5")
77
+ batch_size : int, optional
70
78
  Batch size for the preprocessing. For each batch select
71
79
  `sampling_fraction*batch_size_after_cuts`. It is recommended to choose high batch sizes
72
80
  especially for the `countup` method to achieve best agreement of target and resampled
73
- distributions.
74
- num_jets_estimate : int
81
+ distributions. By default 100_000
82
+ num_jets_estimate : int, optional
75
83
  Any of the further three arguments that are not specified will default to this value
76
84
  Is equal to 1_000_000 by default.
77
- num_jets_estimate_available : int | None
85
+ num_jets_estimate_available : int, optional
78
86
  A subsample taken from the whole sample to estimate the number of jets after the cuts.
79
87
  Please keep this number high in order to not get a Poisson error of more than 5%.
80
88
  If time allows you can use -1 to get a precise number of jets and not just an estimate
81
89
  although it will be slow for large datasets. Is equal to num_jets_estimate by default.
82
- num_jets_estimate_hist : int
90
+ num_jets_estimate_hist : int, optional
83
91
  Number of jets of each flavour that are used to construct histograms for probability
84
92
  density function estimation. Larger numbers give a better quality estimate of the pdfs.
85
93
  Is equal to num_jets_estimate by default.
86
- num_jets_estimate_norm : int
94
+ num_jets_estimate_norm : int, optional
87
95
  Number of jets of each flavour that are used to estimate shifting and scaling during
88
96
  normalisation step. Larger numbers give better quality estimates.
89
97
  Is equal to num_jets_estimate by default.
90
- jets_name : str
91
- Name of the jets dataset in the input file.
98
+ num_jets_estimate_plotting : int, optional
99
+ Number of jets of each flavour used for plotting the initial and the final resampling
100
+ variable distributions. Larger numbers give a better estimate of the full distributions.
101
+ Is equal to num_jets_estimate by default.
102
+ merge_test_samples : bool, optional
103
+ Merge the test samples of the different processes into one file. By default False.
104
+ jets_name : str, optional
105
+ Name of the jets dataset in the input file. By default "jets".
92
106
  """
93
107
 
94
108
  config_path: Path
@@ -104,9 +118,11 @@ class PreprocessingConfig:
104
118
  num_jets_estimate_available: int | None = None
105
119
  num_jets_estimate_hist: int | None = None
106
120
  num_jets_estimate_norm: int | None = None
121
+ num_jets_estimate_plotting: int | None = None
107
122
  merge_test_samples: bool = False
108
123
  jets_name: str = "jets"
109
124
  flavour_config: Path | None = None
125
+ num_jets_per_output_file: int | None = None
110
126
 
111
127
  def __post_init__(self):
112
128
  # postprocess paths
@@ -117,6 +133,8 @@ class PreprocessingConfig:
117
133
  self.num_jets_estimate_hist = self.num_jets_estimate
118
134
  if self.num_jets_estimate_norm is None:
119
135
  self.num_jets_estimate_norm = self.num_jets_estimate
136
+ if self.num_jets_estimate_plotting is None:
137
+ self.num_jets_estimate_plotting = self.num_jets_estimate
120
138
 
121
139
  for field in dataclasses.fields(self):
122
140
  if field.type == "Path" and field.name != "out_fname" and field.name != "base_dir":
@@ -0,0 +1,76 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import sys
5
+ from functools import partial
6
+
7
+ from rich.console import Console
8
+ from rich.logging import RichHandler
9
+ from rich.progress import (
10
+ BarColumn,
11
+ Progress,
12
+ TextColumn,
13
+ TimeElapsedColumn,
14
+ TimeRemainingColumn,
15
+ )
16
+
17
+ # Detect if the program is executed in an interactive terminal
18
+ _IS_TTY = sys.stderr.isatty()
19
+
20
+ # One console object is reused everywhere so that Rich keeps a consistent idea
21
+ # of whether it may emit ANSI control codes / animations.
22
+ _console = Console(
23
+ width=100,
24
+ force_terminal=_IS_TTY,
25
+ force_interactive=_IS_TTY,
26
+ no_color=not _IS_TTY,
27
+ )
28
+
29
+ # Template for the progress bar
30
+ ProgressBar = partial(
31
+ Progress,
32
+ TextColumn("[task.description]{task.description}"),
33
+ BarColumn(),
34
+ TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
35
+ TextColumn("•"),
36
+ TimeRemainingColumn(),
37
+ TextColumn("•"),
38
+ TimeElapsedColumn(),
39
+ refresh_per_second=1 if _IS_TTY else 0.05,
40
+ speed_estimate_period=30 if _IS_TTY else 120,
41
+ console=_console,
42
+ disable=not _IS_TTY,
43
+ transient=_IS_TTY,
44
+ )
45
+
46
+
47
+ # Helper for setting up the logger
48
+ def setup_logger(level: str = "INFO"):
49
+ """Set up the logger.
50
+
51
+ Configure Rich logging so that colourful / interactive output is used when
52
+ the program is attached to a terminal and plain text is written when it is
53
+ executed under a batch system such as Slurm (where stdout / stderr are files).
54
+ """
55
+ FORMAT = "%(message)s"
56
+
57
+ # In a batch job we create a console that never emits colour codes.
58
+ console = None
59
+ if not _IS_TTY:
60
+ console = Console(
61
+ width=120,
62
+ force_terminal=False,
63
+ force_interactive=False,
64
+ no_color=True,
65
+ )
66
+
67
+ handler = RichHandler(
68
+ show_time=False,
69
+ show_path=False,
70
+ markup=True,
71
+ rich_tracebacks=True,
72
+ console=console,
73
+ )
74
+
75
+ logging.basicConfig(level=level, format=FORMAT, handlers=[handler])
76
+ return logging
@@ -21,7 +21,7 @@ from upp.logger import setup_logger
21
21
  from upp.stages.hist import create_histograms
22
22
  from upp.stages.merging import Merging
23
23
  from upp.stages.normalisation import Normalisation
24
- from upp.stages.plot import plot_initial_resampling_dists, plot_resampled_dists
24
+ from upp.stages.plot import plot_resampling_dists
25
25
  from upp.stages.resampling import Resampling
26
26
 
27
27
 
@@ -41,10 +41,16 @@ def parse_args(args):
41
41
  parser.add_argument("--no-plot", dest="plot", action="store_false")
42
42
  splits = ["train", "val", "test", "all"]
43
43
  parser.add_argument("--split", default="train", choices=splits, help="Which file to produce")
44
+ parser.add_argument(
45
+ "--component", default=None, help="Component which is processed during --prep"
46
+ )
47
+ parser.add_argument(
48
+ "--region", default=None, help="Region which is processed during --resample"
49
+ )
44
50
 
45
51
  args = parser.parse_args(args)
46
52
  d = vars(args)
47
- ignore = ["config", "split"]
53
+ ignore = ["config", "split", "component", "region"]
48
54
  if not any(v for a, v in d.items() if a not in ignore):
49
55
  for v in d:
50
56
  if v not in ignore and d[v] is None:
@@ -65,12 +71,15 @@ def run_pp(args) -> None:
65
71
 
66
72
  # create virtual datasets and pdf files
67
73
  if args.prep and args.split == "train":
68
- create_histograms(config)
74
+ create_histograms(
75
+ config=config,
76
+ component_to_run=args.component,
77
+ )
69
78
 
70
79
  # run the resampling
71
80
  if args.resample:
72
81
  resampling = Resampling(config)
73
- resampling.run()
82
+ resampling.run(region=args.region, component=args.component)
74
83
 
75
84
  # run the merging
76
85
  if args.merge:
@@ -86,8 +95,8 @@ def run_pp(args) -> None:
86
95
  if args.plot:
87
96
  title = " Plotting "
88
97
  log.info(f"[bold green]{title:-^100}")
89
- plot_initial_resampling_dists(config=config)
90
- plot_resampled_dists(config=config, stage=args.split)
98
+ plot_resampling_dists(config=config, stage="initial")
99
+ plot_resampling_dists(config=config, stage=args.split)
91
100
 
92
101
  # print end info
93
102
  end = datetime.now()
@@ -5,6 +5,7 @@ import logging as log
5
5
  import math
6
6
  from dataclasses import dataclass
7
7
  from pathlib import Path
8
+ from typing import TYPE_CHECKING
8
9
 
9
10
  import h5py
10
11
  import numpy as np
@@ -13,6 +14,9 @@ from scipy.stats import binned_statistic_dd
13
14
 
14
15
  from upp.logger import setup_logger
15
16
 
17
+ if TYPE_CHECKING: # pragma: no cover
18
+ from upp.classes.preprocessing_config import PreprocessingConfig
19
+
16
20
 
17
21
  def bin_jets(array: dict, bins: list) -> np.ndarray:
18
22
  """Create the histogram and bins for the given resampling variables.
@@ -117,24 +121,39 @@ class Hist:
117
121
  return f["pbin"][:]
118
122
 
119
123
 
120
- def create_histograms(config) -> None:
124
+ def create_histograms(
125
+ config: PreprocessingConfig,
126
+ component_to_run: str | None = None,
127
+ ) -> None:
121
128
  """Create the virtual datasets and pdf files.
122
129
 
123
130
  Parameters
124
131
  ----------
125
132
  config : PreprocessingConfig object
126
133
  PreprocessingConfig object of the current preprocessing.
134
+ component_to_run : str | None
135
+ Component which should be run. By default (None), all components
136
+ are processed sequentially.
127
137
  """
138
+ # Setup the logger and load the variables used for resampling
128
139
  setup_logger()
140
+ sampl_vars = config.sampl_cfg.vars
129
141
 
130
142
  title = " Writing PDFs "
131
143
  log.info(f"[bold green]{title:-^100}")
132
-
133
144
  log.info(f"[bold green]Estimating PDFs using {config.num_jets_estimate_hist:,} jets...")
134
- sampl_vars = config.sampl_cfg.vars
145
+
146
+ # Create check variable to ensure at least one component was processed
147
+ component_processed = not component_to_run
148
+
149
+ # Process the different components
135
150
  for component in config.components:
151
+ # Check if only one component should be processed
152
+ if isinstance(component_to_run, str) and component_to_run != component.name:
153
+ continue
154
+
136
155
  log.info(f"Estimating {component} PDF using {config.num_jets_estimate_hist:,} samples...")
137
- component.setup_reader(config.batch_size, config.jets_name)
156
+ component.setup_reader(batch_size=config.batch_size, jets_name=config.jets_name)
138
157
  cuts_no_split = component.cuts.ignore(["eventNumber"])
139
158
 
140
159
  ###
@@ -146,7 +165,29 @@ def create_histograms(config) -> None:
146
165
  silent=False,
147
166
  raise_error=False,
148
167
  )
149
- jets = component.get_jets(sampl_vars, config.num_jets_estimate_hist, cuts_no_split)
150
- component.hist.write_hist(jets, sampl_vars, config.sampl_cfg.flat_bins)
168
+
169
+ # Load the jets from file used for resampling
170
+ jets = component.get_jets(
171
+ variables=sampl_vars,
172
+ num_jets=config.num_jets_estimate_hist,
173
+ cuts=cuts_no_split,
174
+ )
175
+
176
+ # Write out the hist used for resampling
177
+ component.hist.write_hist(
178
+ jets=jets,
179
+ resampling_vars=sampl_vars,
180
+ bins=config.sampl_cfg.flat_bins,
181
+ )
182
+
183
+ # Set the check variable to true
184
+ component_processed = True
185
+
186
+ # Raise an error if no component was processed
187
+ if component_processed is False:
188
+ raise ValueError(
189
+ "No component processed during resampling! Check that you correctly spelled "
190
+ "the component name when running with --component!"
191
+ )
151
192
 
152
193
  log.info(f"[bold green]Saved to {config.components[0].hist.path.parent}/")