legend-dataflow-scripts 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. legend_dataflow_scripts-0.1.0.dist-info/METADATA +57 -0
  2. legend_dataflow_scripts-0.1.0.dist-info/RECORD +36 -0
  3. legend_dataflow_scripts-0.1.0.dist-info/WHEEL +5 -0
  4. legend_dataflow_scripts-0.1.0.dist-info/entry_points.txt +18 -0
  5. legend_dataflow_scripts-0.1.0.dist-info/top_level.txt +1 -0
  6. legenddataflowscripts/__init__.py +17 -0
  7. legenddataflowscripts/_version.py +21 -0
  8. legenddataflowscripts/par/__init__.py +0 -0
  9. legenddataflowscripts/par/geds/__init__.py +0 -0
  10. legenddataflowscripts/par/geds/dsp/__init__.py +0 -0
  11. legenddataflowscripts/par/geds/dsp/dplms.py +145 -0
  12. legenddataflowscripts/par/geds/dsp/eopt.py +398 -0
  13. legenddataflowscripts/par/geds/dsp/evtsel.py +400 -0
  14. legenddataflowscripts/par/geds/dsp/nopt.py +120 -0
  15. legenddataflowscripts/par/geds/dsp/pz.py +217 -0
  16. legenddataflowscripts/par/geds/dsp/svm.py +28 -0
  17. legenddataflowscripts/par/geds/dsp/svm_build.py +69 -0
  18. legenddataflowscripts/par/geds/hit/__init__.py +0 -0
  19. legenddataflowscripts/par/geds/hit/aoe.py +245 -0
  20. legenddataflowscripts/par/geds/hit/ecal.py +778 -0
  21. legenddataflowscripts/par/geds/hit/lq.py +213 -0
  22. legenddataflowscripts/par/geds/hit/qc.py +326 -0
  23. legenddataflowscripts/tier/__init__.py +0 -0
  24. legenddataflowscripts/tier/dsp.py +263 -0
  25. legenddataflowscripts/tier/hit.py +148 -0
  26. legenddataflowscripts/utils/__init__.py +15 -0
  27. legenddataflowscripts/utils/alias_table.py +28 -0
  28. legenddataflowscripts/utils/cfgtools.py +14 -0
  29. legenddataflowscripts/utils/convert_np.py +31 -0
  30. legenddataflowscripts/utils/log.py +77 -0
  31. legenddataflowscripts/utils/pulser_removal.py +16 -0
  32. legenddataflowscripts/workflow/__init__.py +20 -0
  33. legenddataflowscripts/workflow/execenv.py +327 -0
  34. legenddataflowscripts/workflow/filedb.py +107 -0
  35. legenddataflowscripts/workflow/pre_compile_catalog.py +24 -0
  36. legenddataflowscripts/workflow/utils.py +113 -0
@@ -0,0 +1,327 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import shlex
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ from collections.abc import Iterable, Mapping
11
+ from pathlib import Path
12
+
13
+ import colorlog
14
+ import dbetto
15
+ from dbetto import AttrsDict
16
+
17
+ from . import utils
18
+
19
+ log = logging.getLogger(__name__)
20
+
21
+
22
+ def _execenv2str(cmd_expr: Iterable, cmd_env: Mapping) -> str:
23
+ return " ".join([f"{k}={v}" for k, v in cmd_env.items()]) + " " + " ".join(cmd_expr)
24
+
25
+
26
def apptainer_env_vars(cmdenv: Mapping) -> list[str]:
    """Translate an environment mapping into apptainer ``--env`` flags."""
    flags = []
    for name, value in cmdenv.items():
        flags.append(f"--env={name}={value}")
    return flags
28
+
29
+
30
def docker_env_vars(cmdenv: Mapping) -> list[str]:
    """Translate an environment mapping into docker ``--env`` flags.

    Docker uses the same ``--env=VAR=VAL`` syntax as apptainer.
    """
    return [f"--env={var}={val}" for var, val in cmdenv.items()]
33
+
34
+
35
def shifter_env_vars(cmdenv: Mapping) -> list[str]:
    """Translate an environment mapping into shifter ``--env`` flags.

    Shifter uses the same ``--env=VAR=VAL`` syntax as apptainer.
    """
    return [f"--env={var}={val}" for var, val in cmdenv.items()]
38
+
39
+
40
def execenv_prefix(
    config: AttrsDict, as_string: bool = True
) -> str | tuple[list, dict]:
    """Returns the software environment command prefix.

    For example: `apptainer run image.sif`

    Note
    ----
    If `as_string` is True, a space is appended to the returned string.
    """
    config = AttrsDict(config)

    cmdline = []
    cmdenv = {}
    # environment variables to export into the execution environment
    if "execenv" in config and "env" in config.execenv:
        cmdenv |= config.execenv.env

    # a command prefix is only built when both "cmd" and "arg" are configured
    if "execenv" in config and "cmd" in config.execenv and "arg" in config.execenv:
        cmdline = shlex.split(config.execenv.cmd)

        has_xdg = False
        xdg_runtime_dir = os.getenv("XDG_RUNTIME_DIR")
        if xdg_runtime_dir:
            has_xdg = True

        if "env" in config.execenv:
            # container engines need the env vars forwarded as CLI flags
            if any(exe in config.execenv.cmd for exe in ("apptainer", "singularity")):
                cmdline += apptainer_env_vars(config.execenv.env)
                # make XDG_RUNTIME_DIR reachable inside the container
                if has_xdg:
                    cmdline += [f"--bind={xdg_runtime_dir}"]

            elif "docker" in config.execenv.cmd:
                cmdline += docker_env_vars(config.execenv.env)

            elif "shifter" in config.execenv.cmd:
                cmdline += shifter_env_vars(config.execenv.env)

        # docker/shifter mount the runtime dir with --volume (not --bind)
        if (
            any(exe in config.execenv.cmd for exe in ("docker", "shifter"))
            and has_xdg
        ):
            cmdline += [f"--volume={xdg_runtime_dir}:{xdg_runtime_dir}"]

        # now we can add the arguments
        cmdline += shlex.split(config.execenv.arg)

    if as_string:
        return _execenv2str(cmdline, cmdenv) + " "

    return cmdline, cmdenv
91
+
92
+
93
def execenv_pyexe(
    config: AttrsDict, exename: str, as_string: bool = True
) -> str | tuple[list, dict]:
    """Returns the path to an executable installed in the virtualenv.

    For example: `apptainer run image.sif path/to/venv/bin/{exename}`

    Note
    ----
    If `as_string` is True, a space is appended to the returned string.
    """
    cfg = AttrsDict(config)

    # start from the execenv prefix, then point at the venv executable
    cmd_parts, cmd_environ = execenv_prefix(cfg, as_string=False)
    cmd_parts.append(f"{cfg.paths.install}/bin/{exename}")

    if not as_string:
        return cmd_parts, cmd_environ

    return _execenv2str(cmd_parts, cmd_environ) + " "
113
+
114
+
115
def dataflow() -> None:
    """dataflow's CLI for installing and loading the software in the data production environment.

    .. code-block:: console

        $ dataflow --help
        $ dataflow install --help  # help section for a specific sub-command
    """

    parser = argparse.ArgumentParser(
        prog="dataflow", description="dataflow's command-line interface"
    )

    parser.add_argument(
        "-v", "--verbose", help="increase verbosity", action="store_true"
    )

    subparsers = parser.add_subparsers()

    # "install" sub-command
    parser_install = subparsers.add_parser(
        "install", help="install user software in data production environment"
    )
    parser_install.add_argument(
        "config_file", help="production cycle configuration file"
    )
    parser_install.add_argument(
        "-s",
        "--system",
        help="system running on",
        default="bare",
        type=str,
        required=False,
    )
    parser_install.add_argument(
        "-r",
        "--remove",
        help="remove software directory before installing software",
        action="store_true",
    )
    parser_install.add_argument(
        "-e",
        "--editable",
        help="install software with pip's --editable flag",
        action="store_true",
    )
    parser_install.set_defaults(func=install)

    # "exec" sub-command
    parser_exec = subparsers.add_parser(
        "exec", help="load data production environment and execute a given command"
    )
    parser_exec.add_argument(
        "config_file", help="production cycle configuration file", type=str
    )
    parser_exec.add_argument(
        "-s",
        "--system",
        help="system running on",
        default="bare",
        type=str,
        required=False,
    )
    parser_exec.add_argument(
        "command", help="command to run within the container", type=str, nargs="+"
    )
    parser_exec.set_defaults(func=cmdexec)

    if len(sys.argv) < 2:
        parser.print_usage(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    if args.verbose:
        handler = colorlog.StreamHandler()
        handler.setFormatter(
            colorlog.ColoredFormatter(
                "%(log_color)s%(name)s [%(levelname)s] %(message)s"
            )
        )

        logger = logging.getLogger("legenddataflow")
        logger.setLevel(logging.DEBUG)
        logger.addHandler(handler)

    # FIX: `args.func` previously raised AttributeError when options were given
    # without a sub-command (e.g. `dataflow -v`): argparse only sets `func` via
    # the chosen sub-parser's set_defaults. Fall back to usage + exit instead.
    func = getattr(args, "func", None)
    if func is None:
        parser.print_usage(sys.stderr)
        sys.exit(1)

    func(args)
201
+
202
+
203
def install(args) -> None:
    """Installs user software in the data production environment.

    The software packages should be specified in the `config_file` with the
    format:

    .. code-block:: yaml

        pkg_versions:
          - python_package_spec
          - ...

    .. code-block:: console

        $ dataflow install config.yaml
        $ dataflow install --editable config.yaml  # install legend-dataflow in editable mode
        $ dataflow install --remove config.yaml  # remove install directory
    """
    config_dict = AttrsDict(dbetto.utils.load_dict(args.config_file))
    config_loc = Path(args.config_file).resolve().parent

    # expand $-variables; "_" refers to the config file's directory
    utils.subst_vars(
        config_dict, var_values={"_": config_loc}, use_env=True, ignore_missing=False
    )
    # narrow execenv to the entry for the requested system (default "bare")
    config_dict["execenv"] = config_dict["execenv"][args.system]

    # path to virtualenv location
    path_install = config_dict.paths.install

    if args.remove and Path(path_install).exists():
        msg = f"removing: {path_install}"
        log.info(msg)
        shutil.rmtree(path_install)

    def _runcmd(cmd_expr, cmd_env, **kwargs):
        # run a command with execenv variables layered onto the caller's env
        msg = "running: " + _execenv2str(cmd_expr, cmd_env)
        log.debug(msg)

        subprocess.run(cmd_expr, env=os.environ | cmd_env, check=True, **kwargs)

    cmd_prefix, cmd_env = execenv_prefix(config_dict, as_string=False)
    # HACK: get the full path to this python interpreter in case there is no execenv prefix
    python = sys.executable if cmd_prefix == [] else "python"
    python_venv, _ = execenv_pyexe(config_dict, "python", as_string=False)

    # we'll use uv from the virtualenv (installed below)
    uv_expr = [*python_venv, "-m", "uv"]  # , "--quiet"

    # otherwise use python-venv
    cmd_expr = [*cmd_prefix, python, "-m", "venv", path_install]

    msg = f"configuring virtual environment in {path_install}"
    log.info(msg)
    _runcmd(cmd_expr, cmd_env)

    cmd_expr = [
        *python_venv,
        "-m",
        "pip",
        "--quiet",
        "--no-cache-dir",
        "install",
        "--upgrade",
        "--",
        "pip",
    ]

    log.info("upgrading pip")
    _runcmd(cmd_expr, cmd_env)

    # install uv
    cmd_expr = [
        *python_venv,
        "-m",
        "pip",
        "--quiet",
        "--no-cache-dir",
        "install",
        "--no-warn-script-location",
        "--",
        "uv",
    ]

    log.info("installing uv")
    _runcmd(cmd_expr, cmd_env)

    # and finally install legenddataflow with all dependencies
    # this must be done within the execenv, since jobs will be run within it

    cmd_expr = [
        *uv_expr,
        "pip",
        "--no-cache",
        "install",
        "--prefix",
        path_install,
        str(config_loc),
    ]
    if args.editable:
        # insert the flag just before the final package-path argument
        cmd_expr.insert(-1, "--editable")

    log.info("installing packages")
    _runcmd(cmd_expr, cmd_env)
306
+
307
+
308
def cmdexec(args) -> None:
    """Load the data production environment and execute a given command."""
    cfg = AttrsDict(dbetto.utils.load_dict(args.config_file))
    cfg_dir = Path(args.config_file).resolve().parent

    # expand $-variables; "_" refers to the config file's directory
    utils.subst_vars(
        cfg,
        var_values={"_": cfg_dir},
        use_env=True,
        ignore_missing=False,
    )
    # narrow execenv to the entry for the requested system
    cfg["execenv"] = cfg["execenv"][args.system]

    prefix, env_vars = execenv_prefix(cfg, as_string=False)
    full_cmd = [*prefix, *args.command]

    msg = "running: " + _execenv2str(full_cmd, env_vars)
    log.debug(msg)

    subprocess.run(full_cmd, env=os.environ | env_vars, check=True)
@@ -0,0 +1,107 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ from dbetto.catalog import Props
9
+ from lgdo import lh5
10
+ from pygama.flow.file_db import FileDB
11
+
12
+
13
def build_filedb() -> None:
    """CLI entry point: build a pygama ``FileDB`` and write it to disk.

    Scans ``--scan-path`` according to the ``--config`` FileDB configuration,
    augments each row with the earliest timestamp found in its raw file,
    drops rows whose ``raw_file`` matches a key from ``--ignore-keys``, and
    writes the database to ``--output``.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--config", required=True)
    argparser.add_argument("--scan-path", required=True)
    argparser.add_argument("--output", required=True)
    argparser.add_argument("--ignore-keys", required=False)
    argparser.add_argument("--log")
    argparser.add_argument("--assume-nonsparse", action="store_true")
    args = argparser.parse_args()

    config = Props.read_from(args.config)

    # log to a file when requested, otherwise to the console
    if args.log is not None:
        Path(args.log).parent.mkdir(parents=True, exist_ok=True)
        logging.basicConfig(level=logging.DEBUG, filename=args.log, filemode="w")
    else:
        logging.basicConfig(level=logging.DEBUG)

    # silence chatty third-party loggers
    logging.getLogger("legendmeta").setLevel(logging.INFO)
    logging.getLogger("numba").setLevel(logging.INFO)
    logging.getLogger("parse").setLevel(logging.INFO)
    logging.getLogger("lgdo").setLevel(logging.INFO)
    logging.getLogger("h5py._conv").setLevel(logging.INFO)

    log = logging.getLogger(__name__)

    # files listed as "unprocessable" are skipped and their rows dropped
    if args.ignore_keys is not None:
        ignore = Props.read_from(args.ignore_keys)["unprocessable"]
    else:
        ignore = []

    fdb = FileDB(config, scan=False)
    fdb.scan_files([args.scan_path])
    fdb.scan_tables_columns(dir_files_conform=True)

    # augment dataframe with earliest timestamp found in file

    default = np.finfo("float64").max  # sentinel: "no timestamp found"
    timestamps = np.zeros(len(fdb.df), dtype="float64")

    drop_rows = []
    for i, row in enumerate(fdb.df.itertuples()):
        if any(key in row.raw_file for key in ignore):
            drop_rows.append(i)
            continue

        # NOTE(review): a new store is built every iteration although its
        # base_path is loop-invariant — looks hoistable; confirm keep_open
        # semantics before changing
        store = lh5.LH5Store(
            base_path=f"{fdb.data_dir}/{fdb.tier_dirs['raw']}", keep_open=True
        )

        # list of first timestamps for each channel
        loc_timestamps = np.full(
            len(row.raw_tables), fill_value=default, dtype="float64"
        )

        msg = f"finding first timestamp in {fdb.data_dir}/{fdb.tier_dirs['raw']}/{row.raw_file}"
        log.info(msg)

        found = False
        for j, table in enumerate(row.raw_tables):
            try:
                # read only the first timestamp sample of this channel table
                # NOTE(review): .strip("/") trims slashes from BOTH ends;
                # presumably only a leading slash is intended — confirm
                loc_timestamps[j] = store.read(
                    fdb.table_format["raw"].format(ch=table) + "/timestamp",
                    row.raw_file.strip("/"),
                    n_rows=1,
                )[0]
                found = True
            except KeyError:
                # channel table absent from this file (sparse data)
                pass

            # with non-sparse data one successful read suffices
            if found and args.assume_nonsparse:
                break

        if (
            (loc_timestamps == default).all() or not found
        ) and row.raw_file not in ignore:
            msg = "something went wrong! no valid first timestamp found. Likely: the file is empty"
            raise RuntimeError(msg)

        timestamps[i] = np.min(loc_timestamps)

        msg = f"found {timestamps[i]}"
        log.info(msg)

        # sanity window: positive and before 4102444800 s == 2100-01-01 UTC
        if (
            timestamps[i] < 0 or timestamps[i] > 4102444800
        ) and row.raw_file not in ignore:
            msg = f"something went wrong! timestamp {timestamps[i]} does not make sense"
            raise RuntimeError(msg)

    fdb.df["first_timestamp"] = timestamps

    # drop rows whose raw_file matched an ignore key
    fdb.df = fdb.df.drop(drop_rows)

    fdb.to_disk(args.output, wo_mode="of")
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import UTC, datetime
4
+ from pathlib import Path
5
+
6
+ from dbetto import TextDB
7
+ from dbetto.catalog import Catalog
8
+
9
+
10
def pre_compile_catalog(validity_path: str | Path):
    """Pre-compile a validity catalog by resolving every entry's database.

    Reads ``validity.yaml`` from *validity_path* and replaces each entry's
    lazy reference with the fully loaded :class:`TextDB` snapshot valid at
    that entry's start time.
    """
    base = Path(validity_path)

    catalog = Catalog.read_from(base / "validity.yaml")
    textdb = TextDB(base, lazy=False)

    compiled = {}
    for system in catalog.entries:
        compiled[system] = []
        for old_entry in catalog.entries[system]:
            # resolve the database snapshot valid at this entry's start time
            snapshot = textdb.on(
                datetime.fromtimestamp(old_entry.valid_from, tz=UTC), system=system
            )
            compiled[system].append(Catalog.Entry(old_entry.valid_from, snapshot))

    return Catalog(compiled)
@@ -0,0 +1,113 @@
1
+ """
2
+ This module contains all the utility needed for the data production.
3
+ for substituting the pathvar in the config, also the conversion
4
+ from timestamp to unix time
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import copy
10
+ import os
11
+ import re
12
+ import string
13
+ from pathlib import Path
14
+
15
+
16
def subst_vars_impl(x, var_values, ignore_missing=False):
    """Recursively substitute ``$``-style template variables in *x*.

    Strings are substituted via :class:`string.Template`; dicts and lists are
    updated in place (and also returned). Non-container, non-string values are
    returned unchanged. If *ignore_missing* is true, unknown variables are
    left as-is instead of raising :class:`KeyError`.
    """
    if isinstance(x, str):
        # fast path: nothing to substitute
        if "$" not in x:
            return x
        template = string.Template(x)
        if ignore_missing:
            return template.safe_substitute(var_values)
        return template.substitute(var_values)

    if isinstance(x, dict):
        for key, old in x.items():
            new = subst_vars_impl(old, var_values, ignore_missing)
            if new is not old:
                x[key] = new
        return x

    if isinstance(x, list):
        for idx, old in enumerate(x):
            new = subst_vars_impl(old, var_values, ignore_missing)
            if new is not old:
                x[idx] = new
        return x

    return x
38
+
39
+
40
def subst_vars(
    props,
    var_values=None,
    use_env=False,
    ignore_missing=False,
):
    """Substitute ``$``-style variables in *props*, in place.

    Parameters
    ----------
    props
        dict/list/str structure whose strings are substituted.
    var_values
        mapping of variable names to replacement values.
    use_env
        if True, environment variables are also available for substitution;
        explicit *var_values* take precedence on conflicts.
    ignore_missing
        if True, unknown variables are left untouched instead of raising.
    """
    if var_values is None:
        var_values = {}

    if use_env:
        # environment first, then explicit values so they win on conflict
        lookup = dict(os.environ)
        lookup.update(copy.copy(var_values))
    else:
        lookup = var_values

    subst_vars_impl(props, lookup, ignore_missing)
53
+
54
+
55
def subst_vars_in_snakemake_config(workflow, config):
    """Expand path variables in a Snakemake config dict, in place.

    The special variable ``_`` expands to the directory containing the first
    configfile; environment variables are substituted too. Afterwards the
    ``execenv`` mapping is narrowed to the entry for ``config["system"]``
    (falling back to ``"bare"``).
    """
    if len(workflow.overwrite_configfiles) == 0:
        msg = "configfile not set!"
        raise RuntimeError(msg)

    cfg_file = workflow.overwrite_configfiles[0]

    subst_vars(
        config,
        var_values={"_": Path(cfg_file).parent},
        use_env=True,
        ignore_missing=False,
    )

    selected = config["system"] if "system" in config else "bare"
    config["execenv"] = config["execenv"][selected]
72
+
73
+
74
def set_last_rule_name(workflow, new_name):
    """Sets the name of the most recently created rule to be `new_name`.
    Useful when creating rules dynamically (i.e. unnamed).

    Warning
    -------
    This could mess up the workflow. Use at your own risk.
    """
    # Snakemake keeps rules in an insertion-ordered mapping, so the most
    # recently created rule is the last key.
    rules = workflow._rules
    last_key = next(reversed(rules))
    # internal invariant: the mapping key equals the rule's own name
    assert last_key == rules[last_key].name

    # re-insert under the new key (stays last in insertion order)
    rules[new_name] = rules.pop(last_key)
    rules[new_name].name = new_name

    # keep the default target pointing at the renamed rule
    if workflow.default_target == last_key:
        workflow.default_target = new_name

    # keep the localrules set consistent with the rename
    if last_key in workflow._localrules:
        workflow._localrules.remove(last_key)
        workflow._localrules.add(new_name)

    workflow.check_localrules()
97
+
98
+
99
def as_ro(config, path):
    """Map *path* onto its read-only filesystem equivalent.

    Applies the two-element regex substitution ``config["read_only_fs_sub_pattern"]``
    (``(pattern, replacement)``) to *path*. Strings and :class:`pathlib.Path`
    objects are transformed individually; any other iterable is mapped
    element-wise. If no substitution pattern is configured, *path* is
    returned unchanged.
    """
    if (
        "read_only_fs_sub_pattern" not in config
        or config["read_only_fs_sub_pattern"] is None
    ):
        return path

    sub_pattern = config["read_only_fs_sub_pattern"]

    if isinstance(path, str):
        return re.sub(*sub_pattern, path)
    if isinstance(path, Path):
        # FIX: substitute on the full path string. Previously this used
        # `path.name` (the basename only), which dropped every directory
        # component and could never match a path-prefix pattern.
        return Path(re.sub(*sub_pattern, str(path)))

    return [as_ro(config, p) for p in path]