legend-dataflow-scripts 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. legend_dataflow_scripts-0.1.0.dist-info/METADATA +57 -0
  2. legend_dataflow_scripts-0.1.0.dist-info/RECORD +36 -0
  3. legend_dataflow_scripts-0.1.0.dist-info/WHEEL +5 -0
  4. legend_dataflow_scripts-0.1.0.dist-info/entry_points.txt +18 -0
  5. legend_dataflow_scripts-0.1.0.dist-info/top_level.txt +1 -0
  6. legenddataflowscripts/__init__.py +17 -0
  7. legenddataflowscripts/_version.py +21 -0
  8. legenddataflowscripts/par/__init__.py +0 -0
  9. legenddataflowscripts/par/geds/__init__.py +0 -0
  10. legenddataflowscripts/par/geds/dsp/__init__.py +0 -0
  11. legenddataflowscripts/par/geds/dsp/dplms.py +145 -0
  12. legenddataflowscripts/par/geds/dsp/eopt.py +398 -0
  13. legenddataflowscripts/par/geds/dsp/evtsel.py +400 -0
  14. legenddataflowscripts/par/geds/dsp/nopt.py +120 -0
  15. legenddataflowscripts/par/geds/dsp/pz.py +217 -0
  16. legenddataflowscripts/par/geds/dsp/svm.py +28 -0
  17. legenddataflowscripts/par/geds/dsp/svm_build.py +69 -0
  18. legenddataflowscripts/par/geds/hit/__init__.py +0 -0
  19. legenddataflowscripts/par/geds/hit/aoe.py +245 -0
  20. legenddataflowscripts/par/geds/hit/ecal.py +778 -0
  21. legenddataflowscripts/par/geds/hit/lq.py +213 -0
  22. legenddataflowscripts/par/geds/hit/qc.py +326 -0
  23. legenddataflowscripts/tier/__init__.py +0 -0
  24. legenddataflowscripts/tier/dsp.py +263 -0
  25. legenddataflowscripts/tier/hit.py +148 -0
  26. legenddataflowscripts/utils/__init__.py +15 -0
  27. legenddataflowscripts/utils/alias_table.py +28 -0
  28. legenddataflowscripts/utils/cfgtools.py +14 -0
  29. legenddataflowscripts/utils/convert_np.py +31 -0
  30. legenddataflowscripts/utils/log.py +77 -0
  31. legenddataflowscripts/utils/pulser_removal.py +16 -0
  32. legenddataflowscripts/workflow/__init__.py +20 -0
  33. legenddataflowscripts/workflow/execenv.py +327 -0
  34. legenddataflowscripts/workflow/filedb.py +107 -0
  35. legenddataflowscripts/workflow/pre_compile_catalog.py +24 -0
  36. legenddataflowscripts/workflow/utils.py +113 -0
@@ -0,0 +1,327 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+ import shlex
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ from collections.abc import Iterable, Mapping
11
+ from pathlib import Path
12
+
13
+ import colorlog
14
+ import dbetto
15
+ from dbetto import AttrsDict
16
+
17
+ from . import utils
18
+
19
+ log = logging.getLogger(__name__)
20
+
21
+
22
+ def _execenv2str(cmd_expr: Iterable, cmd_env: Mapping) -> str:
23
+ return " ".join([f"{k}={v}" for k, v in cmd_env.items()]) + " " + " ".join(cmd_expr)
24
+
25
+
26
def apptainer_env_vars(cmdenv: Mapping) -> list[str]:
    """Translate an environment mapping into apptainer ``--env`` flags."""
    flags = []
    for name, value in cmdenv.items():
        flags.append(f"--env={name}={value}")
    return flags
28
+
29
+
30
def docker_env_vars(cmdenv: Mapping) -> list[str]:
    """Translate an environment mapping into docker ``--env`` flags.

    Docker uses the same ``--env=VAR=VAL`` syntax as apptainer.
    """
    return [f"--env={var}={val}" for var, val in cmdenv.items()]
33
+
34
+
35
def shifter_env_vars(cmdenv: Mapping) -> list[str]:
    """Translate an environment mapping into shifter ``--env`` flags.

    Shifter uses the same ``--env=VAR=VAL`` syntax as apptainer.
    """
    return [f"--env={var}={val}" for var, val in cmdenv.items()]
38
+
39
+
40
def execenv_prefix(
    config: AttrsDict, as_string: bool = True
) -> str | tuple[list, dict]:
    """Returns the software environment command prefix.

    For example: `apptainer run image.sif`

    Note
    ----
    If `as_string` is True, a space is appended to the returned string.
    """
    config = AttrsDict(config)

    cmdline = []
    cmdenv = {}
    # environment variables to export into the execution environment
    if "execenv" in config and "env" in config.execenv:
        cmdenv |= config.execenv.env

    # a command prefix is only built when both "cmd" and "arg" are configured
    if "execenv" in config and "cmd" in config.execenv and "arg" in config.execenv:
        cmdline = shlex.split(config.execenv.cmd)

        has_xdg = False
        xdg_runtime_dir = os.getenv("XDG_RUNTIME_DIR")
        if xdg_runtime_dir:
            has_xdg = True

        if "env" in config.execenv:
            # container engines need the env vars forwarded as CLI flags
            if any(exe in config.execenv.cmd for exe in ("apptainer", "singularity")):
                cmdline += apptainer_env_vars(config.execenv.env)
                # make XDG_RUNTIME_DIR reachable inside the container
                if has_xdg:
                    cmdline += [f"--bind={xdg_runtime_dir}"]

            elif "docker" in config.execenv.cmd:
                cmdline += docker_env_vars(config.execenv.env)

            elif "shifter" in config.execenv.cmd:
                cmdline += shifter_env_vars(config.execenv.env)

        # docker/shifter mount the runtime dir with --volume (not --bind)
        if (
            any(exe in config.execenv.cmd for exe in ("docker", "shifter"))
            and has_xdg
        ):
            cmdline += [f"--volume={xdg_runtime_dir}:{xdg_runtime_dir}"]

        # now we can add the arguments
        cmdline += shlex.split(config.execenv.arg)

    if as_string:
        return _execenv2str(cmdline, cmdenv) + " "

    return cmdline, cmdenv
91
+
92
+
93
def execenv_pyexe(
    config: AttrsDict, exename: str, as_string: bool = True
) -> str | tuple[list, dict]:
    """Returns the path to an executable installed in the virtualenv.

    For example: `apptainer run image.sif path/to/venv/bin/{exename}`

    Note
    ----
    If `as_string` is True, a space is appended to the returned string.
    """
    cfg = AttrsDict(config)

    # start from the execenv prefix, then point at the venv executable
    cmd_parts, cmd_environ = execenv_prefix(cfg, as_string=False)
    cmd_parts.append(f"{cfg.paths.install}/bin/{exename}")

    if not as_string:
        return cmd_parts, cmd_environ

    return _execenv2str(cmd_parts, cmd_environ) + " "
113
+
114
+
115
def dataflow() -> None:
    """dataflow's CLI for installing and loading the software in the data production environment.

    .. code-block:: console

        $ dataflow --help
        $ dataflow install --help  # help section for a specific sub-command
    """

    parser = argparse.ArgumentParser(
        prog="dataflow", description="dataflow's command-line interface"
    )

    parser.add_argument(
        "-v", "--verbose", help="increase verbosity", action="store_true"
    )

    subparsers = parser.add_subparsers()

    # "install" sub-command
    parser_install = subparsers.add_parser(
        "install", help="install user software in data production environment"
    )
    parser_install.add_argument(
        "config_file", help="production cycle configuration file"
    )
    parser_install.add_argument(
        "-s",
        "--system",
        help="system running on",
        default="bare",
        type=str,
        required=False,
    )
    parser_install.add_argument(
        "-r",
        "--remove",
        help="remove software directory before installing software",
        action="store_true",
    )
    parser_install.add_argument(
        "-e",
        "--editable",
        help="install software with pip's --editable flag",
        action="store_true",
    )
    parser_install.set_defaults(func=install)

    # "exec" sub-command
    parser_exec = subparsers.add_parser(
        "exec", help="load data production environment and execute a given command"
    )
    parser_exec.add_argument(
        "config_file", help="production cycle configuration file", type=str
    )
    parser_exec.add_argument(
        "-s",
        "--system",
        help="system running on",
        default="bare",
        type=str,
        required=False,
    )
    parser_exec.add_argument(
        "command", help="command to run within the container", type=str, nargs="+"
    )
    parser_exec.set_defaults(func=cmdexec)

    if len(sys.argv) < 2:
        parser.print_usage(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    if args.verbose:
        handler = colorlog.StreamHandler()
        handler.setFormatter(
            colorlog.ColoredFormatter(
                "%(log_color)s%(name)s [%(levelname)s] %(message)s"
            )
        )

        logger = logging.getLogger("legenddataflow")
        logger.setLevel(logging.DEBUG)
        logger.addHandler(handler)

    # FIX: `args.func` previously raised AttributeError when options were given
    # without a sub-command (e.g. `dataflow -v`): argparse only sets `func` via
    # the chosen sub-parser's set_defaults. Fall back to usage + exit instead.
    func = getattr(args, "func", None)
    if func is None:
        parser.print_usage(sys.stderr)
        sys.exit(1)

    func(args)
201
+
202
+
203
def install(args) -> None:
    """Installs user software in the data production environment.

    The software packages should be specified in the `config_file` with the
    format:

    .. code-block:: yaml

        pkg_versions:
          - python_package_spec
          - ...

    .. code-block:: console

        $ dataflow install config.yaml
        $ dataflow install --editable config.yaml  # install legend-dataflow in editable mode
        $ dataflow install --remove config.yaml  # remove install directory
    """
    config_dict = AttrsDict(dbetto.utils.load_dict(args.config_file))
    config_loc = Path(args.config_file).resolve().parent

    # expand $-variables; "_" refers to the config file's directory
    utils.subst_vars(
        config_dict, var_values={"_": config_loc}, use_env=True, ignore_missing=False
    )
    # narrow execenv to the entry for the requested system (default "bare")
    config_dict["execenv"] = config_dict["execenv"][args.system]

    # path to virtualenv location
    path_install = config_dict.paths.install

    if args.remove and Path(path_install).exists():
        msg = f"removing: {path_install}"
        log.info(msg)
        shutil.rmtree(path_install)

    def _runcmd(cmd_expr, cmd_env, **kwargs):
        # run a command with execenv variables layered onto the caller's env
        msg = "running: " + _execenv2str(cmd_expr, cmd_env)
        log.debug(msg)

        subprocess.run(cmd_expr, env=os.environ | cmd_env, check=True, **kwargs)

    cmd_prefix, cmd_env = execenv_prefix(config_dict, as_string=False)
    # HACK: get the full path to this python interpreter in case there is no execenv prefix
    python = sys.executable if cmd_prefix == [] else "python"
    python_venv, _ = execenv_pyexe(config_dict, "python", as_string=False)

    # we'll use uv from the virtualenv (installed below)
    uv_expr = [*python_venv, "-m", "uv"]  # , "--quiet"

    # otherwise use python-venv
    cmd_expr = [*cmd_prefix, python, "-m", "venv", path_install]

    msg = f"configuring virtual environment in {path_install}"
    log.info(msg)
    _runcmd(cmd_expr, cmd_env)

    cmd_expr = [
        *python_venv,
        "-m",
        "pip",
        "--quiet",
        "--no-cache-dir",
        "install",
        "--upgrade",
        "--",
        "pip",
    ]

    log.info("upgrading pip")
    _runcmd(cmd_expr, cmd_env)

    # install uv
    cmd_expr = [
        *python_venv,
        "-m",
        "pip",
        "--quiet",
        "--no-cache-dir",
        "install",
        "--no-warn-script-location",
        "--",
        "uv",
    ]

    log.info("installing uv")
    _runcmd(cmd_expr, cmd_env)

    # and finally install legenddataflow with all dependencies
    # this must be done within the execenv, since jobs will be run within it

    cmd_expr = [
        *uv_expr,
        "pip",
        "--no-cache",
        "install",
        "--prefix",
        path_install,
        str(config_loc),
    ]
    if args.editable:
        # insert the flag just before the final package-path argument
        cmd_expr.insert(-1, "--editable")

    log.info("installing packages")
    _runcmd(cmd_expr, cmd_env)
306
+
307
+
308
def cmdexec(args) -> None:
    """Load the data production environment and execute a given command."""
    cfg = AttrsDict(dbetto.utils.load_dict(args.config_file))
    cfg_dir = Path(args.config_file).resolve().parent

    # expand $-variables; "_" refers to the config file's directory
    utils.subst_vars(
        cfg,
        var_values={"_": cfg_dir},
        use_env=True,
        ignore_missing=False,
    )
    # narrow execenv to the entry for the requested system
    cfg["execenv"] = cfg["execenv"][args.system]

    prefix, env_vars = execenv_prefix(cfg, as_string=False)
    full_cmd = [*prefix, *args.command]

    msg = "running: " + _execenv2str(full_cmd, env_vars)
    log.debug(msg)

    subprocess.run(full_cmd, env=os.environ | env_vars, check=True)
@@ -0,0 +1,107 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ from dbetto.catalog import Props
9
+ from lgdo import lh5
10
+ from pygama.flow.file_db import FileDB
11
+
12
+
13
def build_filedb() -> None:
    """CLI entry point: build a pygama ``FileDB`` and write it to disk.

    Scans ``--scan-path`` according to the ``--config`` FileDB configuration,
    augments each row with the earliest timestamp found in its raw file,
    drops rows whose ``raw_file`` matches a key from ``--ignore-keys``, and
    writes the database to ``--output``.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--config", required=True)
    argparser.add_argument("--scan-path", required=True)
    argparser.add_argument("--output", required=True)
    argparser.add_argument("--ignore-keys", required=False)
    argparser.add_argument("--log")
    argparser.add_argument("--assume-nonsparse", action="store_true")
    args = argparser.parse_args()

    config = Props.read_from(args.config)

    # log to a file when requested, otherwise to the console
    if args.log is not None:
        Path(args.log).parent.mkdir(parents=True, exist_ok=True)
        logging.basicConfig(level=logging.DEBUG, filename=args.log, filemode="w")
    else:
        logging.basicConfig(level=logging.DEBUG)

    # silence chatty third-party loggers
    logging.getLogger("legendmeta").setLevel(logging.INFO)
    logging.getLogger("numba").setLevel(logging.INFO)
    logging.getLogger("parse").setLevel(logging.INFO)
    logging.getLogger("lgdo").setLevel(logging.INFO)
    logging.getLogger("h5py._conv").setLevel(logging.INFO)

    log = logging.getLogger(__name__)

    # files listed as "unprocessable" are skipped and their rows dropped
    if args.ignore_keys is not None:
        ignore = Props.read_from(args.ignore_keys)["unprocessable"]
    else:
        ignore = []

    fdb = FileDB(config, scan=False)
    fdb.scan_files([args.scan_path])
    fdb.scan_tables_columns(dir_files_conform=True)

    # augment dataframe with earliest timestamp found in file

    default = np.finfo("float64").max  # sentinel: "no timestamp found"
    timestamps = np.zeros(len(fdb.df), dtype="float64")

    drop_rows = []
    for i, row in enumerate(fdb.df.itertuples()):
        if any(key in row.raw_file for key in ignore):
            drop_rows.append(i)
            continue

        # NOTE(review): a new store is built every iteration although its
        # base_path is loop-invariant — looks hoistable; confirm keep_open
        # semantics before changing
        store = lh5.LH5Store(
            base_path=f"{fdb.data_dir}/{fdb.tier_dirs['raw']}", keep_open=True
        )

        # list of first timestamps for each channel
        loc_timestamps = np.full(
            len(row.raw_tables), fill_value=default, dtype="float64"
        )

        msg = f"finding first timestamp in {fdb.data_dir}/{fdb.tier_dirs['raw']}/{row.raw_file}"
        log.info(msg)

        found = False
        for j, table in enumerate(row.raw_tables):
            try:
                # read only the first timestamp sample of this channel table
                # NOTE(review): .strip("/") trims slashes from BOTH ends;
                # presumably only a leading slash is intended — confirm
                loc_timestamps[j] = store.read(
                    fdb.table_format["raw"].format(ch=table) + "/timestamp",
                    row.raw_file.strip("/"),
                    n_rows=1,
                )[0]
                found = True
            except KeyError:
                # channel table absent from this file (sparse data)
                pass

            # with non-sparse data one successful read suffices
            if found and args.assume_nonsparse:
                break

        if (
            (loc_timestamps == default).all() or not found
        ) and row.raw_file not in ignore:
            msg = "something went wrong! no valid first timestamp found. Likely: the file is empty"
            raise RuntimeError(msg)

        timestamps[i] = np.min(loc_timestamps)

        msg = f"found {timestamps[i]}"
        log.info(msg)

        # sanity window: positive and before 4102444800 s == 2100-01-01 UTC
        if (
            timestamps[i] < 0 or timestamps[i] > 4102444800
        ) and row.raw_file not in ignore:
            msg = f"something went wrong! timestamp {timestamps[i]} does not make sense"
            raise RuntimeError(msg)

    fdb.df["first_timestamp"] = timestamps

    # drop rows whose raw_file matched an ignore key
    fdb.df = fdb.df.drop(drop_rows)

    fdb.to_disk(args.output, wo_mode="of")
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import UTC, datetime
4
+ from pathlib import Path
5
+
6
+ from dbetto import TextDB
7
+ from dbetto.catalog import Catalog
8
+
9
+
10
def pre_compile_catalog(validity_path: str | Path):
    """Pre-compile a validity catalog by resolving every entry's database.

    Reads ``validity.yaml`` from *validity_path* and replaces each entry's
    lazy reference with the fully loaded :class:`TextDB` snapshot valid at
    that entry's start time.
    """
    base = Path(validity_path)

    catalog = Catalog.read_from(base / "validity.yaml")
    textdb = TextDB(base, lazy=False)

    compiled = {}
    for system in catalog.entries:
        compiled[system] = []
        for old_entry in catalog.entries[system]:
            # resolve the database snapshot valid at this entry's start time
            snapshot = textdb.on(
                datetime.fromtimestamp(old_entry.valid_from, tz=UTC), system=system
            )
            compiled[system].append(Catalog.Entry(old_entry.valid_from, snapshot))

    return Catalog(compiled)
@@ -0,0 +1,113 @@
1
+ """
2
+ This module contains all the utility needed for the data production.
3
+ for substituting the pathvar in the config, also the conversion
4
+ from timestamp to unix time
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import copy
10
+ import os
11
+ import re
12
+ import string
13
+ from pathlib import Path
14
+
15
+
16
def subst_vars_impl(x, var_values, ignore_missing=False):
    """Recursively substitute ``$``-style template variables in *x*.

    Strings are substituted via :class:`string.Template`; dicts and lists are
    updated in place (and also returned). Non-container, non-string values are
    returned unchanged. If *ignore_missing* is true, unknown variables are
    left as-is instead of raising :class:`KeyError`.
    """
    if isinstance(x, str):
        # fast path: nothing to substitute
        if "$" not in x:
            return x
        template = string.Template(x)
        if ignore_missing:
            return template.safe_substitute(var_values)
        return template.substitute(var_values)

    if isinstance(x, dict):
        for key, old in x.items():
            new = subst_vars_impl(old, var_values, ignore_missing)
            if new is not old:
                x[key] = new
        return x

    if isinstance(x, list):
        for idx, old in enumerate(x):
            new = subst_vars_impl(old, var_values, ignore_missing)
            if new is not old:
                x[idx] = new
        return x

    return x
38
+
39
+
40
def subst_vars(
    props,
    var_values=None,
    use_env=False,
    ignore_missing=False,
):
    """Substitute ``$``-style variables in *props*, in place.

    Parameters
    ----------
    props
        dict/list/str structure whose strings are substituted.
    var_values
        mapping of variable names to replacement values.
    use_env
        if True, environment variables are also available for substitution;
        explicit *var_values* take precedence on conflicts.
    ignore_missing
        if True, unknown variables are left untouched instead of raising.
    """
    if var_values is None:
        var_values = {}

    if use_env:
        # environment first, then explicit values so they win on conflict
        lookup = dict(os.environ)
        lookup.update(copy.copy(var_values))
    else:
        lookup = var_values

    subst_vars_impl(props, lookup, ignore_missing)
53
+
54
+
55
def subst_vars_in_snakemake_config(workflow, config):
    """Expand path variables in a Snakemake config dict, in place.

    The special variable ``_`` expands to the directory containing the first
    configfile; environment variables are substituted too. Afterwards the
    ``execenv`` mapping is narrowed to the entry for ``config["system"]``
    (falling back to ``"bare"``).
    """
    if len(workflow.overwrite_configfiles) == 0:
        msg = "configfile not set!"
        raise RuntimeError(msg)

    cfg_file = workflow.overwrite_configfiles[0]

    subst_vars(
        config,
        var_values={"_": Path(cfg_file).parent},
        use_env=True,
        ignore_missing=False,
    )

    selected = config["system"] if "system" in config else "bare"
    config["execenv"] = config["execenv"][selected]
72
+
73
+
74
def set_last_rule_name(workflow, new_name):
    """Sets the name of the most recently created rule to be `new_name`.
    Useful when creating rules dynamically (i.e. unnamed).

    Warning
    -------
    This could mess up the workflow. Use at your own risk.
    """
    # Snakemake keeps rules in an insertion-ordered mapping, so the most
    # recently created rule is the last key.
    rules = workflow._rules
    last_key = next(reversed(rules))
    # internal invariant: the mapping key equals the rule's own name
    assert last_key == rules[last_key].name

    # re-insert under the new key (stays last in insertion order)
    rules[new_name] = rules.pop(last_key)
    rules[new_name].name = new_name

    # keep the default target pointing at the renamed rule
    if workflow.default_target == last_key:
        workflow.default_target = new_name

    # keep the localrules set consistent with the rename
    if last_key in workflow._localrules:
        workflow._localrules.remove(last_key)
        workflow._localrules.add(new_name)

    workflow.check_localrules()
97
+
98
+
99
def as_ro(config, path):
    """Map *path* onto its read-only filesystem equivalent.

    Applies the two-element regex substitution ``config["read_only_fs_sub_pattern"]``
    (``(pattern, replacement)``) to *path*. Strings and :class:`pathlib.Path`
    objects are transformed individually; any other iterable is mapped
    element-wise. If no substitution pattern is configured, *path* is
    returned unchanged.
    """
    if (
        "read_only_fs_sub_pattern" not in config
        or config["read_only_fs_sub_pattern"] is None
    ):
        return path

    sub_pattern = config["read_only_fs_sub_pattern"]

    if isinstance(path, str):
        return re.sub(*sub_pattern, path)
    if isinstance(path, Path):
        # FIX: substitute on the full path string. Previously this used
        # `path.name` (the basename only), which dropped every directory
        # component and could never match a path-prefix pattern.
        return Path(re.sub(*sub_pattern, str(path)))

    return [as_ro(config, p) for p in path]