PyPI - triggerflow - Versions diffs - 0.1.12__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

triggerflow 0.1.12__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99) hide show

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/base_dataset.py ADDED Viewed

@@ -0,0 +1,137 @@
+import logging, uproot, json, os
+import pandas as pd
+import numpy as np
+from abc import abstractmethod
+from fnmatch import filter as fnmatch_filter
+from kedro.io import AbstractDataset
+class BaseDataset(AbstractDataset):
+    """
+    Abstract base class for loading data from ROOT files into a pandas DataFrame.
+    Users must inherit from this class and implement the abstract methods
+    (`get_branches_to_keep`, `get_cut`, `convert_to_pandas`).
+    The core processing logic in `_load` is fixed and is not meant to be overridden.
+    """
+    def __init__(self, sample_info: str, sample_key: str):
+        """
+        Read the sample description for this dataset.
+        Args:
+            sample_info: path to a JSON file describing the samples.
+            sample_key: top-level key of that JSON whose entry belongs to this dataset.
+        Raises:
+            OSError: if the JSON file cannot be opened.
+            KeyError: if `sample_key` is not a key of the JSON document.
+        """
+        with open(sample_info, "r") as f:
+            data = json.load(f)
+        self._sample_info = data[sample_key]
+        self._sample_key = sample_key
+        # get logger for reporting
+        self.logger = logging.getLogger(__name__)
+        self.logger.info(f"Initializing dataset: {self.__class__.__name__}")
+    @abstractmethod
+    def get_branches_to_keep(self) -> list[str]:
+        """
+        USER MUST IMPLEMENT: Return a list of branch names or patterns (with wildcards)
+        to keep from the ROOT file.
+        Example:
+            return ["Jet_*", "PuppiMET_pt", "nJet"]
+        """
+        pass
+    @abstractmethod
+    def get_cut(self) -> str | None:
+        """
+        USER MUST IMPLEMENT: Return a string representing the cuts to apply to the data.
+        The value is passed unchanged as `cut=` to `tree.arrays` in `_load`.
+        """
+        pass
+    @abstractmethod
+    def convert_to_pandas(self, data: dict) -> pd.DataFrame:
+        """
+        USER MUST IMPLEMENT: Convert the loaded data from a dictionary format to a pandas DataFrame.
+        """
+        pass
+    def get_tree_name(self) -> str:
+        """Return the name of the tree read from each ROOT file (override to change it)."""
+        return "Events"
+    def _resolve_branches(self, all_branches: list) -> list[str]:
+        """
+        Internal method to resolve wildcard patterns.
+        Args:
+            all_branches: branch names available in the tree.
+        Returns:
+            Sorted, de-duplicated branch names matching any pattern from
+            `get_branches_to_keep`. A warning is logged for each pattern without a match.
+        """
+        selected = []
+        for pattern in self.get_branches_to_keep():
+            matched = fnmatch_filter(all_branches, pattern)
+            if not matched:
+                self.logger.warning(f"Pattern '{pattern}' did not match any branches.")
+            selected.extend(matched)
+        return sorted(set(selected))
+    def _load(self) -> pd.DataFrame:
+        """
+        CORE LOGIC (NOT OVERRIDABLE): Loads and processes every file of every sample
+        entry and returns the row-wise concatenation of the per-file DataFrames.
+        Each entry of the sample info must provide "folder", "file_pattern"
+        (an iterable of wildcard patterns) and "is_signal".
+        Returns:
+            One DataFrame with an extra "is_signal" column; an empty DataFrame when
+            no file produced any data. Row indices of the per-file frames are kept.
+        """
+        # Resolve the file list of every sample entry before anything is read.
+        all_root_files = []
+        is_signals = []
+        for info in self._sample_info.values():
+            folder = info["folder"]
+            files = os.listdir(folder)
+            cur_files = [
+                os.path.join(folder, f)
+                for file_pattern in info["file_pattern"]
+                for f in fnmatch_filter(files, file_pattern)
+            ]
+            all_root_files.append(cur_files)
+            is_signals.append(info["is_signal"])
+        # Collect the per-file frames and concatenate once at the end: calling
+        # pd.concat inside the loop copies all rows gathered so far on every file.
+        frames = []
+        self.logger.info("Processing files")
+        for root_files, is_signal in zip(all_root_files, is_signals):
+            self.logger.info(f"Processing files: {root_files}")
+            for root_file in root_files:
+                if root_file == "data/01_raw/samples_dummy.json":
+                    # Placeholder sample: generate random data instead of reading a ROOT file.
+                    n = 100
+                    # generate dummy features
+                    dummy_data = {
+                        branch: np.random.randn(n)
+                        for branch in self.get_branches_to_keep()
+                    }
+                    dummy_data["is_signal"] = np.ones(n) if is_signal else np.zeros(n)
+                    cur_df = pd.DataFrame(dummy_data)
+                    # generate a binary target (0/1)
+                    cur_df["y"] = np.random.choice([0, 1], size=n)
+                    frames.append(cur_df)
+                else:
+                    with uproot.open(root_file) as f:
+                        tree = f[self.get_tree_name()]
+                        all_branches = tree.keys()
+                        branches_to_load = self._resolve_branches(all_branches)
+                        if not branches_to_load:
+                            self.logger.warning(
+                                f"No valid branches to load for {root_file}. Skipping."
+                            )
+                            continue
+                        data = tree.arrays(branches_to_load, cut=self.get_cut())
+                        cur_df = self.convert_to_pandas(data)
+                        # set background or signal
+                        cur_df["is_signal"] = 1 if is_signal else 0
+                        frames.append(cur_df)
+        if not frames:
+            return pd.DataFrame()
+        return pd.concat(frames)
+    def _save(self, data: pd.DataFrame) -> pd.DataFrame:
+        """Nothing is written; the data is handed back unchanged."""
+        return data
+    def _describe(self) -> dict:
+        """Return the attributes that describe this dataset instance."""
+        return {"output_sample_info": self._sample_info, "sample_key": self._sample_key}

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/meta_dataset.py ADDED Viewed

@@ -0,0 +1,88 @@
+import logging, json
+from glob import glob
+from kedro.io import AbstractDataset
+# Passed as `config` to `MetaDataset.get_dasgoclient_metadata` from `_save`; that
+# method does not read it at the moment (its DAS query is commented out).
+METADATA_CONFIG = {"x": 0}
+class MetaDataset(AbstractDataset):
+    """
+    Dataset class that loads a json file and, on save, writes per-sample metadata
+    back to the same path.
+    """
+    def __init__(self, filepath: str, sample_key: str):
+        """
+        Args:
+            filepath: path of the json file that is read by `_load` and written by `_save`.
+            sample_key: key under which the samples are stored in the dict given to `_save`.
+        """
+        self._filepath = filepath
+        self._sample_key = sample_key
+        # get logger for reporting
+        self.logger = logging.getLogger(__name__)
+    def get_dasgoclient_metadata(self, das_name: dict, config: dict) -> dict:
+        """
+        Get metadata from DAS for a given sample.
+        The DAS query is not implemented yet (see the commented-out sketch below);
+        a fixed placeholder is returned and `config` is not used.
+        Args:
+            das_name: value of the sample's "DAS" entry.
+                NOTE(review): annotated as dict, but the sketch below formats it
+                into a query string - confirm the intended type.
+            config: metadata configuration (currently unused).
+        Returns:
+            Metadata dict for the sample.
+        """
+        self.logger.info(f"Fetching DAS metadata for dataset: {das_name}")
+        # # Use sys to run the command and keep the output as a dict
+        # cmnd = f'dasgoclient -query="dataset dataset={das_name}" -json'
+        # output = sys.command(cmnd)
+        # # Parse the output and extract relevant metadata
+        # if output:
+        #     das_json = json.loads(output)[0]
+        #     for k, v in config["metadata"].items():
+        #         if k in das_json:
+        #             for item in v:
+        #                 metadata[item] = das_json[k].get(item)
+        #         else:
+        #              self.logger.warning(f"{k} not found for dataset: {das_name}")
+        # else:
+        #     self.logger.warning("No metadata found.")
+        #     return {}
+        metadata = {"gridpack": "0.0.0"}
+        return metadata
+    def _load(self) -> dict:
+        """
+        Load a json file and return a python dict.
+        """
+        self.logger.info(f"Processing file: {self._filepath}")
+        with open(self._filepath, "r") as f:
+            data = json.load(f)
+        return data
+    def _save(self, samples: dict) -> None:
+        """
+        Get the meta data from all samples and store the result.
+        For every sample under `samples[sample_key]` the "path" glob pattern is
+        expanded and stored in the sample's "files" entry (the input dict is
+        modified in place). The collected metadata is written as json to `filepath`.
+        Args:
+            samples: mapping that contains the samples under `sample_key`; every
+                sample needs a "DAS" entry and should have a "path" entry.
+        """
+        metadata = {}
+        for sample_name, sample_info in samples[self._sample_key].items():
+            self.logger.info(f"Processing sample: {sample_name}")
+            # Get sample files
+            sample_path = sample_info.get("path")
+            # A missing or empty path means "no files" rather than a TypeError on len(None).
+            files = glob(sample_path) if sample_path else []
+            # Warn on the glob result: a non-empty pattern can still match nothing.
+            if not files:
+                self.logger.warning(f"No files found for sample {sample_name}.")
+            sample_info.update({"files": files})
+            self.logger.info(
+                f"Found {len(files)} files for sample {sample_name}."
+            )
+            # Get sample metadata
+            metadata[sample_name] = self.get_dasgoclient_metadata(
+                sample_info["DAS"], METADATA_CONFIG
+            )
+            # sample_info.update(metadata)
+        with open(self._filepath, "w") as f:
+            json.dump(metadata, f)
+    def _describe(self) -> dict:
+        """Return the attributes that describe this dataset instance."""
+        return {"filepath": self._filepath, "sample_key": self._sample_key}

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/{{ cookiecutter.python_package }}_dataset.py ADDED Viewed

@@ -0,0 +1,35 @@
+import pandas as pd
+from .base_dataset import BaseDataset
+class {{ cookiecutter.python_package }}Dataset(BaseDataset):
+    """
+    A custom dataset example: selects a few branches, applies one cut and
+    converts the result with `pd.DataFrame`.
+    """
+    def get_branches_to_keep(self) -> list[str]:
+        """
+        Define the branches you need.
+        Returns:
+            Branch names (the commented-out entries show that wildcards may be used).
+        """
+        return [
+            "PuppiMET_pt",
+            "CaloMET_pt",
+            "event",  # <-- we need this for meta data
+            # "Jet_pt",
+            # "Jet_eta",
+            # "Jet_phi",
+            # "Jet_btag*", # Use a wildcard to get all b-tagging info
+            "nJet",
+        ]
+    def get_cut(self) -> str | None:
+        """
+        Apply a pre-selection cut to keep only events with exactly 1 jet.
+        """
+        return "nJet == 1"
+    def convert_to_pandas(self, data: dict) -> pd.DataFrame:
+        """
+        Logic to convert from dict of (potentially nested) arrays to a pandas DataFrame.
+        This example passes `data` straight to the `pd.DataFrame` constructor.
+        """
+        return pd.DataFrame(data)

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/__init__.py ADDED Viewed

File without changes

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/base_model.py ADDED Viewed

@@ -0,0 +1,155 @@
+import inspect
+import pandas as pd
+from abc import ABC, abstractmethod
+from typing import Any
+from sklearn.base import BaseEstimator
+class BaseModel(ABC, BaseEstimator):
+    """
+    Standard wrapper for a model that exposes a scikit-learn style API.
+    Subclasses implement `build` and `train`; `predict` and `predict_proba`
+    delegate to `self.model`, which is expected to be set by then.
+    """
+    def __init__(self, name: str, hps: dict):
+        """
+        Args:
+            name: name of the model
+            hps: hyperparameters; handed to `train` by `fit`
+        """
+        self.name = name
+        # this will be overwritten after training
+        self.model = None
+        self.history = None
+        self.callbacks = []
+        self.hps = hps
+    @abstractmethod
+    def train(self, X: pd.DataFrame, y: pd.DataFrame, hps: dict, **kwargs):
+        """
+        User code function.
+        Args:
+            X: features
+            y: label
+            hps: hyperparameters
+            kwargs: anything else needed for training
+        """
+        pass
+    @abstractmethod
+    def build(self):
+        """
+        User code function to build the model.
+        """
+        pass
+    def predict(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """
+        Calculates predictions of the model
+        Args:
+            X: features (cast to float32 before being passed to the model)
+            kwargs: (optional in user code) anything else needed for predicting
+        Returns:
+            predictions
+        """
+        y_pred = self.model.predict(X.astype("float32"))
+        return pd.DataFrame(y_pred)
+    def predict_proba(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """
+        Calculates proba predictions of the model
+        Args:
+            X: features (cast to float32 before being passed to the model)
+            kwargs: (optional in user code) anything else needed for predicting
+        Returns:
+            predictions
+        """
+        y_pred = self.model.predict_proba(X.astype("float32"))
+        return pd.DataFrame(y_pred)
+    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> "BaseModel":
+        """
+        Same as train but get kwargs from __init__ for sklearn API
+        Args:
+            X: features
+            y: label
+        Returns:
+            self, as the scikit-learn estimator API expects from `fit`.
+        X can also contain optional inputs https://github.com/scikit-learn/scikit-learn/issues/2879.
+        Which should be specified in the user code.
+        For example when the train function needs additional inputs:
+        ```python
+            curX = X.copy()
+            kwargs = {"S": curX["S"]}
+            del curX["S"]
+            self.train(curX, y, self.hps, **kwargs)
+        ```
+        """
+        self.train(X, y, self.hps)
+        # scikit-learn utilities chain on the return value of fit().
+        return self
+    def get_params(self, deep=True):
+        """
+        Get parameters for self.model and self.
+        Args:
+            deep : bool, default=True
+                If True, will return the parameters for this estimator and
+                contained subobjects that are estimators.
+        Returns:
+            params : dict
+                Parameter names mapped to their values.
+        """
+        out = dict()
+        # if self.hps is set return them and not the default values
+        for key in self.hps:
+            out[key] = self.hps[key]
+        # constructor arguments are written last and win over an equally named hyperparameter
+        for key in get_param_names(self):
+            value = getattr(self, key)
+            if deep and hasattr(value, "get_params") and not isinstance(value, type):
+                deep_items = value.get_params().items()
+                out.update((key + "__" + k, val) for k, val in deep_items)
+            out[key] = value
+        return out
+    def set_params(self, **params):
+        """
+        Set the parameters of this estimator.
+        We overwrite the sklearn BaseEstimator and set params to self.hps
+        (the previous hyperparameters are replaced, not merged).
+        Args:
+            **params : dict
+                Estimator parameters.
+        Returns:
+            self : estimator instance
+                Estimator instance.
+        """
+        self.hps = params
+        return self
+def get_param_names(cls):
+    """
+    Get parameter names for the estimator
+    Args:
+        cls: an estimator class or an estimator instance
+    Returns:
+        Sorted names of the `__init__` parameters, excluding `self` and `**kwargs`;
+        an empty list when the class defines no constructor of its own.
+    Raises:
+        RuntimeError: if the constructor takes `*args`.
+    """
+    # An instance's `__init__` is a bound wrapper that is never identical to
+    # `object.__init__`, so check for a missing constructor on the class itself.
+    klass = cls if isinstance(cls, type) else type(cls)
+    if klass.__init__ is object.__init__:
+        # No explicit constructor to introspect
+        return []
+    # fetch the constructor or the original constructor before
+    # deprecation wrapping if any
+    init = getattr(cls.__init__, "deprecated_original", cls.__init__)
+    if init is object.__init__:
+        # No explicit constructor to introspect
+        return []
+    # introspect the constructor arguments to find the model parameters
+    # to represent
+    init_signature = inspect.signature(init)
+    # Consider the constructor parameters excluding 'self'
+    parameters = [
+        p
+        for p in init_signature.parameters.values()
+        if p.name != "self" and p.kind != p.VAR_KEYWORD
+    ]
+    for p in parameters:
+        if p.kind == p.VAR_POSITIONAL:
+            raise RuntimeError(
+                "scikit-learn estimators should always "
+                "specify their parameters in the signature"
+                " of their __init__ (no varargs)."
+                " %s with constructor %s doesn't "
+                " follow this convention." % (cls, init_signature)
+            )
+    # Extract and sort argument names excluding 'self'
+    return sorted(p.name for p in parameters)

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/{{ cookiecutter.python_package }}_model.py ADDED Viewed

@@ -0,0 +1,16 @@
+import pandas as pd
+from .base_model import BaseModel
+from sklearn.dummy import DummyClassifier
+class {{ cookiecutter.python_package }}(BaseModel):
+    """Example model that wraps scikit-learn's `DummyClassifier`."""
+    def train(self, X: pd.DataFrame, y: pd.DataFrame, hps: dict | None = None, **kwargs):
+        """
+        Build the model and fit it.
+        Args:
+            X: features
+            y: label
+            hps: hyperparameters. `BaseModel.fit` passes `self.hps` positionally,
+                so the parameter has to exist; this example does not use it.
+            kwargs: anything else needed for training (unused here)
+        """
+        self.build()
+        self.history = self.model.fit(X, y)
+    def build(self):
+        """Build the test Model.
+        self.hps:
+            -
+        """
+        self.model = DummyClassifier()

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""Project pipelines."""
+from __future__ import annotations
+from kedro.framework.project import find_pipelines
+from kedro.pipeline import Pipeline
+def register_pipelines() -> dict[str, Pipeline]:
+    """Collect the project's pipelines and add a combined default.
+    Returns:
+        Mapping of pipeline name to ``Pipeline``; the ``"__default__"`` entry
+        is the sum of all pipelines returned by ``find_pipelines``.
+    """
+    registry = find_pipelines()
+    combined = sum(registry.values())
+    registry["__default__"] = combined
+    return registry

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/compile/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""
+This is a boilerplate pipeline 'compile'
+generated using Kedro 1.0.0
+"""
+from .pipeline import create_pipeline
+__all__ = ["create_pipeline"]
+__version__ = "0.1"

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/compile/nodes.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""
+This is a boilerplate pipeline 'compile'
+generated using Kedro 1.0.0
+"""
+import logging
+import pandas as pd
+from triggerflow.core import TriggerModel
+from sklearn.metrics import roc_auc_score
+def compile_model(
+    model, X_test: pd.DataFrame, y_test: pd.DataFrame, config: dict
+) -> tuple[TriggerModel, list[float]]:
+    """Compiles the model and runs some further checks.
+    Args:
+        model: model handed to `TriggerModel`.
+        X_test: test features used for the predictions.
+        y_test: true labels the predictions are scored against.
+        config: must contain the keys "name", "ml_backend" and "compiler".
+    Returns:
+        The `TriggerModel` instance and the ROC AUC scores in the order
+        [software, firmware, qonnx].
+    """
+    # get logger for reporting
+    logger = logging.getLogger(__name__)
+    trigger_model = TriggerModel(
+        name=config["name"],
+        ml_backend=config["ml_backend"],
+        compiler=config["compiler"],
+        model=model,
+        # compiler_config or None
+        compiler_config=None,
+    )
+    # NOTE(review): calling the instance presumably runs the compilation - confirm
+    # against the TriggerModel API.
+    trigger_model()
+    # Run every prediction first, then score, then log (same order as before).
+    predictions = {
+        "Software": trigger_model.software_predict(X_test),
+        "Firmware": trigger_model.firmware_predict(X_test),
+        "QONNX": trigger_model.qonnx_predict(X_test),
+    }
+    aucs = {
+        label: roc_auc_score(y_test, output) for label, output in predictions.items()
+    }
+    for label, auc in aucs.items():
+        logger.info(f"Area under ROC curve {label}: {auc:.4f}")
+    return trigger_model, list(aucs.values())

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/compile/pipeline.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""
+This is a boilerplate pipeline 'compile'
+generated using Kedro 1.0.0
+"""
+from kedro.pipeline import node, Pipeline, pipeline  # noqa
+def create_pipeline(**kwargs) -> Pipeline:
+    """Return the 'compile' pipeline, which currently contains no nodes."""
+    nodes = []
+    return pipeline(nodes)

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""
+This is a boilerplate pipeline 'data_processing'
+generated using Kedro 1.0.0
+"""
+from .pipeline import create_pipeline
+__all__ = ["create_pipeline"]
+__version__ = "0.1"

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/nodes.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""
+This is a boilerplate pipeline 'data_processing'
+generated using Kedro 1.0.0
+"""
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+def data_processing(
+    data: pd.DataFrame, random_state: int, test_size: float
+) -> tuple[
+    pd.DataFrame,
+    pd.DataFrame,
+    pd.DataFrame,
+    pd.DataFrame,
+    StandardScaler,
+    pd.DataFrame,
+    pd.DataFrame,
+]:
+    """Preprocesses some data.
+    Splits the data into training and test sets and standardises the features.
+    The scaler is fitted on the training features only and then applied to both
+    sets, so no statistics of the test set leak into the preprocessing.
+    Args:
+        data: Raw data; must contain the columns "y" (target) and "event" (event id).
+            All remaining columns are used as features.
+        random_state: seed for the train/test split.
+        test_size: size of the test set, passed to `train_test_split`.
+    Returns:
+        X_train: scaled training features
+        X_test: scaled test features
+        y_train: training targets
+        y_test: test targets
+        scaler: the fitted `StandardScaler`
+        ids_train: event ids of the training rows
+        ids_test: event ids of the test rows
+    """
+    y = data["y"].to_frame()
+    event_ids = data["event"].to_frame()
+    X = data.drop(columns=["y", "event"])
+    # Split into training and test sets
+    X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
+        X, y, event_ids, test_size=test_size, random_state=random_state
+    )
+    # Normalize features: fit on the training set, reuse the statistics for the test set
+    scaler = StandardScaler()
+    X_train = pd.DataFrame(
+        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
+    )
+    X_test = pd.DataFrame(
+        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
+    )
+    return X_train, X_test, y_train, y_test, scaler, ids_train, ids_test

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/pipeline.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""
+This is a boilerplate pipeline 'data_processing'
+generated using Kedro 1.0.0
+"""
+from kedro.pipeline import node, Pipeline, pipeline  # noqa
+from .nodes import data_processing
+def create_pipeline(**kwargs) -> Pipeline:
+    """Return the 'data_processing' pipeline: a single node wrapping `data_processing`."""
+    node_inputs = [
+        "{{ cookiecutter.python_package }}_data_loaded",
+        "params:random_state",
+        "params:test_size",
+    ]
+    node_outputs = [
+        "processed_{{ cookiecutter.python_package }}_X_train",
+        "processed_{{ cookiecutter.python_package }}_X_test",
+        "processed_{{ cookiecutter.python_package }}_y_train",
+        "processed_{{ cookiecutter.python_package }}_y_test",
+        "scaler",
+        "event_ids_train",
+        "event_ids_test",
+    ]
+    processing_node = node(
+        func=data_processing,
+        inputs=node_inputs,
+        outputs=node_outputs,
+        name="data_processing_{{ cookiecutter.python_package }}_node",
+    )
+    return pipeline([processing_node])

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""
+This is a boilerplate pipeline 'load_data'
+generated using Kedro 1.0.0
+"""
+from .pipeline import create_pipeline
+__all__ = ["create_pipeline"]
+__version__ = "0.1"

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/nodes.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""
+This is a boilerplate pipeline 'load_data'
+generated using Kedro 1.0.0
+"""
+import logging
+import pandas as pd
+from glob import glob
+def load_data({{ cookiecutter.python_package }}_data: pd.DataFrame, meta_data: dict) -> tuple[pd.DataFrame, dict]:
+    """Pass the loaded data and its meta data through unchanged.
+    Args:
+        {{ cookiecutter.python_package }}_data: the loaded data.
+        meta_data: the loaded meta data.
+    Returns:
+        The data and the meta data, in that order.
+    """
+    return {{ cookiecutter.python_package }}_data, meta_data

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/pipeline.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""
+This is a boilerplate pipeline 'load_data'
+generated using Kedro 1.0.0
+"""
+from kedro.pipeline import node, Pipeline, pipeline  # noqa
+from .nodes import load_data
+def create_pipeline(**kwargs) -> Pipeline:
+    """Return the 'load_data' pipeline: a single node wrapping `load_data`."""
+    node_inputs = [
+        "{{ cookiecutter.python_package }}_data",
+        "{{ cookiecutter.python_package }}_meta_data",
+    ]
+    node_outputs = [
+        "{{ cookiecutter.python_package }}_data_loaded",
+        "{{ cookiecutter.python_package }}_meta_data_loaded",
+    ]
+    load_node = node(
+        func=load_data,
+        inputs=node_inputs,
+        outputs=node_outputs,
+        name="load_data",
+    )
+    return pipeline([load_node])

triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_training/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""
+This is a boilerplate pipeline 'model_training'
+generated using Kedro 1.0.0
+"""
+from .pipeline import create_pipeline
+__all__ = ["create_pipeline"]
+__version__ = "0.1"

triggerflow 0.1.12__py3-none-any.whl → 0.2.1__py3-none-any.whl

triggerflow 0.1.12__py3-none-any.whl → 0.2.1__py3-none-any.whl