triggerflow 0.1.12__py3-none-any.whl → 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trigger_dataset/__init__.py +0 -0
- trigger_dataset/core.py +88 -0
- trigger_loader/__init__.py +0 -0
- trigger_loader/cluster_manager.py +107 -0
- trigger_loader/loader.py +95 -0
- trigger_loader/processor.py +211 -0
- triggerflow/cli.py +122 -0
- triggerflow/core.py +118 -114
- triggerflow/mlflow_wrapper.py +54 -49
- triggerflow/starter/.gitignore +143 -0
- triggerflow/starter/README.md +0 -0
- triggerflow/starter/cookiecutter.json +5 -0
- triggerflow/starter/prompts.yml +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/.dvcignore +3 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/.gitignore +143 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/.gitlab-ci.yml +56 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/README.md +29 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/README.md +26 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +84 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/base/parameters.yml +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/base/parameters_compile.yml +14 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/base/parameters_data_processing.yml +8 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/base/parameters_load_data.yml +5 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/base/parameters_model_training.yml +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/base/parameters_model_validation.yml +5 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/local/catalog.yml +84 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/local/parameters.yml +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/local/parameters_compile.yml +14 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/local/parameters_data_processing.yml +8 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/local/parameters_load_data.yml +5 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/local/parameters_model_training.yml +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/local/parameters_model_validation.yml +5 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/conf/logging.yml +43 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/01_raw/.gitkeep +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/01_raw/samples.json +15 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/01_raw/samples_dummy.json +26 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/02_loaded/.gitkeep +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/03_preprocessed/.gitkeep +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/04_models/.gitkeep +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/05_validation/.gitkeep +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/06_compile/.gitkeep +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/data/07_reporting/.gitkeep +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/dvc.yaml +7 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/environment.yml +21 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/pyproject.toml +50 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py +3 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py +25 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/any_object.py +20 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/base_dataset.py +137 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/meta_dataset.py +88 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/{{ cookiecutter.python_package }}_dataset.py +35 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/base_model.py +155 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/{{ cookiecutter.python_package }}_model.py +16 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py +17 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/compile/__init__.py +10 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/compile/nodes.py +50 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/compile/pipeline.py +10 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/__init__.py +10 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/nodes.py +40 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/pipeline.py +28 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/__init__.py +10 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/nodes.py +12 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/pipeline.py +20 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_training/__init__.py +10 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_training/nodes.py +31 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_training/pipeline.py +24 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_validation/__init__.py +10 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_validation/nodes.py +29 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_validation/pipeline.py +24 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py +46 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/utils/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/utils/metric.py +4 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/utils/plotting.py +598 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/compile/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/compile/test_pipeline.py +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/data_processing/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/data_processing/test_pipeline.py +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/load_data/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/load_data/test_pipeline.py +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/model_training/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/model_training/test_pipeline.py +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/model_validation/__init__.py +0 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/pipelines/model_validation/test_pipeline.py +9 -0
- triggerflow/starter/{{ cookiecutter.repo_name }}/tests/test_run.py +27 -0
- triggerflow-0.2.dist-info/METADATA +97 -0
- triggerflow-0.2.dist-info/RECORD +97 -0
- triggerflow-0.2.dist-info/entry_points.txt +2 -0
- triggerflow-0.2.dist-info/top_level.txt +3 -0
- triggerflow-0.1.12.dist-info/METADATA +0 -61
- triggerflow-0.1.12.dist-info/RECORD +0 -11
- triggerflow-0.1.12.dist-info/top_level.txt +0 -1
- {triggerflow-0.1.12.dist-info → triggerflow-0.2.dist-info}/WHEEL +0 -0
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/meta_dataset.py (new file, 88 lines)

```python
import logging, json
from glob import glob
from kedro.io import AbstractDataset


METADATA_CONFIG = {"x": 0}


class MetaDataset(AbstractDataset):
    """
    Dataset class to load a JSON file.
    """

    def __init__(self, filepath: str, sample_key: str):
        self._filepath = filepath
        self._sample_key = sample_key
        # get logger for reporting
        self.logger = logging.getLogger(__name__)

    def get_dasgoclient_metadata(self, das_name: dict, config: dict) -> dict:
        """
        Get metadata from DAS for a given sample.
        """

        self.logger.info(f"Fetching DAS metadata for dataset: {das_name}")

        # # Use sys to run the command and keep the output as a dict
        # cmnd = f'dasgoclient -query="dataset dataset={das_name}" -json'
        # output = sys.command(cmnd)

        # # Parse the output and extract relevant metadata
        # if output:
        #     das_json = json.loads(output)[0]
        #     for k, v in config["metadata"].items():
        #         if k in das_json:
        #             for item in v:
        #                 metadata[item] = das_json[k].get(item)
        #         else:
        #             self.logger.warning(f"{k} not found for dataset: {das_name}")
        # else:
        #     self.logger.warning("No metadata found.")
        #     return {}

        metadata = {"gridpack": "0.0.0"}

        return metadata

    def _load(self) -> dict:
        """
        Load a JSON file and return a Python dict.
        """

        self.logger.info(f"Processing file: {self._filepath}")

        with open(self._filepath, "r") as f:
            data = json.load(f)

        return data

    def _save(self, samples: dict) -> dict:
        """
        Get the metadata from all samples and store the result.
        """

        metadata = {}
        for sample_name, sample_info in samples[self._sample_key].items():
            self.logger.info(f"Processing sample: {sample_name}")

            # Get sample files
            sample_path = sample_info.get("path")
            if len(sample_path) == 0:
                self.logger.warning(f"No files found for sample {sample_name}.")
            sample_info.update({"files": glob(sample_path)})
            self.logger.info(
                f"Found {len(sample_info.get('files', []))} files for sample {sample_name}."
            )

            # Get sample metadata
            metadata[sample_name] = self.get_dasgoclient_metadata(
                sample_info["DAS"], METADATA_CONFIG
            )
            # sample_info.update(metadata)

        with open(self._filepath, "w") as f:
            json.dump(metadata, f)

    def _describe(self) -> dict:
        return {"filepath": self._filepath, "sample_key": self._sample_key}
```
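Since `MetaDataset` implements Kedro's `AbstractDataset` interface, it can also be exercised outside a catalog. A minimal sketch, assuming a rendered package name of `my_package` and a made-up sample layout and DAS name (none of these values come from the diff; the real schema ships in `data/01_raw/samples.json`):

```python
from my_package.datasets.meta_dataset import MetaDataset  # assumed rendered import path

# Hypothetical sample registry; the real layout lives in data/01_raw/samples.json.
samples = {
    "mc": {  # sample_key, assumed name
        "DYJetsToLL": {
            "path": "data/01_raw/*.json",  # glob pattern expanded by _save
            "DAS": "/DYJetsToLL/NanoAODv9/NANOAODSIM",  # placeholder DAS name
        }
    }
}

ds = MetaDataset(filepath="data/02_loaded/metadata.json", sample_key="mc")
ds.save(samples)       # resolves the file globs, fetches (stubbed) DAS metadata, writes JSON
metadata = ds.load()   # reads the JSON back as a plain dict
```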
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/datasets/{{ cookiecutter.python_package }}_dataset.py (new file, 35 lines)

```python
import pandas as pd
from .base_dataset import BaseDataset


class {{ cookiecutter.python_package }}Dataset(BaseDataset):
    """
    A custom dataset example.
    """

    def get_branches_to_keep(self) -> list[str]:
        """
        Define the branches you need.
        """
        return [
            "PuppiMET_pt",
            "CaloMET_pt",
            "event",  # <-- we need this for metadata
            # "Jet_pt",
            # "Jet_eta",
            # "Jet_phi",
            # "Jet_btag*",  # Use a wildcard to get all b-tagging info
            "nJet",
        ]

    def get_cut(self) -> str | None:
        """
        Apply a pre-selection cut to keep only events with exactly 1 jet.
        """
        return "nJet == 1"

    def convert_to_pandas(self, data: dict):
        """
        Logic to convert from a dict of (potentially nested) arrays to a pandas DataFrame.
        """
        return pd.DataFrame(data)
```
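These three overrides appear to be the template's customization surface (the 137-line `base_dataset.py` providing the rest is not shown in this excerpt). A sketch of a variant that keeps jet kinematics instead, with illustrative branch names and a two-jet cut that are not taken from the diff:

```python
import pandas as pd
from .base_dataset import BaseDataset


class DijetDataset(BaseDataset):  # hypothetical subclass, for illustration only
    def get_branches_to_keep(self) -> list[str]:
        return ["Jet_pt", "Jet_eta", "nJet", "event"]

    def get_cut(self) -> str | None:
        return "nJet == 2"  # keep events with exactly two jets

    def convert_to_pandas(self, data: dict):
        return pd.DataFrame(data)
```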
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/__init__.py (file without changes)
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/base_model.py (new file, 155 lines)

````python
import inspect
import pandas as pd
from abc import ABC, abstractmethod
from typing import Any
from sklearn.base import BaseEstimator


class BaseModel(ABC, BaseEstimator):
    """
    Standard wrapper for a model.
    """

    def __init__(self, name: str, hps: dict):
        self.name = name
        # this will be overwritten after training
        self.model = None
        self.history = None
        self.callbacks = []
        self.hps = hps

    @abstractmethod
    def train(self, X: pd.DataFrame, y: pd.DataFrame, hps: dict, **kwargs):
        """
        User code function.
        Args:
            X: features
            y: label
            hps: hyperparameters
            kwargs: anything else needed for training
        """
        pass

    @abstractmethod
    def build(self):
        """
        User code function to build the model.
        """
        pass

    def predict(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """
        Calculates predictions of the model.
        Args:
            X: features

        Returns:
            predictions
        (optional in user code) kwargs: anything else needed for predicting
        """
        y_pred = self.model.predict(X.astype("float32"))
        return pd.DataFrame(y_pred)

    def predict_proba(self, X: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """
        Calculates proba predictions of the model.
        Args:
            X: features

        Returns:
            predictions
        (optional in user code) kwargs: anything else needed for predicting
        """
        y_pred = self.model.predict_proba(X.astype("float32"))
        return pd.DataFrame(y_pred)

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        """
        Same as train, but gets kwargs from __init__ for the sklearn API.
        Args:
            X: features
            y: label

        X can also contain optional inputs (https://github.com/scikit-learn/scikit-learn/issues/2879),
        which should be specified in the user code.
        For example, when the train function needs additional inputs:
        ```python
        curX = X.copy()
        kwargs = {"S": curX["S"]}
        del curX["S"]
        self.train(curX, y, self.hps, **kwargs)
        ```
        """
        self.train(X, y, self.hps)

    def get_params(self, deep=True):
        """
        Get parameters for self.model and self.
        Args:
            deep : bool, default=True
                If True, will return the parameters for this estimator and
                contained subobjects that are estimators.

        Returns:
            params : dict
                Parameter names mapped to their values.
        """
        out = dict()
        # if self.hps is set, return those values and not the defaults
        for key in self.hps:
            out[key] = self.hps[key]
        for key in get_param_names(self):
            value = getattr(self, key)
            if deep and hasattr(value, "get_params") and not isinstance(value, type):
                deep_items = value.get_params().items()
                out.update((key + "__" + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        """
        Set the parameters of this estimator.

        We overwrite the sklearn BaseEstimator and set params to self.hps.
        Args:
            **params : dict
                Estimator parameters.

        Returns:
            self : estimator instance
                Estimator instance.
        """
        self.hps = params

        return self


def get_param_names(cls):
    """Get parameter names for the estimator"""
    # fetch the constructor or the original constructor before
    # deprecation wrapping if any
    init = getattr(cls.__init__, "deprecated_original", cls.__init__)
    if init is object.__init__:
        # No explicit constructor to introspect
        return []

    # introspect the constructor arguments to find the model parameters
    # to represent
    init_signature = inspect.signature(init)
    # Consider the constructor parameters excluding 'self'
    parameters = [
        p
        for p in init_signature.parameters.values()
        if p.name != "self" and p.kind != p.VAR_KEYWORD
    ]
    for p in parameters:
        if p.kind == p.VAR_POSITIONAL:
            raise RuntimeError(
                "scikit-learn estimators should always "
                "specify their parameters in the signature"
                " of their __init__ (no varargs)."
                " %s with constructor %s doesn't "
                " follow this convention." % (cls, init_signature)
            )
    # Extract and sort argument names excluding 'self'
    return sorted([p.name for p in parameters])
````
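Because `get_params` merges `self.hps` with the introspected constructor arguments and `set_params` writes back to `self.hps`, a `BaseModel` subclass exposes its hyperparameters through the standard sklearn accessors. A minimal sketch, with a hypothetical subclass and hyperparameter that are not part of the diff:

```python
import pandas as pd
from sklearn.dummy import DummyClassifier
from my_package.models.base_model import BaseModel  # assumed rendered import path


class TinyModel(BaseModel):  # hypothetical subclass for illustration
    def build(self):
        self.model = DummyClassifier(strategy=self.hps.get("strategy", "prior"))

    def train(self, X, y, hps, **kwargs):
        self.build()
        self.history = self.model.fit(X, y)


m = TinyModel(name="tiny", hps={"strategy": "most_frequent"})
print(m.get_params()["strategy"])  # "most_frequent", taken from hps, not defaults
m.set_params(strategy="uniform")   # replaces self.hps wholesale

X = pd.DataFrame({"a": [0.0, 1.0]})
y = pd.DataFrame({"y": [0, 1]})
m.fit(X, y)  # sklearn-style entry point delegating to train()
```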
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/models/{{ cookiecutter.python_package }}_model.py (new file, 16 lines)

```python
import pandas as pd
from .base_model import BaseModel
from sklearn.dummy import DummyClassifier


class {{ cookiecutter.python_package }}(BaseModel):
    def train(self, X: pd.DataFrame, y: pd.DataFrame, **kwargs):
        self.build()
        self.history = self.model.fit(X, y)

    def build(self):
        """Build the test Model.
        self.hps:
            -
        """
        self.model = DummyClassifier()
```
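As in the `model_training` node further down, the template model is driven by calling `train` directly. A quick sketch with toy data; the frame contents and the rendered package name `my_package` are assumptions:

```python
import pandas as pd
# Assuming the starter was rendered with python_package = "my_package",
# the class below is the rendered name of `{{ cookiecutter.python_package }}`.
from my_package.models.my_package_model import my_package

X = pd.DataFrame({"PuppiMET_pt": [10.0, 50.0], "CaloMET_pt": [12.0, 40.0]})
y = pd.DataFrame({"y": [0, 1]})

model = my_package(name="dummy", hps={})
model.train(X, y)
print(model.predict_proba(X))  # class probabilities from the underlying DummyClassifier
```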
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py (new file, 17 lines)

```python
"""Project pipelines."""

from __future__ import annotations

from kedro.framework.project import find_pipelines
from kedro.pipeline import Pipeline


def register_pipelines() -> dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from pipeline names to ``Pipeline`` objects.
    """
    pipelines = find_pipelines()
    pipelines["__default__"] = sum(pipelines.values())
    return pipelines
```
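`find_pipelines` discovers each `create_pipeline` under `pipelines/`, and `sum(pipelines.values())` works because Kedro's `Pipeline` supports `+` (including `sum`'s integer start value), so `__default__` is the union of all five pipelines. Individual pipelines stay addressable by name; a sketch of a programmatic run, assuming a rendered project and noting that the exact `KedroSession` signature varies slightly across Kedro versions:

```python
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

bootstrap_project(".")  # project root of the rendered starter
with KedroSession.create(project_path=".") as session:
    session.run(pipeline_name="model_training")  # omit the name to run "__default__"
```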
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/compile/nodes.py (new file, 50 lines)

```python
"""
This is a boilerplate pipeline 'compile'
generated using Kedro 1.0.0
"""

import logging
import pandas as pd
from triggerflow.core import TriggerModel
from sklearn.metrics import roc_auc_score


def compile_model(
    model, X_test: pd.DataFrame, y_test: pd.DataFrame, config: dict
) -> pd.DataFrame:
    """Compiles the model and runs some further checks.

    Args:
        model:
        X_test:
        y_test:
        config:
    Returns:
        Model prediction.
    """
    # get logger for reporting
    logger = logging.getLogger(__name__)

    triggerflow = TriggerModel(
        name=config["name"],
        ml_backend=config["ml_backend"],
        compiler=config["compiler"],
        model=model,
        # compiler_config or None
        compiler_config=None,
    )
    triggerflow()

    output_software = triggerflow.software_predict(X_test)
    output_firmware = triggerflow.firmware_predict(X_test)
    output_qonnx = triggerflow.qonnx_predict(X_test)

    auc_software = roc_auc_score(y_test, output_software)
    auc_firmware = roc_auc_score(y_test, output_firmware)
    auc_qonnx = roc_auc_score(y_test, output_qonnx)

    logger.info(f"Area under ROC curve Software: {auc_software:.4f}")
    logger.info(f"Area under ROC curve Firmware: {auc_firmware:.4f}")
    logger.info(f"Area under ROC curve QONNX: {auc_qonnx:.4f}")

    return triggerflow, [auc_software, auc_firmware, auc_qonnx]
```
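The node's `config` dict is consumed verbatim by the `TriggerModel` constructor, so its expected keys can be read off the call above. A hedged sketch of a matching entry; the values are placeholders, and the shipped defaults live in `conf/base/parameters_compile.yml`:

```python
# Illustrative `config` matching the keys compile_model reads; the values
# below are placeholders, not taken from the diff.
config = {
    "name": "my_trigger_model",
    "ml_backend": "sklearn",  # placeholder backend label
    "compiler": "conifer",    # placeholder compiler label
}
```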
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/nodes.py (new file, 40 lines)

```python
"""
This is a boilerplate pipeline 'data_processing'
generated using Kedro 1.0.0
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def data_processing(
    data: pd.DataFrame, random_state: int, test_size: float
) -> pd.DataFrame:
    """Preprocesses some data.

    Args:
        data: Raw data.
        random_state:
    Returns:
        X_train:
        X_test:
        y_train:
        y_test:
        event_ids:
        scaler:
    """
    y = data["y"].to_frame()
    event_ids = data["event"].to_frame()
    X = data.drop(columns=["y", "event"])

    # Normalize features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    # Split into training and test sets
    X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
        X_scaled, y, event_ids, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, scaler, ids_train, ids_test
```
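The node returns the fitted `StandardScaler` alongside the splits so the identical normalization can be replayed at inference time. A small check of the contract on a toy frame (the frame itself is made up; the column names mirror the example dataset's branches):

```python
import pandas as pd

df = pd.DataFrame({
    "PuppiMET_pt": [10.0, 20.0, 30.0, 40.0],
    "CaloMET_pt": [11.0, 19.0, 33.0, 38.0],
    "y": [0, 1, 0, 1],
    "event": [101, 102, 103, 104],
})
X_train, X_test, y_train, y_test, scaler, ids_train, ids_test = data_processing(
    df, random_state=42, test_size=0.25
)
assert len(X_test) == 1  # 25% of four rows
new_X = scaler.transform(df.drop(columns=["y", "event"]))  # replay the fit on new data
```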
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_processing/pipeline.py (new file, 28 lines)

```python
"""
This is a boilerplate pipeline 'data_processing'
generated using Kedro 1.0.0
"""

from kedro.pipeline import node, Pipeline, pipeline  # noqa
from .nodes import data_processing


def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [
            node(
                func=data_processing,
                inputs=["{{ cookiecutter.python_package }}_data_loaded", "params:random_state", "params:test_size"],
                outputs=[
                    "processed_{{ cookiecutter.python_package }}_X_train",
                    "processed_{{ cookiecutter.python_package }}_X_test",
                    "processed_{{ cookiecutter.python_package }}_y_train",
                    "processed_{{ cookiecutter.python_package }}_y_test",
                    "scaler",
                    "event_ids_train",
                    "event_ids_test",
                ],
                name="data_processing_{{ cookiecutter.python_package }}_node",
            )
        ]
    )
```
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/nodes.py (new file, 12 lines)

```python
"""
This is a boilerplate pipeline 'load_data'
generated using Kedro 1.0.0
"""

import logging
import pandas as pd
from glob import glob


def load_data({{ cookiecutter.python_package }}_data: pd.DataFrame, meta_data: dict) -> tuple[pd.DataFrame, dict]:
    return {{ cookiecutter.python_package }}_data, meta_data
```
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/load_data/pipeline.py (new file, 20 lines)

```python
"""
This is a boilerplate pipeline 'load_data'
generated using Kedro 1.0.0
"""

from kedro.pipeline import node, Pipeline, pipeline  # noqa
from .nodes import load_data


def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [
            node(
                func=load_data,
                inputs=["{{ cookiecutter.python_package }}_data", "{{ cookiecutter.python_package }}_meta_data"],
                outputs=["{{ cookiecutter.python_package }}_data_loaded", "{{ cookiecutter.python_package }}_meta_data_loaded"],
                name="load_data",
            )
        ]
    )
```
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_training/nodes.py (new file, 31 lines)

```python
"""
This is a boilerplate pipeline 'model_training'
generated using Kedro 1.0.0
"""

import pandas as pd
from {{ cookiecutter.python_package }}.utils.plotting import plotTrainingHistory, get_dummy
from {{ cookiecutter.python_package }}.models.{{ cookiecutter.python_package }}_model import {{ cookiecutter.python_package }}


def train_model(
    X_train: pd.DataFrame, y_train: pd.DataFrame, params: dict
) -> pd.DataFrame:
    """Trains a dummy model on some data.

    Args:
        X_train:
        y_train:
        params:
    Returns:
        Trained model.
    """
    params["hps"]["nInputs"] = X_train.shape[-1]
    model = {{ cookiecutter.python_package }}(name=params["hps"]["name"], hps=params["hps"])
    model.train(X_train, y_train)

    f, _ = get_dummy()
    # NOTE: one can also plot the history
    # f, _ = plotTrainingHistory(model.history)

    return model, f
```
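`train_model` expects its `params` dict to carry an `hps` mapping with at least a `name` entry; `nInputs` is injected from the width of the training features. A hedged sketch of the corresponding parameters entry (the value is a placeholder; the shipped defaults live in `conf/base/parameters_model_training.yml`):

```python
# Illustrative params for train_model; real defaults live in
# conf/base/parameters_model_training.yml of the rendered project.
params = {
    "hps": {
        "name": "dummy_model",  # used as the model's name
        # "nInputs" is filled in by train_model from X_train.shape[-1]
    }
}
```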
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_training/pipeline.py (new file, 24 lines)

```python
"""
This is a boilerplate pipeline 'model_training'
generated using Kedro 1.0.0
"""

from kedro.pipeline import node, Pipeline, pipeline  # noqa
from .nodes import train_model


def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [
            node(
                func=train_model,
                inputs=[
                    "processed_{{ cookiecutter.python_package }}_X_train",
                    "processed_{{ cookiecutter.python_package }}_y_train",
                    "params:{{ cookiecutter.python_package }}_model",
                ],
                outputs=["train_model", "training_history"],
                name="train_model_node",
            )
        ]
    )
```
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_validation/nodes.py (new file, 29 lines)

```python
"""
This is a boilerplate pipeline 'model_validation'
generated using Kedro 1.0.0
"""

import logging
import pandas as pd
from sklearn.metrics import roc_auc_score


def validated_model(model, X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame:
    """Validates the trained model on held-out data.

    Args:
        model:
        X_test:
        y_test:
    Returns:
        Model prediction.
    """
    # get logger for reporting
    logger = logging.getLogger(__name__)

    pred = model.predict(X_test)
    auc = roc_auc_score(y_test, pred)

    logger.info(f"Area under ROC curve: {auc:.4f}")

    return pred
```
triggerflow/starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/model_validation/pipeline.py (new file, 24 lines)

```python
"""
This is a boilerplate pipeline 'model_validation'
generated using Kedro 1.0.0
"""

from kedro.pipeline import node, Pipeline, pipeline  # noqa
from .nodes import validated_model


def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [
            node(
                func=validated_model,
                inputs=[
                    "train_model",
                    "processed_{{ cookiecutter.python_package }}_X_test",
                    "processed_{{ cookiecutter.python_package }}_y_test",
                ],
                outputs="model_pred",
                name="validated_model_node",
            )
        ]
    )
```
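Taken together, the starter's pipelines form a linear dataflow: `load_data` materializes the raw frame and metadata, `data_processing` scales and splits, `model_training` fits the dummy model, `model_validation` scores it, and `compile` hands the result to `TriggerModel`. The same flow can be traced in plain Python, bypassing the catalog; a condensed sketch, again assuming a toy frame and a rendered package name of `my_package`:

```python
import pandas as pd
from my_package.pipelines.data_processing.nodes import data_processing
from my_package.pipelines.model_training.nodes import train_model

df = pd.DataFrame({
    "PuppiMET_pt": [10.0, 20.0, 30.0, 40.0],
    "CaloMET_pt": [11.0, 19.0, 33.0, 38.0],
    "y": [0, 1, 0, 1],
    "event": [101, 102, 103, 104],
})
X_train, X_test, y_train, y_test, scaler, ids_train, ids_test = data_processing(
    df, random_state=42, test_size=0.25
)
model, fig = train_model(X_train, y_train, {"hps": {"name": "dummy"}})
pred = model.predict(X_test)  # validated_model additionally computes the ROC AUC,
                              # which requires both classes present in y_test
```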