expops-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- expops-0.1.3.dist-info/METADATA +826 -0
- expops-0.1.3.dist-info/RECORD +86 -0
- expops-0.1.3.dist-info/WHEEL +5 -0
- expops-0.1.3.dist-info/entry_points.txt +3 -0
- expops-0.1.3.dist-info/licenses/LICENSE +674 -0
- expops-0.1.3.dist-info/top_level.txt +1 -0
- mlops/__init__.py +0 -0
- mlops/__main__.py +11 -0
- mlops/_version.py +34 -0
- mlops/adapters/__init__.py +12 -0
- mlops/adapters/base.py +86 -0
- mlops/adapters/config_schema.py +89 -0
- mlops/adapters/custom/__init__.py +3 -0
- mlops/adapters/custom/custom_adapter.py +447 -0
- mlops/adapters/plugin_manager.py +113 -0
- mlops/adapters/sklearn/__init__.py +3 -0
- mlops/adapters/sklearn/adapter.py +94 -0
- mlops/cluster/__init__.py +3 -0
- mlops/cluster/controller.py +496 -0
- mlops/cluster/process_runner.py +91 -0
- mlops/cluster/providers.py +258 -0
- mlops/core/__init__.py +95 -0
- mlops/core/custom_model_base.py +38 -0
- mlops/core/dask_networkx_executor.py +1265 -0
- mlops/core/executor_worker.py +1239 -0
- mlops/core/experiment_tracker.py +81 -0
- mlops/core/graph_types.py +64 -0
- mlops/core/networkx_parser.py +135 -0
- mlops/core/payload_spill.py +278 -0
- mlops/core/pipeline_utils.py +162 -0
- mlops/core/process_hashing.py +216 -0
- mlops/core/step_state_manager.py +1298 -0
- mlops/core/step_system.py +956 -0
- mlops/core/workspace.py +99 -0
- mlops/environment/__init__.py +10 -0
- mlops/environment/base.py +43 -0
- mlops/environment/conda_manager.py +307 -0
- mlops/environment/factory.py +70 -0
- mlops/environment/pyenv_manager.py +146 -0
- mlops/environment/setup_env.py +31 -0
- mlops/environment/system_manager.py +66 -0
- mlops/environment/utils.py +105 -0
- mlops/environment/venv_manager.py +134 -0
- mlops/main.py +527 -0
- mlops/managers/project_manager.py +400 -0
- mlops/managers/reproducibility_manager.py +575 -0
- mlops/platform.py +996 -0
- mlops/reporting/__init__.py +16 -0
- mlops/reporting/context.py +187 -0
- mlops/reporting/entrypoint.py +292 -0
- mlops/reporting/kv_utils.py +77 -0
- mlops/reporting/registry.py +50 -0
- mlops/runtime/__init__.py +9 -0
- mlops/runtime/context.py +34 -0
- mlops/runtime/env_export.py +113 -0
- mlops/storage/__init__.py +12 -0
- mlops/storage/adapters/__init__.py +9 -0
- mlops/storage/adapters/gcp_kv_store.py +778 -0
- mlops/storage/adapters/gcs_object_store.py +96 -0
- mlops/storage/adapters/memory_store.py +240 -0
- mlops/storage/adapters/redis_store.py +438 -0
- mlops/storage/factory.py +199 -0
- mlops/storage/interfaces/__init__.py +6 -0
- mlops/storage/interfaces/kv_store.py +118 -0
- mlops/storage/path_utils.py +38 -0
- mlops/templates/premier-league/charts/plot_metrics.js +70 -0
- mlops/templates/premier-league/charts/plot_metrics.py +145 -0
- mlops/templates/premier-league/charts/requirements.txt +6 -0
- mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
- mlops/templates/premier-league/configs/project_config.yaml +207 -0
- mlops/templates/premier-league/data/England CSV.csv +12154 -0
- mlops/templates/premier-league/models/premier_league_model.py +638 -0
- mlops/templates/premier-league/requirements.txt +8 -0
- mlops/templates/sklearn-basic/README.md +22 -0
- mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
- mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
- mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
- mlops/templates/sklearn-basic/data/train.csv +14 -0
- mlops/templates/sklearn-basic/models/model.py +62 -0
- mlops/templates/sklearn-basic/requirements.txt +10 -0
- mlops/web/__init__.py +3 -0
- mlops/web/server.py +585 -0
- mlops/web/ui/index.html +52 -0
- mlops/web/ui/mlops-charts.js +357 -0
- mlops/web/ui/script.js +1244 -0
- mlops/web/ui/styles.css +248 -0
mlops/templates/premier-league/models/premier_league_model.py (new file)

@@ -0,0 +1,638 @@

```python
from __future__ import annotations

import sys
from pathlib import Path
from typing import Any, Dict, List

import logging
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score

sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "src"))

from mlops.core import (
    step, process, SerializableData, log_metric
)

logger = logging.getLogger(__name__)


def _csv_path() -> Path:
    # Resolve the dataset relative to this file so the template stays portable.
    return Path(__file__).parent.parent / "data" / "England CSV.csv"


def _get_result_column_name(df: pd.DataFrame) -> str:
    if 'FT Result' in df.columns:
        return 'FT Result'
    if 'FTR' in df.columns:
        return 'FTR'
    raise ValueError("Missing required result column: expected 'FT Result' or 'FTR'")


def _derive_outcome_labels(df: pd.DataFrame) -> np.ndarray:
    result_col = _get_result_column_name(df)
    mapping = {'H': 0, 'D': 1, 'A': 2}
    y = df[result_col].astype(str).map(mapping)
    if y.isnull().any():
        bad = df.loc[y.isnull(), result_col].unique().tolist()
        raise ValueError(f"Unexpected values in {result_col}: {bad}")
    return y.astype(int).to_numpy()


def _get_cat_num_cols(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    cat_cols = [c for c in ['Season', 'HomeTeam', 'AwayTeam', 'Referee', 'League'] if c in df.columns]
    num_cols = [
        c for c in [
            'HTH Goals', 'HTA Goals', 'H Shots', 'A Shots', 'H SOT', 'A SOT',
            'H Fouls', 'A Fouls', 'H Corners', 'A Corners', 'H Yellow', 'A Yellow',
            'H Red', 'A Red', 'Display_Order', 'DayOfWeek', 'Month'
        ] if c in df.columns
    ]
    return cat_cols, num_cols


def _build_features_dataframe(df: pd.DataFrame, cat_cols: list[str], num_cols: list[str]) -> pd.DataFrame:
    X_df = pd.DataFrame(index=df.index)
    # Numeric: coerce to float, median-impute (fall back to 0 for all-NaN columns)
    for c in num_cols:
        s = pd.to_numeric(df[c], errors='coerce')
        if s.isnull().any():
            med = s.median()
            s = s.fillna(med if not np.isnan(med) else 0)
        X_df[c] = s.astype(float)
    # Categorical: keep as strings for one-hot encoding
    for c in cat_cols:
        X_df[c] = df[c].astype(str)
    # Drop label and leakage columns if they slipped in
    for drop_c in ['FT Result', 'FTR', 'HT Result', 'Date']:
        if drop_c in X_df.columns:
            X_df = X_df.drop(columns=[drop_c])
    return X_df


@process()
def define_feature_engineering_generic_process(data, hyperparameters):
    """Load CSV, parse dates, derive labels (H/D/A), stratified split indices, and log analysis metrics."""

    @step()
    def load_csv():
        path = _csv_path()
        if not path.exists():
            raise FileNotFoundError(f"Premier League CSV not found at {path}")
        df = pd.read_csv(path)
        try:
            logger.info(f"[feature_engineering_generic.load_csv] Loaded df shape: {df.shape}")
        except Exception:
            pass
        return {'df': df.to_dict(orient='list')}

    @step()
    def derive_labels_and_indices(raw: SerializableData, hyperparameters: Dict[str, Any] | None = None):
        df = pd.DataFrame(raw['df'])
        # Parse date-based features
        if 'Date' in df.columns:
            dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
            df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
            df['Month'] = dt.dt.month.fillna(1).astype(int)
        else:
            df['DayOfWeek'] = 0
            df['Month'] = 1

        y = _derive_outcome_labels(df)

        # Stratified split indices (seeded so reruns reproduce the same split)
        test_size = float((hyperparameters or {}).get('test_size', 0.2))
        random_seed = int((hyperparameters or {}).get('random_seed', 42))
        idx = np.arange(len(df))
        idx_train, idx_test = train_test_split(
            idx, test_size=test_size, shuffle=True, stratify=y, random_state=random_seed
        )

        # Goals histograms for static charts
        hist_home = {}
        hist_away = {}
        if 'FTH Goals' in df.columns and 'FTA Goals' in df.columns:
            goals_home = pd.to_numeric(df['FTH Goals'], errors='coerce').fillna(0).astype(int)
            goals_away = pd.to_numeric(df['FTA Goals'], errors='coerce').fillna(0).astype(int)
            hist_home = goals_home.value_counts().sort_index().astype(int).to_dict()
            hist_away = goals_away.value_counts().sort_index().astype(int).to_dict()
        log_metric('goals_hist_home', hist_home)
        log_metric('goals_hist_away', hist_away)

        return {
            'df': df.to_dict(orient='list'),
            'labels': y.astype(int).tolist(),
            'train_idx': idx_train.astype(int).tolist(),
            'test_idx': idx_test.astype(int).tolist(),
            'n_train': int(idx_train.shape[0]),
            'n_test': int(idx_test.shape[0])
        }

    @step()
    def feature_analysis(basic: SerializableData, hyperparameters: Dict[str, Any] | None = None):
        df = pd.DataFrame(basic['df'])
        if 'DayOfWeek' not in df.columns or 'Month' not in df.columns:
            if 'Date' in df.columns:
                dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
                df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
                df['Month'] = dt.dt.month.fillna(1).astype(int)
            else:
                df['DayOfWeek'] = 0
                df['Month'] = 1

        cat_cols, num_cols = _get_cat_num_cols(df)
        X_df = _build_features_dataframe(df, cat_cols, num_cols)

        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        preprocessor = ColumnTransformer(
            transformers=[
                ('cat', encoder, cat_cols),
                ('num', StandardScaler(), num_cols)
            ],
            remainder='drop'
        )

        X_all = preprocessor.fit_transform(X_df)
        pca_components = int((hyperparameters or {}).get('pca_components', 16))
        n_components = min(pca_components, X_all.shape[1]) if X_all.shape[1] > 0 else 0
        if n_components > 0:
            pca = PCA(n_components=n_components, random_state=int((hyperparameters or {}).get('random_seed', 42)))
            _ = pca.fit_transform(X_all)
            evr = pca.explained_variance_ratio_.tolist()
            cum = np.cumsum(pca.explained_variance_ratio_).tolist()
        else:
            evr = []
            cum = []

        log_metric('pca_explained_variance_ratio', evr)
        log_metric('pca_cumulative_variance', cum)
        return {}

    raw = load_csv()
    basic = derive_labels_and_indices(raw=raw, hyperparameters=hyperparameters)
    _ = feature_analysis(basic=basic, hyperparameters=hyperparameters)
    return basic
```
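Note how data crosses process boundaries above: `load_csv` ships the DataFrame as a JSON-friendly dict via `df.to_dict(orient='list')`, and every downstream step rebuilds it with `pd.DataFrame(...)`. A minimal sketch of that round-trip (an editor's illustration, not part of the wheel; the index is not preserved, only columns):

```python
# Editor's sketch: the DataFrame <-> dict round-trip the steps above rely on.
# to_dict(orient='list') keeps columns and dtypes for simple types but drops
# the index; pd.DataFrame(payload) rebuilds with a fresh RangeIndex.
import pandas as pd

df = pd.DataFrame({'HomeTeam': ['Arsenal', 'Leeds'], 'FTH Goals': [2, 0]})
payload = df.to_dict(orient='list')   # {'HomeTeam': [...], 'FTH Goals': [...]}
restored = pd.DataFrame(payload)      # same columns, fresh RangeIndex

assert restored.equals(df)
```

The file continues with the two preprocessing processes: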
```python
@process()
def define_preprocess_linear_nn_process(data):
    """Preprocess for Linear/NN: OHE categorical + StandardScaler numeric."""
    src = data.get('feature_engineering_generic', {})
    df = pd.DataFrame(src['df'])
    y = np.asarray(src['labels'], dtype=int)
    idx_train = np.asarray(src['train_idx'], dtype=int)
    idx_test = np.asarray(src['test_idx'], dtype=int)

    # Date-derived columns already present from FE; if not, add defaults
    if 'DayOfWeek' not in df.columns or 'Month' not in df.columns:
        if 'Date' in df.columns:
            dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
            df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
            df['Month'] = dt.dt.month.fillna(1).astype(int)
        else:
            df['DayOfWeek'] = 0
            df['Month'] = 1

    cat_cols, num_cols = _get_cat_num_cols(df)
    X_df = _build_features_dataframe(df, cat_cols, num_cols)

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, cat_cols),
            ('num', StandardScaler(), num_cols)
        ],
        remainder='drop'
    )

    # Fit on the training rows only; reuse the fitted transformer on test rows
    X_train = preprocessor.fit_transform(X_df.iloc[idx_train])
    X_test = preprocessor.transform(X_df.iloc[idx_test])
    y_train = y[idx_train]
    y_test = y[idx_test]

    return {
        'X_train': X_train.astype(float).tolist(),
        'X_test': X_test.astype(float).tolist(),
        'y_train': y_train.astype(int).tolist(),
        'y_test': y_test.astype(int).tolist(),
        'row_indices_train': idx_train.astype(int).tolist(),
        'row_indices_test': idx_test.astype(int).tolist(),
        'n_train': int(X_train.shape[0]),
        'n_test': int(X_test.shape[0])
    }


@process()
def define_preprocess_xgb_process(data):
    """Preprocess for XGB: OHE categorical only (no scaling)."""
    src = data.get('feature_engineering_generic', {})
    df = pd.DataFrame(src['df'])
    y = np.asarray(src['labels'], dtype=int)
    idx_train = np.asarray(src['train_idx'], dtype=int)
    idx_test = np.asarray(src['test_idx'], dtype=int)

    if 'DayOfWeek' not in df.columns or 'Month' not in df.columns:
        if 'Date' in df.columns:
            dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
            df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
            df['Month'] = dt.dt.month.fillna(1).astype(int)
        else:
            df['DayOfWeek'] = 0  # match the default used by the other branches
            df['Month'] = 1

    cat_cols, num_cols = _get_cat_num_cols(df)
    X_df = _build_features_dataframe(df, cat_cols, num_cols)

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, cat_cols),
            ('num', 'passthrough', num_cols)
        ],
        remainder='drop'
    )

    X_train = preprocessor.fit_transform(X_df.iloc[idx_train])
    X_test = preprocessor.transform(X_df.iloc[idx_test])
    y_train = y[idx_train]
    y_test = y[idx_test]

    return {
        'X_train': X_train.astype(float).tolist(),
        'X_test': X_test.astype(float).tolist(),
        'y_train': y_train.astype(int).tolist(),
        'y_test': y_test.astype(int).tolist(),
        'row_indices_train': idx_train.astype(int).tolist(),
        'row_indices_test': idx_test.astype(int).tolist(),
        'n_train': int(X_train.shape[0]),
        'n_test': int(X_test.shape[0])
    }
```
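Both preprocess variants fit the `ColumnTransformer` on training rows only and reuse it on test rows; `handle_unknown='ignore'` means a team, referee, or season seen only in the test split encodes as an all-zero one-hot block instead of raising at transform time. A tiny demonstration of that scikit-learn behavior (editor's sketch, not part of the wheel):

```python
# Editor's sketch: handle_unknown='ignore' maps categories unseen during fit
# to an all-zero one-hot row instead of raising at transform time.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc.fit(np.array([['Arsenal'], ['Leeds']]))

print(enc.transform(np.array([['Arsenal']])))   # [[1. 0.]]
print(enc.transform(np.array([['Chelsea']])))   # [[0. 0.]]  -- unseen team
```

The listing continues with the training steps: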
```python
@step()
def train_logistic_classifier(prep_data: SerializableData, hyperparameters: Dict[str, Any] | None = None) -> Dict[str, Any]:
    X_train = np.asarray(prep_data.get('X_train', []), dtype=float)
    y_train = np.asarray(prep_data.get('y_train', []), dtype=int)
    if X_train.size == 0:
        raise ValueError("Empty training data provided to Logistic training step")

    params = (hyperparameters or {}).get('logreg_params', {})
    max_iter = int(params.get('max_iter', 500))
    class_weight = params.get('class_weight', None)

    model = LogisticRegression(
        solver='lbfgs',
        max_iter=max_iter,
        class_weight=class_weight
    )
    model.fit(X_train, y_train)
    return {'model': model}


@step()
def train_and_evaluate_nn_classifier(prep_data: SerializableData, hyperparameters: Dict[str, Any] | None = None, branch_name: str = "") -> Dict[str, Any]:
    hparams = (hyperparameters or {}).get("nn_params", {})
    hidden_layers = tuple(hparams.get("hidden_layers", [128, 64]))
    learning_rate = float(hparams.get("learning_rate", 0.001))
    epochs = int(hparams.get("epochs", 50))
    random_seed = int(hparams.get("random_seed", 30))

    X_train = np.asarray(prep_data.get('X_train', []), dtype=float)
    y_train = np.asarray(prep_data.get('y_train', []), dtype=int)
    if X_train.size == 0:
        raise ValueError("Empty training data provided to NN classifier training step")

    # max_iter=1 + warm_start=True: each fit() call in the loop below runs one
    # epoch and keeps the learned weights, so metrics can be logged per epoch.
    clf = MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        learning_rate_init=learning_rate,
        activation='relu',
        solver='adam',
        alpha=0.0001,
        max_iter=1,
        warm_start=True,
        early_stopping=False,
        shuffle=True,
        random_state=random_seed,
        verbose=False
    )

    for epoch in range(epochs):
        clf.fit(X_train, y_train)
        try:
            if hasattr(clf, 'loss_'):
                log_metric('train_loss', float(clf.loss_), step=epoch + 1)
            preds = clf.predict(X_train)
            f1 = float(f1_score(y_train, preds, average='macro'))
            log_metric('train_f1', f1, step=epoch + 1)
        except Exception as e:
            logger.warning(f"[{branch_name or 'nn'}] Failed to log training metrics @epoch {epoch + 1}: {e}")
    return {'model': clf}
```
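The NN step's epoch loop leans on a standard scikit-learn idiom: with `warm_start=True` and `max_iter=1`, successive `fit()` calls continue training from the current weights instead of re-initializing, one epoch per call (scikit-learn emits a `ConvergenceWarning` on each call, which is harmless here). A standalone sketch of the idiom on toy data (editor's illustration, not part of the wheel):

```python
# Editor's sketch: warm_start=True + max_iter=1 turns repeated fit() calls
# into an incremental, one-epoch-at-a-time training loop.
import warnings
import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPClassifier

warnings.simplefilter("ignore", ConvergenceWarning)  # expected with max_iter=1

rng = np.random.RandomState(0)
X = rng.randn(200, 8)
y = (X[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)

clf = MLPClassifier(hidden_layer_sizes=(16,), max_iter=1, warm_start=True, random_state=0)
losses = []
for _ in range(5):
    clf.fit(X, y)              # one additional epoch per call
    losses.append(clf.loss_)
print(losses)                  # typically decreasing epoch over epoch
```

The remainder of the file covers XGBoost training, the shared evaluation step, and the process wiring: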
```python
@step()
def train_xgb_classifier(prep_data: SerializableData, hyperparameters: Dict[str, Any] | None = None) -> Dict[str, Any]:
    xgb_params = (hyperparameters or {}).get("xgb_params", {})
    params = {
        'n_estimators': int(xgb_params.get('n_estimators', 400)),
        'max_depth': int(xgb_params.get('max_depth', 4)),
        'learning_rate': float(xgb_params.get('learning_rate', 0.1)),
        'subsample': float(xgb_params.get('subsample', 0.9)),
        'colsample_bytree': float(xgb_params.get('colsample_bytree', 0.9)),
        'n_jobs': int(xgb_params.get('n_jobs', 1)),
        'verbosity': 0,
        'random_state': int(xgb_params.get('random_state', 42)) if 'random_state' in xgb_params else None,
        'tree_method': xgb_params.get('tree_method', 'auto'),
        'objective': 'multi:softprob',
        'num_class': 3,
    }

    # Drop unset optional params so XGBoost falls back to its own defaults
    params = {k: v for k, v in params.items() if v is not None}

    X_train = np.asarray(prep_data.get('X_train', []), dtype=float)
    y_train = np.asarray(prep_data.get('y_train', []), dtype=int)
    if X_train.size == 0:
        raise ValueError("Empty training data provided to XGB classifier training step")

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    return {'model': model}


@step()
def test_inference_classification(model: SerializableData, X_test: SerializableData, y_test: SerializableData) -> Dict[str, Any]:
    X = np.asarray(X_test or [], dtype=float)
    y_true = np.asarray(y_test or [], dtype=int)
    if X.size == 0 or y_true.size == 0:
        try:
            log_metric('test_accuracy', 0.0)
            log_metric('test_precision', 0.0)
            log_metric('test_f1', 0.0)
        except Exception:
            pass
        return {'test_accuracy': 0.0, 'test_precision': 0.0, 'test_f1': 0.0}

    # Predict probabilities if available; otherwise one-hot the hard predictions
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(X)
        if isinstance(proba, list):
            proba = np.stack(proba, axis=-1)
    else:
        preds = model.predict(X)
        n_classes = len(np.unique(y_true))
        proba = np.eye(n_classes)[preds]

    y_pred = np.asarray(np.argmax(proba, axis=1), dtype=int)

    acc = float(accuracy_score(y_true, y_pred))
    prec = float(precision_score(y_true, y_pred, average='macro', zero_division=0))
    f1 = float(f1_score(y_true, y_pred, average='macro'))

    try:
        log_metric('test_accuracy', acc)
        log_metric('test_precision', prec)
        log_metric('test_f1', f1)
    except Exception:
        pass

    return {'test_accuracy': acc, 'test_precision': prec, 'test_f1': f1}


# Override training processes to consume new preprocess outputs
@process()
def define_linear_training_process(data, hyperparameters):
    prep = data.get('preprocess_linear_nn', {})
    result = train_logistic_classifier(prep_data=prep, hyperparameters=hyperparameters)
    result['X_test'] = prep.get('X_test')
    result['y_test'] = prep.get('y_test')
    result['row_indices_test'] = prep.get('row_indices_test')
    return result


@process()
def define_nn_training_process(data, hyperparameters):
    prep = data.get('preprocess_linear_nn', {})
    result = train_and_evaluate_nn_classifier(prep_data=prep, hyperparameters=hyperparameters)
    result['X_test'] = prep.get('X_test')
    result['y_test'] = prep.get('y_test')
    result['row_indices_test'] = prep.get('row_indices_test')
    return result


@process()
def define_xgb_training_process(data, hyperparameters):
    prep = data.get('preprocess_xgb', {})
    result = train_xgb_classifier(prep_data=prep, hyperparameters=hyperparameters)
    result['X_test'] = prep.get('X_test')
    result['y_test'] = prep.get('y_test')
    result['row_indices_test'] = prep.get('row_indices_test')
    return result


@process()
def define_linear_inference_process(data):
    train_res = data.get('linear_training', {})
    model = train_res.get('model')
    X_test = train_res.get('X_test')
    y_test = train_res.get('y_test')
    result = test_inference_classification(model=model, X_test=X_test, y_test=y_test)
    result['model'] = model
    result['X_test'] = X_test
    result['y_test'] = y_test
    result['row_indices_test'] = train_res.get('row_indices_test')
    result['source_training'] = 'linear_training'
    return result


@process()
def define_nn_inference_process(data, hyperparameters):
    train_key = (hyperparameters or {}).get('train_key', 'nn_training_a')
    train_res = data.get(str(train_key), {})
    model = train_res.get('model')
    X_test = train_res.get('X_test')
    y_test = train_res.get('y_test')
    result = test_inference_classification(model=model, X_test=X_test, y_test=y_test)
    result['model'] = model
    result['X_test'] = X_test
    result['y_test'] = y_test
    result['row_indices_test'] = train_res.get('row_indices_test')
    result['source_training'] = str(train_key)
    return result


@process()
def define_xgb_inference_process(data, hyperparameters):
    train_key = (hyperparameters or {}).get('train_key', 'xgb_training_a')
    train_res = data.get(str(train_key), {})
    model = train_res.get('model')
    X_test = train_res.get('X_test')
    y_test = train_res.get('y_test')
    result = test_inference_classification(model=model, X_test=X_test, y_test=y_test)
    result['model'] = model
    result['X_test'] = X_test
    result['y_test'] = y_test
    result['row_indices_test'] = train_res.get('row_indices_test')
    result['source_training'] = str(train_key)
    return result


@process()
def define_select_best_nn_process(data):
    inf_a = data.get('nn_inference_a', {}) or {}
    inf_b = data.get('nn_inference_b', {}) or {}
    f1_a = float(inf_a.get('test_f1', 0.0) or 0.0)
    f1_b = float(inf_b.get('test_f1', 0.0) or 0.0)

    best_key = 'nn_training_a'
    best_f1 = f1_a
    best_inf = inf_a
    if f1_b >= f1_a:
        best_key = 'nn_training_b'
        best_f1 = f1_b
        best_inf = inf_b

    return {
        'model': best_inf.get('model'),
        'X_test': best_inf.get('X_test'),
        'y_test': best_inf.get('y_test'),
        'row_indices_test': best_inf.get('row_indices_test'),
        'f1': best_f1,
        'best_key': best_key
    }


@process()
def define_select_best_xgb_process(data):
    inf_a = data.get('xgb_inference_a', {}) or {}
    inf_b = data.get('xgb_inference_b', {}) or {}
    f1_a = float(inf_a.get('test_f1', 0.0) or 0.0)
    f1_b = float(inf_b.get('test_f1', 0.0) or 0.0)

    best_key = 'xgb_training_a'
    best_f1 = f1_a
    best_inf = inf_a
    if f1_b >= f1_a:
        best_key = 'xgb_training_b'
        best_f1 = f1_b
        best_inf = inf_b

    return {
        'model': best_inf.get('model'),
        'X_test': best_inf.get('X_test'),
        'y_test': best_inf.get('y_test'),
        'row_indices_test': best_inf.get('row_indices_test'),
        'f1': best_f1,
        'best_key': best_key
    }


@process()
def define_nn_best_inference_process(data):
    sel = data.get('nn_best_selection', {})
    return test_inference_classification(model=sel.get('model'), X_test=sel.get('X_test'), y_test=sel.get('y_test'))


@process()
def define_xgb_best_inference_process(data):
    sel = data.get('xgb_best_selection', {})
    return test_inference_classification(model=sel.get('model'), X_test=sel.get('X_test'), y_test=sel.get('y_test'))


@process()
def define_ensemble_inference_process(data):
    lin = data.get('linear_training', {}) or {}
    xgb_sel = data.get('xgb_best_selection', {}) or {}

    lin_model = lin.get('model')
    xgb_model = xgb_sel.get('model')

    X_lin = np.asarray(lin.get('X_test') or [], dtype=float)
    y_true = np.asarray(lin.get('y_test') or [], dtype=int)
    idx_lin = np.asarray(lin.get('row_indices_test') or [], dtype=int)

    X_xgb = np.asarray(xgb_sel.get('X_test') or [], dtype=float)
    idx_xgb = np.asarray(xgb_sel.get('row_indices_test') or [], dtype=int)

    # Obtain weights from prior inferences (F1 scores)
    w_lin = float((data.get('linear_inference', {}) or {}).get('test_f1', 0.0) or 0.0)
    w_xgb = float((data.get('xgb_best_inference', {}) or {}).get('test_f1', 0.0) or 0.0)

    weights = np.array([w_lin, w_xgb], dtype=float)
    if not np.isfinite(weights).all() or weights.sum() <= 0:
        weights = np.array([1.0, 1.0], dtype=float)
    weights = weights / weights.sum()

    # Predict probabilities
    def _predict_proba_safe(m, X):
        if m is None or X.size == 0:
            return None
        if hasattr(m, 'predict_proba'):
            p = m.predict_proba(X)
            if isinstance(p, list):
                p = np.stack(p, axis=-1)
            return p
        preds = m.predict(X)
        n_classes = 3
        return np.eye(n_classes)[preds]

    P_lin = _predict_proba_safe(lin_model, X_lin)
    P_xgb = _predict_proba_safe(xgb_model, X_xgb)

    # Align by row indices if provided (the two preprocess branches may order
    # their test rows differently)
    def _align_to(reference_idx, idx_other, P_other):
        if P_other is None or reference_idx.size == 0 or idx_other.size == 0:
            return None
        if np.array_equal(reference_idx, idx_other):
            return P_other
        order = {int(v): i for i, v in enumerate(idx_other.tolist())}
        aligned = np.zeros_like(P_other)
        for pos, rid in enumerate(reference_idx.tolist()):
            j = order.get(int(rid))
            if j is None:
                continue
            aligned[pos] = P_other[j]
        return aligned

    P_xgb_aligned = _align_to(idx_lin, idx_xgb, P_xgb) if P_xgb is not None else None

    # Combine probabilities (weighted soft vote)
    probas = []
    wlist = []
    if P_lin is not None:
        probas.append(P_lin)
        wlist.append(weights[0])
    if P_xgb_aligned is not None:
        probas.append(P_xgb_aligned)
        wlist.append(weights[1])

    if not probas or y_true.size == 0:
        try:
            log_metric('test_accuracy', 0.0)
            log_metric('test_precision', 0.0)
            log_metric('test_f1', 0.0)
        except Exception:
            pass
        return {'test_accuracy': 0.0, 'test_precision': 0.0, 'test_f1': 0.0}

    W = np.array(wlist, dtype=float)
    W = W / W.sum()
    stacked = np.stack(probas, axis=0)
    ens = np.tensordot(W, stacked, axes=(0, 0))
    y_pred = np.argmax(ens, axis=1).astype(int)

    acc = float(accuracy_score(y_true, y_pred))
    prec = float(precision_score(y_true, y_pred, average='macro', zero_division=0))
    f1 = float(f1_score(y_true, y_pred, average='macro'))
    try:
        log_metric('test_accuracy', acc)
        log_metric('test_precision', prec)
        log_metric('test_f1', f1)
    except Exception:
        pass
    return {'test_accuracy': acc, 'test_precision': prec, 'test_f1': f1}
```
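The ensemble process combines per-model class probabilities with `np.tensordot(W, stacked, axes=(0, 0))`, which contracts the model axis: `ens[i, c] = sum_m W[m] * stacked[m, i, c]`, a weighted soft vote. A toy check of that arithmetic (editor's sketch, not part of the wheel):

```python
# Editor's sketch: tensordot over the model axis is exactly a weighted sum
# of the stacked probability matrices.
import numpy as np

P_lin = np.array([[0.6, 0.3, 0.1],
                  [0.2, 0.5, 0.3]])
P_xgb = np.array([[0.4, 0.4, 0.2],
                  [0.1, 0.2, 0.7]])

W = np.array([0.5, 0.5])
stacked = np.stack([P_lin, P_xgb], axis=0)   # shape (2 models, 2 rows, 3 classes)
ens = np.tensordot(W, stacked, axes=(0, 0))  # shape (2 rows, 3 classes)

assert np.allclose(ens, 0.5 * P_lin + 0.5 * P_xgb)
print(np.argmax(ens, axis=1))                # ensemble predictions: [0 2]
```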
mlops/templates/sklearn-basic/README.md (new file)

@@ -0,0 +1,22 @@

# Template: `sklearn-basic`

This is a minimal runnable template for **ExpOps** that uses the **custom** adapter (your code),
but trains a tiny **scikit-learn** model inside `models/model.py`.

What it includes:
- `configs/project_config.yaml`: config with a `{{PROJECT_ID}}` placeholder (filled automatically)
- `data/train.csv`: tiny dataset with a required `label` column
- `models/model.py`: defines `train_model` and `evaluate_model` processes
- `charts/plot_metrics.py`: basic matplotlib report (generates PNGs)
- `requirements.txt` and `charts/requirements.txt`

Run it via the CLI:

```bash
mlops create my-project --template sklearn-basic
mlops run my-project
```

After the run, look under:
- `projects/<id>/artifacts/charts/<run-id>/plot_metrics/.../*.png` (generated by the chart node)
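For orientation, a `models/model.py` for this template might look like the following. This is an editor's sketch only: it assumes the `define_<name>_process` naming and the `data` / `hyperparameters` conventions seen in the premier-league model above, and the `train_model` / `evaluate_model` node keys named by the README; the template's real file ships with the wheel.

```python
# Editor's sketch (hypothetical), mirroring the conventions of
# premier_league_model.py above; not the packaged models/model.py.
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from mlops.core import process, log_metric


def _train_csv() -> Path:
    # Resolve the template's dataset relative to this file
    return Path(__file__).parent.parent / "data" / "train.csv"


@process()
def define_train_model_process(data, hyperparameters):
    df = pd.read_csv(_train_csv())            # template data has a 'label' column
    X = df.drop(columns=['label'])
    y = df['label']
    model = LogisticRegression(max_iter=int((hyperparameters or {}).get('max_iter', 200)))
    model.fit(X, y)
    return {'model': model}


@process()
def define_evaluate_model_process(data):
    # Assumes the upstream node's output is exposed under the 'train_model' key
    model = (data.get('train_model', {}) or {}).get('model')
    df = pd.read_csv(_train_csv())
    X = df.drop(columns=['label'])
    y = df['label']
    acc = float(accuracy_score(y, model.predict(X)))
    log_metric('train_accuracy', acc)
    return {'accuracy': acc}
```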