expops-0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. expops-0.1.3.dist-info/METADATA +826 -0
  2. expops-0.1.3.dist-info/RECORD +86 -0
  3. expops-0.1.3.dist-info/WHEEL +5 -0
  4. expops-0.1.3.dist-info/entry_points.txt +3 -0
  5. expops-0.1.3.dist-info/licenses/LICENSE +674 -0
  6. expops-0.1.3.dist-info/top_level.txt +1 -0
  7. mlops/__init__.py +0 -0
  8. mlops/__main__.py +11 -0
  9. mlops/_version.py +34 -0
  10. mlops/adapters/__init__.py +12 -0
  11. mlops/adapters/base.py +86 -0
  12. mlops/adapters/config_schema.py +89 -0
  13. mlops/adapters/custom/__init__.py +3 -0
  14. mlops/adapters/custom/custom_adapter.py +447 -0
  15. mlops/adapters/plugin_manager.py +113 -0
  16. mlops/adapters/sklearn/__init__.py +3 -0
  17. mlops/adapters/sklearn/adapter.py +94 -0
  18. mlops/cluster/__init__.py +3 -0
  19. mlops/cluster/controller.py +496 -0
  20. mlops/cluster/process_runner.py +91 -0
  21. mlops/cluster/providers.py +258 -0
  22. mlops/core/__init__.py +95 -0
  23. mlops/core/custom_model_base.py +38 -0
  24. mlops/core/dask_networkx_executor.py +1265 -0
  25. mlops/core/executor_worker.py +1239 -0
  26. mlops/core/experiment_tracker.py +81 -0
  27. mlops/core/graph_types.py +64 -0
  28. mlops/core/networkx_parser.py +135 -0
  29. mlops/core/payload_spill.py +278 -0
  30. mlops/core/pipeline_utils.py +162 -0
  31. mlops/core/process_hashing.py +216 -0
  32. mlops/core/step_state_manager.py +1298 -0
  33. mlops/core/step_system.py +956 -0
  34. mlops/core/workspace.py +99 -0
  35. mlops/environment/__init__.py +10 -0
  36. mlops/environment/base.py +43 -0
  37. mlops/environment/conda_manager.py +307 -0
  38. mlops/environment/factory.py +70 -0
  39. mlops/environment/pyenv_manager.py +146 -0
  40. mlops/environment/setup_env.py +31 -0
  41. mlops/environment/system_manager.py +66 -0
  42. mlops/environment/utils.py +105 -0
  43. mlops/environment/venv_manager.py +134 -0
  44. mlops/main.py +527 -0
  45. mlops/managers/project_manager.py +400 -0
  46. mlops/managers/reproducibility_manager.py +575 -0
  47. mlops/platform.py +996 -0
  48. mlops/reporting/__init__.py +16 -0
  49. mlops/reporting/context.py +187 -0
  50. mlops/reporting/entrypoint.py +292 -0
  51. mlops/reporting/kv_utils.py +77 -0
  52. mlops/reporting/registry.py +50 -0
  53. mlops/runtime/__init__.py +9 -0
  54. mlops/runtime/context.py +34 -0
  55. mlops/runtime/env_export.py +113 -0
  56. mlops/storage/__init__.py +12 -0
  57. mlops/storage/adapters/__init__.py +9 -0
  58. mlops/storage/adapters/gcp_kv_store.py +778 -0
  59. mlops/storage/adapters/gcs_object_store.py +96 -0
  60. mlops/storage/adapters/memory_store.py +240 -0
  61. mlops/storage/adapters/redis_store.py +438 -0
  62. mlops/storage/factory.py +199 -0
  63. mlops/storage/interfaces/__init__.py +6 -0
  64. mlops/storage/interfaces/kv_store.py +118 -0
  65. mlops/storage/path_utils.py +38 -0
  66. mlops/templates/premier-league/charts/plot_metrics.js +70 -0
  67. mlops/templates/premier-league/charts/plot_metrics.py +145 -0
  68. mlops/templates/premier-league/charts/requirements.txt +6 -0
  69. mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
  70. mlops/templates/premier-league/configs/project_config.yaml +207 -0
  71. mlops/templates/premier-league/data/England CSV.csv +12154 -0
  72. mlops/templates/premier-league/models/premier_league_model.py +638 -0
  73. mlops/templates/premier-league/requirements.txt +8 -0
  74. mlops/templates/sklearn-basic/README.md +22 -0
  75. mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
  76. mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
  77. mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
  78. mlops/templates/sklearn-basic/data/train.csv +14 -0
  79. mlops/templates/sklearn-basic/models/model.py +62 -0
  80. mlops/templates/sklearn-basic/requirements.txt +10 -0
  81. mlops/web/__init__.py +3 -0
  82. mlops/web/server.py +585 -0
  83. mlops/web/ui/index.html +52 -0
  84. mlops/web/ui/mlops-charts.js +357 -0
  85. mlops/web/ui/script.js +1244 -0
  86. mlops/web/ui/styles.css +248 -0
mlops/templates/premier-league/models/premier_league_model.py
@@ -0,0 +1,638 @@
+ from __future__ import annotations
+
+ import sys
+ from pathlib import Path
+ from typing import Any, Dict
+
+ import logging
+ import numpy as np
+ import pandas as pd
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
+ from sklearn.compose import ColumnTransformer
+ from sklearn.decomposition import PCA
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.neural_network import MLPClassifier
+ from xgboost import XGBClassifier
+ from sklearn.metrics import accuracy_score, precision_score, f1_score
+
+ # Allows running from a source checkout; harmless once the package is installed.
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "src"))
+
+ from mlops.core import (
+     step, process, SerializableData, log_metric
+ )
+
+ logger = logging.getLogger(__name__)
+
+ def _csv_path() -> Path:
+     return Path(__file__).parent.parent / "data" / "England CSV.csv"
+
+ def _get_result_column_name(df: pd.DataFrame) -> str:
+     if 'FT Result' in df.columns:
+         return 'FT Result'
+     if 'FTR' in df.columns:
+         return 'FTR'
+     raise ValueError("Missing required result column: expected 'FT Result' or 'FTR'")
+
+
+ def _derive_outcome_labels(df: pd.DataFrame) -> np.ndarray:
+     result_col = _get_result_column_name(df)
+     mapping = {'H': 0, 'D': 1, 'A': 2}
+     y = df[result_col].astype(str).map(mapping)
+     if y.isnull().any():
+         bad = df.loc[y.isnull(), result_col].unique().tolist()
+         raise ValueError(f"Unexpected values in {result_col}: {bad}")
+     return y.astype(int).to_numpy()
+
+
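For reference, a minimal demonstration of the label helpers (assuming the two functions above are in scope):

```python
# With the helpers above in scope: 'FT Result' strings map to integer
# classes 0/1/2 (home win / draw / away win); unknown values raise ValueError.
import pandas as pd

matches = pd.DataFrame({'FT Result': ['H', 'D', 'A', 'H']})
print(_derive_outcome_labels(matches))   # -> [0 1 2 0]
```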
+ def _get_cat_num_cols(df: pd.DataFrame) -> tuple[list[str], list[str]]:
+     cat_cols = [c for c in ['Season', 'HomeTeam', 'AwayTeam', 'Referee', 'League'] if c in df.columns]
+     num_cols = [
+         c for c in [
+             'HTH Goals', 'HTA Goals', 'H Shots', 'A Shots', 'H SOT', 'A SOT',
+             'H Fouls', 'A Fouls', 'H Corners', 'A Corners', 'H Yellow', 'A Yellow',
+             'H Red', 'A Red', 'Display_Order', 'DayOfWeek', 'Month'
+         ] if c in df.columns
+     ]
+     return cat_cols, num_cols
+
+
+ def _build_features_dataframe(df: pd.DataFrame, cat_cols: list[str], num_cols: list[str]) -> pd.DataFrame:
+     X_df = pd.DataFrame(index=df.index)
+     # Numeric columns: coerce to float, impute missing values with the median (0 if all-NaN)
+     for c in num_cols:
+         s = pd.to_numeric(df[c], errors='coerce')
+         if s.isnull().any():
+             med = s.median()
+             s = s.fillna(med if not np.isnan(med) else 0)
+         X_df[c] = s.astype(float)
+     for c in cat_cols:
+         X_df[c] = df[c].astype(str)
+     # Defensively keep the target and raw date out of the feature matrix
+     for drop_c in ['FT Result', 'FTR', 'HT Result', 'Date']:
+         if drop_c in X_df.columns:
+             X_df = X_df.drop(columns=[drop_c])
+     return X_df
+
+
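A toy frame makes the imputation behavior of these helpers concrete (hypothetical data; helpers above assumed in scope):

```python
# Median imputation for numerics, string coercion for categoricals;
# the target column never enters the feature frame.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'HomeTeam': ['Arsenal', 'Leeds', 'Arsenal', 'Spurs'],
    'H Shots': [12, np.nan, 8, 10],
    'FT Result': ['H', 'A', 'D', 'H'],
})
cat_cols, num_cols = _get_cat_num_cols(toy)          # (['HomeTeam'], ['H Shots'])
X = _build_features_dataframe(toy, cat_cols, num_cols)
print(X)   # the NaN in 'H Shots' becomes the median (10.0); 'FT Result' is absent
```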
+ @process()
+ def define_feature_engineering_generic_process(data, hyperparameters):
+     """Load CSV, parse dates, derive labels (H/D/A), compute stratified split indices, and log analysis metrics."""
+
+     @step()
+     def load_csv():
+         path = _csv_path()
+         if not path.exists():
+             raise FileNotFoundError(f"Premier League CSV not found at {path}")
+         df = pd.read_csv(path)
+         try:
+             logger.info(f"[feature_engineering_generic.load_csv] Loaded df shape: {df.shape}")
+         except Exception:
+             pass
+         return {'df': df.to_dict(orient='list')}
+
+     @step()
+     def derive_labels_and_indices(raw: SerializableData, hyperparameters: Dict[str, Any] | None = None):
+         df = pd.DataFrame(raw['df'])
+         # Parse date-based features
+         if 'Date' in df.columns:
+             dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
+             df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
+             df['Month'] = dt.dt.month.fillna(1).astype(int)
+         else:
+             df['DayOfWeek'] = 0
+             df['Month'] = 1
+
+         y = _derive_outcome_labels(df)
+
+         # Stratified split indices, seeded for reproducibility
+         test_size = float((hyperparameters or {}).get('test_size', 0.2))
+         random_seed = int((hyperparameters or {}).get('random_seed', 42))
+         idx = np.arange(len(df))
+         idx_train, idx_test = train_test_split(idx, test_size=test_size, shuffle=True, stratify=y, random_state=random_seed)
+
+         # Goals histograms for static charts
+         hist_home = {}
+         hist_away = {}
+         if 'FTH Goals' in df.columns and 'FTA Goals' in df.columns:
+             goals_home = pd.to_numeric(df['FTH Goals'], errors='coerce').fillna(0).astype(int)
+             goals_away = pd.to_numeric(df['FTA Goals'], errors='coerce').fillna(0).astype(int)
+             hist_home = goals_home.value_counts().sort_index().astype(int).to_dict()
+             hist_away = goals_away.value_counts().sort_index().astype(int).to_dict()
+         log_metric('goals_hist_home', hist_home)
+         log_metric('goals_hist_away', hist_away)
+
+         return {
+             'df': df.to_dict(orient='list'),
+             'labels': y.astype(int).tolist(),
+             'train_idx': idx_train.astype(int).tolist(),
+             'test_idx': idx_test.astype(int).tolist(),
+             'n_train': int(idx_train.shape[0]),
+             'n_test': int(idx_test.shape[0])
+         }
+
+     @step()
+     def feature_analysis(basic: SerializableData, hyperparameters: Dict[str, Any] | None = None):
+         df = pd.DataFrame(basic['df'])
+         if 'DayOfWeek' not in df.columns or 'Month' not in df.columns:
+             if 'Date' in df.columns:
+                 dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
+                 df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
+                 df['Month'] = dt.dt.month.fillna(1).astype(int)
+             else:
+                 df['DayOfWeek'] = 0
+                 df['Month'] = 1
+
+         cat_cols, num_cols = _get_cat_num_cols(df)
+         X_df = _build_features_dataframe(df, cat_cols, num_cols)
+
+         encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+         preprocessor = ColumnTransformer(
+             transformers=[
+                 ('cat', encoder, cat_cols),
+                 ('num', StandardScaler(), num_cols)
+             ],
+             remainder='drop'
+         )
+
+         X_all = preprocessor.fit_transform(X_df)
+         pca_components = int((hyperparameters or {}).get('pca_components', 16))
+         n_components = min(pca_components, X_all.shape[1]) if X_all.shape[1] > 0 else 0
+         if n_components > 0:
+             pca = PCA(n_components=n_components, random_state=int((hyperparameters or {}).get('random_seed', 42)))
+             _ = pca.fit_transform(X_all)
+             evr = pca.explained_variance_ratio_.tolist()
+             cum = np.cumsum(pca.explained_variance_ratio_).tolist()
+         else:
+             evr = []
+             cum = []
+
+         log_metric('pca_explained_variance_ratio', evr)
+         log_metric('pca_cumulative_variance', cum)
+         return {}
+
+     raw = load_csv()
+     basic = derive_labels_and_indices(raw=raw, hyperparameters=hyperparameters)
+     _ = feature_analysis(basic=basic, hyperparameters=hyperparameters)
+     return basic
+
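For orientation, the hyperparameters this process reads could look like the following; only the key names are taken from the `.get(...)` calls above, the surrounding config wiring is an assumption:

```python
# Hypothetical hyperparameters for feature_engineering_generic; the
# values shown are the in-code fallbacks.
fe_hyperparameters = {
    'test_size': 0.2,       # fraction held out by train_test_split
    'random_seed': 42,      # seeds the split and the PCA
    'pca_components': 16,   # capped at the post-encoding feature count
}
```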
+
+ @process()
+ def define_preprocess_linear_nn_process(data):
+     """Preprocess for Linear/NN: OHE categorical + StandardScaler numeric."""
+     src = data.get('feature_engineering_generic', {})
+     df = pd.DataFrame(src['df'])
+     y = np.asarray(src['labels'], dtype=int)
+     idx_train = np.asarray(src['train_idx'], dtype=int)
+     idx_test = np.asarray(src['test_idx'], dtype=int)
+
+     # Date-derived columns already present from FE; if not, add defaults
+     if 'DayOfWeek' not in df.columns or 'Month' not in df.columns:
+         if 'Date' in df.columns:
+             dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
+             df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
+             df['Month'] = dt.dt.month.fillna(1).astype(int)
+         else:
+             df['DayOfWeek'] = 0
+             df['Month'] = 1
+
+     cat_cols, num_cols = _get_cat_num_cols(df)
+     X_df = _build_features_dataframe(df, cat_cols, num_cols)
+
+     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+     preprocessor = ColumnTransformer(
+         transformers=[
+             ('cat', encoder, cat_cols),
+             ('num', StandardScaler(), num_cols)
+         ],
+         remainder='drop'
+     )
+
+     # Fit on training rows only to avoid test-set leakage
+     X_train = preprocessor.fit_transform(X_df.iloc[idx_train])
+     X_test = preprocessor.transform(X_df.iloc[idx_test])
+     y_train = y[idx_train]
+     y_test = y[idx_test]
+
+     return {
+         'X_train': X_train.astype(float).tolist(),
+         'X_test': X_test.astype(float).tolist(),
+         'y_train': y_train.astype(int).tolist(),
+         'y_test': y_test.astype(int).tolist(),
+         'row_indices_train': idx_train.astype(int).tolist(),
+         'row_indices_test': idx_test.astype(int).tolist(),
+         'n_train': int(X_train.shape[0]),
+         'n_test': int(X_test.shape[0])
+     }
+
+
+ @process()
+ def define_preprocess_xgb_process(data):
+     """Preprocess for XGB: OHE categorical only (no scaling)."""
+     src = data.get('feature_engineering_generic', {})
+     df = pd.DataFrame(src['df'])
+     y = np.asarray(src['labels'], dtype=int)
+     idx_train = np.asarray(src['train_idx'], dtype=int)
+     idx_test = np.asarray(src['test_idx'], dtype=int)
+
+     if 'DayOfWeek' not in df.columns or 'Month' not in df.columns:
+         if 'Date' in df.columns:
+             dt = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
+             df['DayOfWeek'] = dt.dt.weekday.fillna(0).astype(int)
+             df['Month'] = dt.dt.month.fillna(1).astype(int)
+         else:
+             df['DayOfWeek'] = 0
+             df['Month'] = 1
+
+     cat_cols, num_cols = _get_cat_num_cols(df)
+     X_df = _build_features_dataframe(df, cat_cols, num_cols)
+
+     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
+     preprocessor = ColumnTransformer(
+         transformers=[
+             ('cat', encoder, cat_cols),
+             ('num', 'passthrough', num_cols)
+         ],
+         remainder='drop'
+     )
+
+     X_train = preprocessor.fit_transform(X_df.iloc[idx_train])
+     X_test = preprocessor.transform(X_df.iloc[idx_test])
+     y_train = y[idx_train]
+     y_test = y[idx_test]
+
+     return {
+         'X_train': X_train.astype(float).tolist(),
+         'X_test': X_test.astype(float).tolist(),
+         'y_train': y_train.astype(int).tolist(),
+         'y_test': y_test.astype(int).tolist(),
+         'row_indices_train': idx_train.astype(int).tolist(),
+         'row_indices_test': idx_test.astype(int).tolist(),
+         'n_train': int(X_train.shape[0]),
+         'n_test': int(X_test.shape[0])
+     }
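The two preprocess processes differ only in the numeric branch of the `ColumnTransformer` (scaling matters for the linear and NN models, not for trees). A shared builder would collapse the duplication; this is a hypothetical refactor sketch, not part of the shipped template:

```python
# Hypothetical: one preprocessor factory for both branches.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def make_preprocessor(cat_cols, num_cols, scale_numeric: bool) -> ColumnTransformer:
    num_transform = StandardScaler() if scale_numeric else 'passthrough'
    return ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
            ('num', num_transform, num_cols),
        ],
        remainder='drop',
    )

# linear/NN path: make_preprocessor(cat_cols, num_cols, scale_numeric=True)
# XGBoost path:   make_preprocessor(cat_cols, num_cols, scale_numeric=False)
```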
+
+
+ @step()
+ def train_logistic_classifier(prep_data: SerializableData, hyperparameters: Dict[str, Any] | None = None) -> Dict[str, Any]:
+     X_train = np.asarray(prep_data.get('X_train', []), dtype=float)
+     y_train = np.asarray(prep_data.get('y_train', []), dtype=int)
+     if X_train.size == 0:
+         raise ValueError("Empty training data provided to Logistic training step")
+
+     params = (hyperparameters or {}).get('logreg_params', {})
+     max_iter = int(params.get('max_iter', 500))
+     class_weight = params.get('class_weight', None)
+
+     model = LogisticRegression(
+         solver='lbfgs',
+         max_iter=max_iter,
+         class_weight=class_weight
+     )
+     model.fit(X_train, y_train)
+     return {'model': model}
+
+
+ @step()
+ def train_and_evaluate_nn_classifier(prep_data: SerializableData, hyperparameters: Dict[str, Any] | None = None, branch_name: str = "") -> Dict[str, Any]:
+     hparams = (hyperparameters or {}).get("nn_params", {})
+     hidden_layers = tuple(hparams.get("hidden_layers", [128, 64]))
+     learning_rate = float(hparams.get("learning_rate", 0.001))
+     epochs = int(hparams.get("epochs", 50))
+     random_seed = int(hparams.get("random_seed", 30))
+
+     X_train = np.asarray(prep_data.get('X_train', []), dtype=float)
+     y_train = np.asarray(prep_data.get('y_train', []), dtype=int)
+     if X_train.size == 0:
+         raise ValueError("Empty training data provided to NN classifier training step")
+
+     # max_iter=1 + warm_start=True: each fit() call runs one epoch, so
+     # per-epoch metrics can be logged in the loop below
+     clf = MLPClassifier(
+         hidden_layer_sizes=hidden_layers,
+         learning_rate_init=learning_rate,
+         activation='relu',
+         solver='adam',
+         alpha=0.0001,
+         max_iter=1,
+         warm_start=True,
+         early_stopping=False,
+         shuffle=True,
+         random_state=random_seed,
+         verbose=False
+     )
+
+     for epoch in range(epochs):
+         clf.fit(X_train, y_train)
+         try:
+             if hasattr(clf, 'loss_'):
+                 log_metric('train_loss', float(clf.loss_), step=epoch + 1)
+             preds = clf.predict(X_train)
+             f1 = float(f1_score(y_train, preds, average='macro'))
+             log_metric('train_f1', f1, step=epoch + 1)
+         except Exception as e:
+             logger.warning(f"[{branch_name or 'nn'}] Failed to log training metrics @epoch {epoch + 1}: {e}")
+     return {'model': clf}
+
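A standalone illustration of the one-epoch-per-fit pattern used above, assuming nothing beyond scikit-learn and NumPy (toy data):

```python
# With max_iter=1 and warm_start=True, each .fit() continues from the
# previous weights for one optimization epoch, which is what makes
# per-epoch metric logging possible. (scikit-learn emits a
# ConvergenceWarning on each call; it is expected here.)
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 8))
y = rng.integers(0, 3, size=200)

clf = MLPClassifier(hidden_layer_sizes=(16,), max_iter=1, warm_start=True, random_state=0)
for epoch in range(5):
    clf.fit(X, y)                          # one epoch per call
    print(epoch + 1, round(clf.loss_, 4))  # loss after this epoch
```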
+
+ @step()
+ def train_xgb_classifier(prep_data: SerializableData, hyperparameters: Dict[str, Any] | None = None) -> Dict[str, Any]:
+     xgb_params = (hyperparameters or {}).get("xgb_params", {})
+     params = {
+         'n_estimators': int(xgb_params.get('n_estimators', 400)),
+         'max_depth': int(xgb_params.get('max_depth', 4)),
+         'learning_rate': float(xgb_params.get('learning_rate', 0.1)),
+         'subsample': float(xgb_params.get('subsample', 0.9)),
+         'colsample_bytree': float(xgb_params.get('colsample_bytree', 0.9)),
+         'n_jobs': int(xgb_params.get('n_jobs', 1)),
+         'verbosity': 0,
+         'random_state': int(xgb_params['random_state']) if 'random_state' in xgb_params else None,
+         'tree_method': xgb_params.get('tree_method', 'auto'),
+         'objective': 'multi:softprob',
+         'num_class': 3,
+     }
+
+     # Drop unset (None) entries so XGBoost falls back to its own defaults
+     params = {k: v for k, v in params.items() if v is not None}
+
+     X_train = np.asarray(prep_data.get('X_train', []), dtype=float)
+     y_train = np.asarray(prep_data.get('y_train', []), dtype=int)
+     if X_train.size == 0:
+         raise ValueError("Empty training data provided to XGB classifier training step")
+
+     model = XGBClassifier(**params)
+     model.fit(X_train, y_train)
+     return {'model': model}
+
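The three trainers each read one hyperparameter group; a plausible shape for those groups follows. Key names come from the `.get()` calls above, while the values (including `class_weight='balanced'`) are purely illustrative:

```python
# Hypothetical hyperparameter groups for the three training steps above.
trainer_hyperparameters = {
    'logreg_params': {'max_iter': 500, 'class_weight': 'balanced'},
    'nn_params': {'hidden_layers': [128, 64], 'learning_rate': 0.001,
                  'epochs': 50, 'random_seed': 30},
    'xgb_params': {'n_estimators': 400, 'max_depth': 4, 'learning_rate': 0.1,
                   'subsample': 0.9, 'colsample_bytree': 0.9, 'random_state': 42},
}
```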
+
+ @step()
+ def test_inference_classification(model: SerializableData, X_test: SerializableData, y_test: SerializableData) -> Dict[str, Any]:
+     X = np.asarray(X_test or [], dtype=float)
+     y_true = np.asarray(y_test or [], dtype=int)
+     if X.size == 0 or y_true.size == 0:
+         try:
+             log_metric('test_accuracy', 0.0)
+             log_metric('test_precision', 0.0)
+             log_metric('test_f1', 0.0)
+         except Exception:
+             pass
+         return {'test_accuracy': 0.0, 'test_precision': 0.0, 'test_f1': 0.0}
+
+     # Predict probabilities if available; otherwise one-hot encode hard predictions
+     if hasattr(model, 'predict_proba'):
+         proba = model.predict_proba(X)
+         if isinstance(proba, list):
+             proba = np.stack(proba, axis=-1)
+     else:
+         preds = model.predict(X)
+         n_classes = len(np.unique(y_true))
+         proba = np.eye(n_classes)[preds]
+
+     y_pred = np.asarray(np.argmax(proba, axis=1), dtype=int)
+
+     acc = float(accuracy_score(y_true, y_pred))
+     prec = float(precision_score(y_true, y_pred, average='macro', zero_division=0))
+     f1 = float(f1_score(y_true, y_pred, average='macro'))
+
+     try:
+         log_metric('test_accuracy', acc)
+         log_metric('test_precision', prec)
+         log_metric('test_f1', f1)
+     except Exception:
+         pass
+
+     return {'test_accuracy': acc, 'test_precision': prec, 'test_f1': f1}
+
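The `np.eye` fallback deserves a worked example, since it is what keeps the argmax path uniform when a model lacks `predict_proba`:

```python
# Hard predictions become a probability-like one-hot matrix, so the
# same argmax works for every model downstream.
import numpy as np

preds = np.array([0, 2, 1])   # hard class predictions
proba = np.eye(3)[preds]      # -> [[1,0,0], [0,0,1], [0,1,0]]
assert (np.argmax(proba, axis=1) == preds).all()
```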
+
+ # Override training processes to consume new preprocess outputs
+ @process()
+ def define_linear_training_process(data, hyperparameters):
+     prep = data.get('preprocess_linear_nn', {})
+     result = train_logistic_classifier(prep_data=prep, hyperparameters=hyperparameters)
+     result['X_test'] = prep.get('X_test')
+     result['y_test'] = prep.get('y_test')
+     result['row_indices_test'] = prep.get('row_indices_test')
+     return result
+
+
+ @process()
+ def define_nn_training_process(data, hyperparameters):
+     prep = data.get('preprocess_linear_nn', {})
+     result = train_and_evaluate_nn_classifier(prep_data=prep, hyperparameters=hyperparameters)
+     result['X_test'] = prep.get('X_test')
+     result['y_test'] = prep.get('y_test')
+     result['row_indices_test'] = prep.get('row_indices_test')
+     return result
+
+
+ @process()
+ def define_xgb_training_process(data, hyperparameters):
+     prep = data.get('preprocess_xgb', {})
+     result = train_xgb_classifier(prep_data=prep, hyperparameters=hyperparameters)
+     result['X_test'] = prep.get('X_test')
+     result['y_test'] = prep.get('y_test')
+     result['row_indices_test'] = prep.get('row_indices_test')
+     return result
+
+
+ @process()
+ def define_linear_inference_process(data):
+     train_res = data.get('linear_training', {})
+     model = train_res.get('model')
+     X_test = train_res.get('X_test')
+     y_test = train_res.get('y_test')
+     result = test_inference_classification(model=model, X_test=X_test, y_test=y_test)
+     result['model'] = model
+     result['X_test'] = X_test
+     result['y_test'] = y_test
+     result['row_indices_test'] = train_res.get('row_indices_test')
+     result['source_training'] = 'linear_training'
+     return result
+
+
+ @process()
+ def define_nn_inference_process(data, hyperparameters):
+     train_key = (hyperparameters or {}).get('train_key', 'nn_training_a')
+     train_res = data.get(str(train_key), {})
+     model = train_res.get('model')
+     X_test = train_res.get('X_test')
+     y_test = train_res.get('y_test')
+     result = test_inference_classification(model=model, X_test=X_test, y_test=y_test)
+     result['model'] = model
+     result['X_test'] = X_test
+     result['y_test'] = y_test
+     result['row_indices_test'] = train_res.get('row_indices_test')
+     result['source_training'] = str(train_key)
+     return result
+
+
+ @process()
+ def define_xgb_inference_process(data, hyperparameters):
+     train_key = (hyperparameters or {}).get('train_key', 'xgb_training_a')
+     train_res = data.get(str(train_key), {})
+     model = train_res.get('model')
+     X_test = train_res.get('X_test')
+     y_test = train_res.get('y_test')
+     result = test_inference_classification(model=model, X_test=X_test, y_test=y_test)
+     result['model'] = model
+     result['X_test'] = X_test
+     result['y_test'] = y_test
+     result['row_indices_test'] = train_res.get('row_indices_test')
+     result['source_training'] = str(train_key)
+     return result
+
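The `train_key` hyperparameter is what lets one inference definition serve both branches. A hypothetical wiring (process names assumed to mirror the `nn_inference_a`/`..._b` data keys consumed by the selection processes below):

```python
# Hypothetical per-process hyperparameters: each inference node points at
# the training branch it should evaluate.
nn_inference_a = {'train_key': 'nn_training_a'}
nn_inference_b = {'train_key': 'nn_training_b'}
xgb_inference_a = {'train_key': 'xgb_training_a'}
xgb_inference_b = {'train_key': 'xgb_training_b'}
```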
+
+ @process()
+ def define_select_best_nn_process(data):
+     inf_a = data.get('nn_inference_a', {}) or {}
+     inf_b = data.get('nn_inference_b', {}) or {}
+     f1_a = float(inf_a.get('test_f1', 0.0) or 0.0)
+     f1_b = float(inf_b.get('test_f1', 0.0) or 0.0)
+
+     # Ties go to branch b (>=)
+     best_key = 'nn_training_a'
+     best_f1 = f1_a
+     best_inf = inf_a
+     if f1_b >= f1_a:
+         best_key = 'nn_training_b'
+         best_f1 = f1_b
+         best_inf = inf_b
+
+     return {
+         'model': best_inf.get('model'),
+         'X_test': best_inf.get('X_test'),
+         'y_test': best_inf.get('y_test'),
+         'row_indices_test': best_inf.get('row_indices_test'),
+         'f1': best_f1,
+         'best_key': best_key
+     }
+
+
+ @process()
+ def define_select_best_xgb_process(data):
+     inf_a = data.get('xgb_inference_a', {}) or {}
+     inf_b = data.get('xgb_inference_b', {}) or {}
+     f1_a = float(inf_a.get('test_f1', 0.0) or 0.0)
+     f1_b = float(inf_b.get('test_f1', 0.0) or 0.0)
+
+     best_key = 'xgb_training_a'
+     best_f1 = f1_a
+     best_inf = inf_a
+     if f1_b >= f1_a:
+         best_key = 'xgb_training_b'
+         best_f1 = f1_b
+         best_inf = inf_b
+
+     return {
+         'model': best_inf.get('model'),
+         'X_test': best_inf.get('X_test'),
+         'y_test': best_inf.get('y_test'),
+         'row_indices_test': best_inf.get('row_indices_test'),
+         'f1': best_f1,
+         'best_key': best_key
+     }
+
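The pairwise a/b comparison generalizes to any number of branches; a hypothetical helper (not in the package) that preserves the later-branch-wins-ties behavior of the `>=` above:

```python
# Sketch: select the branch with the highest 'test_f1'; later branches
# win ties, matching the '>=' in the processes above.
def select_best(data, branch_keys):
    best_key, best_f1, best_inf = branch_keys[0], -1.0, {}
    for key in branch_keys:
        inf = data.get(key, {}) or {}
        f1 = float(inf.get('test_f1', 0.0) or 0.0)
        if f1 >= best_f1:
            best_key, best_f1, best_inf = key, f1, inf
    return {'best_key': best_key, 'f1': best_f1,
            'model': best_inf.get('model'),
            'X_test': best_inf.get('X_test'),
            'y_test': best_inf.get('y_test'),
            'row_indices_test': best_inf.get('row_indices_test')}
```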
+
+ @process()
+ def define_nn_best_inference_process(data):
+     sel = data.get('nn_best_selection', {})
+     return test_inference_classification(model=sel.get('model'), X_test=sel.get('X_test'), y_test=sel.get('y_test'))
+
+
+ @process()
+ def define_xgb_best_inference_process(data):
+     sel = data.get('xgb_best_selection', {})
+     return test_inference_classification(model=sel.get('model'), X_test=sel.get('X_test'), y_test=sel.get('y_test'))
+
+
+ @process()
+ def define_ensemble_inference_process(data):
+     lin = data.get('linear_training', {}) or {}
+     xgb_sel = data.get('xgb_best_selection', {}) or {}
+
+     lin_model = lin.get('model')
+     xgb_model = xgb_sel.get('model')
+
+     X_lin = np.asarray(lin.get('X_test') or [], dtype=float)
+     y_true = np.asarray(lin.get('y_test') or [], dtype=int)
+     idx_lin = np.asarray(lin.get('row_indices_test') or [], dtype=int)
+
+     X_xgb = np.asarray(xgb_sel.get('X_test') or [], dtype=float)
+     idx_xgb = np.asarray(xgb_sel.get('row_indices_test') or [], dtype=int)
+
+     # Obtain weights from prior inferences (F1 scores)
+     w_lin = float((data.get('linear_inference', {}) or {}).get('test_f1', 0.0) or 0.0)
+     w_xgb = float((data.get('xgb_best_inference', {}) or {}).get('test_f1', 0.0) or 0.0)
+
+     weights = np.array([w_lin, w_xgb], dtype=float)
+     if not np.isfinite(weights).all() or weights.sum() <= 0:
+         weights = np.array([1.0, 1.0], dtype=float)
+     weights = weights / weights.sum()
+
+     # Predict probabilities, falling back to one-hot hard predictions
+     def _predict_proba_safe(m, X):
+         if m is None or X.size == 0:
+             return None
+         if hasattr(m, 'predict_proba'):
+             p = m.predict_proba(X)
+             if isinstance(p, list):
+                 p = np.stack(p, axis=-1)
+             return p
+         preds = m.predict(X)
+         n_classes = 3
+         return np.eye(n_classes)[preds]
+
+     P_lin = _predict_proba_safe(lin_model, X_lin)
+     P_xgb = _predict_proba_safe(xgb_model, X_xgb)
+
+     # Align the XGB rows to the linear test-row order if the indices differ
+     def _align_to(reference_idx, idx_other, P_other):
+         if P_other is None or reference_idx.size == 0 or idx_other.size == 0:
+             return None
+         if np.array_equal(reference_idx, idx_other):
+             return P_other
+         order = {int(v): i for i, v in enumerate(idx_other.tolist())}
+         aligned = np.zeros_like(P_other)
+         for pos, rid in enumerate(reference_idx.tolist()):
+             j = order.get(int(rid))
+             if j is None:
+                 continue
+             aligned[pos] = P_other[j]
+         return aligned
+
+     P_xgb_aligned = _align_to(idx_lin, idx_xgb, P_xgb) if P_xgb is not None else None
+
+     # Combine probabilities (weighted soft vote)
+     probas = []
+     wlist = []
+     if P_lin is not None:
+         probas.append(P_lin)
+         wlist.append(weights[0])
+     if P_xgb_aligned is not None:
+         probas.append(P_xgb_aligned)
+         wlist.append(weights[1])
+
+     if not probas or y_true.size == 0:
+         try:
+             log_metric('test_accuracy', 0.0)
+             log_metric('test_precision', 0.0)
+             log_metric('test_f1', 0.0)
+         except Exception:
+             pass
+         return {'test_accuracy': 0.0, 'test_precision': 0.0, 'test_f1': 0.0}
+
+     W = np.array(wlist, dtype=float)
+     W = W / W.sum()
+     stacked = np.stack(probas, axis=0)
+     ens = np.tensordot(W, stacked, axes=(0, 0))
+     y_pred = np.argmax(ens, axis=1).astype(int)
+
+     acc = float(accuracy_score(y_true, y_pred))
+     prec = float(precision_score(y_true, y_pred, average='macro', zero_division=0))
+     f1 = float(f1_score(y_true, y_pred, average='macro'))
+     try:
+         log_metric('test_accuracy', acc)
+         log_metric('test_precision', prec)
+         log_metric('test_f1', f1)
+     except Exception:
+         pass
+     return {'test_accuracy': acc, 'test_precision': prec, 'test_f1': f1}
+
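As a sanity check on the weighted soft vote, here is the `tensordot` combination step in isolation (toy numbers, not from the dataset):

```python
# W of shape (2,) combined with stacked per-model probabilities of shape
# (2, n_rows, n_classes) yields (n_rows, n_classes): 0.4*P_lin + 0.6*P_xgb.
import numpy as np

W = np.array([0.4, 0.6])                      # normalized F1-based weights
P_lin = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.3, 0.6]])
P_xgb = np.array([[0.5, 0.3, 0.2],
                  [0.2, 0.2, 0.6]])
stacked = np.stack([P_lin, P_xgb], axis=0)    # shape (2, 2, 3)
ens = np.tensordot(W, stacked, axes=(0, 0))
print(ens)                                    # [[0.58 0.26 0.16] [0.16 0.24 0.6 ]]
print(np.argmax(ens, axis=1))                 # [0 2]
```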
mlops/templates/premier-league/requirements.txt
@@ -0,0 +1,8 @@
+ numpy>=1.24.0
+ pandas>=2.0.0
+ scikit-learn>=1.2.0
+ xgboost>=1.7.0
+
+
+
+
mlops/templates/sklearn-basic/README.md
@@ -0,0 +1,22 @@
+ # Template: `sklearn-basic`
+
+ This is a minimal runnable template for **ExpOps** that uses the **custom** adapter (your code),
+ but trains a tiny **scikit-learn** model inside `models/model.py`.
+
+ What it includes:
+ - `configs/project_config.yaml`: config with a `{{PROJECT_ID}}` placeholder (filled automatically)
+ - `data/train.csv`: tiny dataset with a required `label` column
+ - `models/model.py`: defines `train_model` and `evaluate_model` processes
+ - `charts/plot_metrics.py`: basic matplotlib report (generates PNGs)
+ - `requirements.txt` and `charts/requirements.txt`
+
+ Run it via the CLI:
+
+ ```bash
+ mlops create my-project --template sklearn-basic
+ mlops run my-project
+ ```
+
+ After the run, look under:
+ - `projects/<id>/artifacts/charts/<run-id>/plot_metrics/.../*.png` (generated by the chart node)