phoenix-ml-workflow 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoenix_ml/__init__.py +6 -0
- phoenix_ml/data_preprocessing.py +335 -0
- phoenix_ml/hyperparameter_optimisation.py +969 -0
- phoenix_ml/interpretability.py +230 -0
- phoenix_ml/model_training.py +178 -0
- phoenix_ml/models.py +93 -0
- phoenix_ml/persistence.py +122 -0
- phoenix_ml/physics_model.py +109 -0
- phoenix_ml/postprocessing.py +400 -0
- phoenix_ml/report_generation.py +812 -0
- phoenix_ml/system_info.py +97 -0
- phoenix_ml/uncertainty_quantification.py +238 -0
- phoenix_ml/workflow.py +273 -0
- phoenix_ml_workflow-1.0.0.dist-info/METADATA +84 -0
- phoenix_ml_workflow-1.0.0.dist-info/RECORD +17 -0
- phoenix_ml_workflow-1.0.0.dist-info/WHEEL +5 -0
- phoenix_ml_workflow-1.0.0.dist-info/top_level.txt +1 -0
phoenix_ml/data_preprocessing.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
# data_preprocessing.py
|
|
2
|
+
# This module analyses the provided dataset before any model evaluation takes place.
|
|
3
|
+
# This includes the test/train split, features-target scatter plots, boxplots, and distance correlation.
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import matplotlib.pyplot as plt
|
|
8
|
+
|
|
9
|
+
from sklearn.model_selection import train_test_split
|
|
10
|
+
from sklearn.preprocessing import StandardScaler
|
|
11
|
+
|
|
12
|
+
import matplotlib.lines as mlines
|
|
13
|
+
import matplotlib.patches as mpatches
|
|
14
|
+
|
|
15
|
+
import seaborn as sns
|
|
16
|
+
import dcor
|
|
17
|
+
|
|
18
|
+
def load_and_preprocess_data(filepath, test_size, split_method="random", target_columns=None, random_state=None):
    """
    Load a CSV, choose targets, split into train/test by a chosen method, and standardise features.

    Args:
        filepath (str): Path to CSV.
        test_size (float): Proportion of rows in the test split. For 'first' and 'last' it must
            lie strictly between 0 and 1; for 'random' it is handed to train_test_split unchanged.
        split_method (str): 'random' | 'first' | 'last' (case-insensitive).
        target_columns (list[str] | None): Columns to treat as targets. If None, uses last column.
        random_state (int | None): Seed forwarded to train_test_split for the 'random' split.
            Not used by 'first' or 'last'. The default (None) leaves the split unseeded.

    Returns:
        tuple: (df, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler,
            target_columns, feature_names)

    Raises:
        ValueError: If split_method is not one of the supported names, or if a 'first'/'last'
            split would leave the train or the test set empty.
    """
    df = pd.read_csv(filepath)

    # Automatically make the last column the target variable if none are specified
    if target_columns is None:
        target_columns = df.columns[-1:]

    # Split into features (X) and target variables (y)
    X = df.drop(columns=target_columns)
    y = df[target_columns]
    feature_names = X.columns.tolist()

    # Split data based on the chosen method
    method = split_method.lower()
    if method == "random":
        # Random split using scikit-learn's train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
    elif method in ("first", "last"):
        n_rows = len(X)
        # Reject bad proportions up front: a negative value would otherwise swap the two
        # sets through negative slicing, and 0 / >= 1 would leave one of them empty.
        if not 0 < test_size < 1:
            raise ValueError(
                "test_size must be between 0 and 1 (exclusive) when split_method is 'first' or 'last'."
            )
        test_count = int(np.ceil(test_size * n_rows))
        if test_count >= n_rows:
            raise ValueError(
                f"test_size={test_size} leaves no training rows for a dataset of {n_rows} row(s)."
            )

        # Use an explicit boundary instead of negative indices, so a zero count can never
        # turn `[-0:]` into "every row".
        boundary = test_count if method == "first" else n_rows - test_count
        head, tail = slice(None, boundary), slice(boundary, None)
        test_rows, train_rows = (head, tail) if method == "first" else (tail, head)

        X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]
        X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
    else:
        raise ValueError("split_method must be 'random', 'first', or 'last'.")

    # Standardise features; the scaler is fitted on the training rows only and then
    # applied to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return df, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler, target_columns, feature_names
|
|
70
|
+
|
|
71
|
+
def plot_target_vs_target(y_train, y_test, target_columns):
    """
    Plot the first two target variables against each other, with the training rows
    drawn in black and the testing rows drawn in red.

    Args:
        y_train: Training targets, indexable by target name.
        y_test: Testing targets, indexable by target name.
        target_columns: Target names; only the first two are used.

    Returns:
        The matplotlib figure, or None (after printing a notice) when fewer than
        two target names are supplied.
    """
    has_pair = len(target_columns) >= 2
    if not has_pair:
        print("Not enough target variables specified to plot graph of target variables.")
        return

    # Only the leading pair of targets is drawn
    x_name, y_name = target_columns[:2]

    fig, ax = plt.subplots(figsize=(10, 6))

    train_label = f'Training Data (n={len(y_train)})'
    test_label = f'Testing Data (n={len(y_test)})'
    # Training points are drawn first so the testing points sit on top of them
    for frame, colour, label in ((y_train, 'black', train_label), (y_test, 'red', test_label)):
        ax.scatter(frame[x_name], frame[y_name], color=colour, alpha=0.5, label=label)

    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title(f'Train/Test Split of {x_name} vs {y_name}')
    ax.legend()

    fig.tight_layout()
    return fig
|
|
90
|
+
|
|
91
|
+
def plot_features_vs_targets(X_train, y_train, target_columns):
    """
    For each target, create a grid of scatter plots of every feature vs the target,
    with a simple fitted line (np.polyfit).

    Args:
        X_train (pd.DataFrame): Training features; one subplot is drawn per column.
        y_train (pd.DataFrame): Training targets, indexed by the names in target_columns.
        target_columns (list[str]): Targets to plot; one figure is produced per target.

    Returns:
        dict: Maps each target name to its matplotlib figure. Empty (after printing a
            notice) when X_train has no columns.
    """
    feature_columns = list(X_train.columns)
    num_features = len(feature_columns)
    if num_features == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        print("No feature columns available to plot against the target variables.")
        return {}

    # The grid layout depends only on the number of features, so compute it once.
    num_cols = 3  # Fixed number of columns
    num_rows = (num_features + num_cols - 1) // num_cols  # Ceiling division
    fig_width = 15  # Fixed width
    row_height = 4  # Height per row
    fig_height = min(row_height * num_rows, 50)  # Prevent excessive figure size

    figs = {}

    for target_var in target_columns:
        # squeeze=False always yields a 2-D axes array, whatever the grid shape.
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(fig_width, fig_height), squeeze=False)
        fig.suptitle(f"Scatter Plots of features against {target_var}", fontsize=16)
        axes = axes.flatten()

        y_values = np.asarray(y_train[target_var], dtype=float)

        for ax, column in zip(axes, feature_columns):
            x_values = np.asarray(X_train[column], dtype=float)
            ax.scatter(x_values, y_values, alpha=0.5)

            # Fit the regression line on finite pairs only, and skip it when the feature
            # has fewer than two distinct values (the slope is undefined there).
            usable = np.isfinite(x_values) & np.isfinite(y_values)
            x_fit = x_values[usable]
            if np.unique(x_fit).size >= 2:
                slope, intercept = np.polyfit(x_fit, y_values[usable], 1)
                ax.plot(x_fit, slope * x_fit + intercept, color='red')

            ax.set_xlabel(column)
            ax.set_ylabel(target_var)
            ax.set_title(f'{column} vs {target_var}')

        # Remove the grid cells that have no feature assigned to them
        for unused_ax in axes[num_features:]:
            fig.delaxes(unused_ax)

        # Leave the top 4% of the figure free for the suptitle
        fig.tight_layout(rect=[0, 0, 1, 0.96])
        figs[target_var] = fig  # Store the figure with key = target name

    return figs
|
|
132
|
+
|
|
133
|
+
def plot_boxplots(df, target_columns):
    """
    Boxplots for all features + targets, with overlayed mean/median/percentiles legends.

    Args:
        df (pd.DataFrame): Full dataset containing both feature and target columns.
        target_columns (list[str]): Names of the target columns; they are drawn after
            the feature columns.

    Returns:
        matplotlib.figure.Figure | None: One horizontal box plot per column, laid out on a
            three-column grid. None (after printing a notice) when there are no columns.
    """
    # Separate features and targets, then recombine so the targets come last
    features = df.drop(columns=target_columns)
    targets = df[target_columns]
    combined_df = pd.concat([features, targets], axis=1)

    all_columns = combined_df.columns.tolist()
    num_columns = len(all_columns)
    if num_columns == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        print("No columns available to draw box plots.")
        return None

    # Dynamic layout
    num_cols = 3  # Number of columns for subplots
    num_rows = (num_columns + num_cols - 1) // num_cols  # Ceiling division
    max_height_per_row = 5  # Maximum height for each row of subplots
    fig_height = min(max_height_per_row * num_rows, 50)  # Limit overall height

    # squeeze=False always yields a 2-D axes array, whatever the grid shape.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, fig_height), sharey=True, squeeze=False)
    fig.suptitle("Box Plots of Features and Target Variables", fontsize=16)
    axes = axes.flatten()

    for ax, column in zip(axes, all_columns):
        data = combined_df[column]

        # Draw the box from the observed values only. The pandas statistics below already
        # skip missing values, so this keeps the box and the legend in agreement.
        ax.boxplot(data.dropna().to_numpy(), vert=False, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', color='black'),
                   flierprops=dict(marker='o', color='red', alpha=0.5))

        # Summary statistics shown in the legend
        median = data.median()
        mean = data.mean()
        q1, q3 = data.quantile([0.25, 0.75])
        p5, p95 = data.quantile([0.05, 0.95])
        min_val, max_val = data.min(), data.max()

        # Draw vertical lines for the mean, median, and percentiles
        ax.axvline(mean, color='red', linestyle='-', linewidth=1.5)
        ax.axvline(median, color='orange', linestyle='-', linewidth=1.5)
        ax.axvline(p5, color='green', linestyle='--', linewidth=1)
        ax.axvline(p95, color='green', linestyle='--', linewidth=1)

        ax.set_title(column, fontsize=12)
        ax.set_xlabel("Value")
        ax.set_yticks([])

        # Custom legend handles (proxy artists that carry the numeric values as labels)
        legend_handles = [
            mlines.Line2D([], [], color='red', linestyle='-', linewidth=1.5, label=f"Mean: {mean:.2f}"),
            mlines.Line2D([], [], color='orange', linestyle='-', linewidth=1.5, label=f"Median: {median:.2f}"),
            mpatches.Patch(facecolor='lightblue', edgecolor='black', label=f"IQR: {q1:.2f} to {q3:.2f}"),
            mlines.Line2D([], [], color='green', linestyle='--', linewidth=1, label=f"5th/95th: {p5:.2f}, {p95:.2f}"),
            mlines.Line2D([], [], color='black', linestyle='-', linewidth=1, label=f"Min/Max: {min_val:.2f}, {max_val:.2f}"),
        ]
        ax.legend(handles=legend_handles, loc="upper right", fontsize=8, title="Statistics", title_fontsize='9')

    # Remove any unused subplots
    for unused_ax in axes[num_columns:]:
        fig.delaxes(unused_ax)

    # Lay out this figure explicitly (not whichever figure is current), leaving the
    # top 4% free for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    return fig
|
|
201
|
+
|
|
202
|
+
def plot_distance_correlation_matrix(df, title="Distance Correlation Matrix", cmap='RdYlGn', dummy=False, annotate=True):
    """
    Compute & plot a distance-correlation heatmap for numeric columns.

    Args:
        df (pd.DataFrame): Numeric columns only.
        title (str): Figure title.
        cmap (str): Matplotlib colormap.
        dummy (bool): If True, append a random dummy column to illustrate. The input
            frame is copied first, so the caller's data is not modified.
        annotate (bool): Show numeric values on the heatmap.

    Returns:
        (pd.DataFrame, matplotlib.figure.Figure): distance corr matrix and the figure.

    Raises:
        ValueError: If any column is not numeric (boolean columns count as non-numeric).
    """
    if dummy:
        df = df.copy()
        df["Dummy"] = np.random.normal(size=len(df))

    # Use the pandas dtype helpers so extension dtypes (categorical, string, nullable)
    # are rejected with the ValueError below rather than failing inside NumPy.
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    if not all(is_numeric(dtype) and not is_bool(dtype) for dtype in df.dtypes):
        raise ValueError("All columns in the dataset must be numeric for distance correlation calculation.")

    features = df.columns
    n = len(features)
    dist_corr_matrix = np.zeros((n, n))

    # The measure does not depend on argument order, so evaluate the diagonal and
    # upper triangle only and mirror each value into the lower triangle.
    columns = [df[name] for name in features]
    for i in range(n):
        for j in range(i, n):
            value = dcor.distance_correlation(columns[i], columns[j])
            dist_corr_matrix[i, j] = value
            dist_corr_matrix[j, i] = value

    dist_corr_df = pd.DataFrame(dist_corr_matrix, index=features, columns=features)

    # Grow the figure with the number of columns, with a 12 x 10 minimum
    fig_width = max(12, n * 0.5)
    fig_height = max(10, n * 0.5)

    # Work on explicit figure/axes objects rather than pyplot's "current" ones
    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    sns.heatmap(dist_corr_df, annot=annotate, cmap=cmap, square=True, linewidths=0.5, fmt=".4f",
                annot_kws={"size": 8}, cbar_kws={"shrink": 0.8}, ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=8)
    plt.setp(ax.get_yticklabels(), rotation=0, fontsize=8)
    ax.set_title(title, fontsize=14)

    fig.tight_layout()

    return dist_corr_df, fig
|
|
247
|
+
|
|
248
|
+
# Function to actually run the program
|
|
249
|
+
def run_preprocessing_workflow(
    file_path,
    test_size=0.2,
    split_method="random",
    target_columns=None,
    plot_target_vs_target_enabled=True,
    plot_features_vs_targets_enabled=True,
    plot_boxplots_enabled=True,
    plot_distance_corr_enabled=True,
    figures=None,
):
    """
    Run the preprocessing stage: load and split the dataset, then build the enabled plots.

    Args:
        file_path (str): Path to the CSV dataset.
        test_size (float): Proportion of rows in the test split.
        split_method (str): 'random' | 'first' | 'last'.
        target_columns (list[str] | None): Target column names. If None, the last two
            columns of the CSV are used.
        plot_target_vs_target_enabled (bool): Build the target-vs-target scatter plot.
        plot_features_vs_targets_enabled (bool): Build one feature-vs-target grid per target.
        plot_boxplots_enabled (bool): Build the box plots.
        plot_distance_corr_enabled (bool): Build the distance-correlation heatmap.
        figures (dict | None): Dictionary to add the generated figures to. If None, a new
            dictionary is created for this call; a supplied dictionary is updated in place.

    Returns:
        dict: Keys "df", "X_train", "X_test", "y_train", "y_test", "X_train_scaled",
            "X_test_scaled", "scaler", "target_columns", "feature_names",
            "distance_corr_matrix" (None when that plot is disabled), "figures" and "meta".
    """
    # Create the container per call: a dict used as the default value would be shared
    # between calls and carry figures over from earlier runs.
    if figures is None:
        figures = {}

    print("\nAvailable columns in the dataset:")
    # Only the header is needed here; the full file is loaded once further down.
    df_preview = pd.read_csv(file_path, nrows=0)
    print(df_preview.columns.tolist())

    # Default to last 2 columns if not specified
    if target_columns is None:
        target_columns = df_preview.columns[-2:].tolist()

    (
        df, X_train, X_test, y_train, y_test,
        X_train_scaled, X_test_scaled, scaler,
        target_columns, feature_names
    ) = load_and_preprocess_data(
        file_path, test_size=test_size, split_method=split_method, target_columns=target_columns
    )

    # Dataset metadata for the report
    n_rows, n_cols = df.shape
    features = [c for c in df.columns if c not in target_columns]
    train_n, test_n = len(X_train), len(X_test)
    meta = {
        "dataset_path": file_path,
        "n_rows": n_rows,
        "n_cols": n_cols,
        "targets": list(target_columns),
        "features": features,
        "n_features": len(features),
        "split_method": split_method,
        "test_size_param": test_size,
        "train_count": train_n,
        "test_count": test_n,
        # Guard against division by zero on an empty dataset
        "train_prop": train_n / n_rows if n_rows else 0.0,
        "test_prop": test_n / n_rows if n_rows else 0.0,
    }

    print(f"\nDataset has {n_rows} rows and {n_cols} columns.")
    print(f"Using the following target columns: {target_columns}")

    if plot_target_vs_target_enabled:
        print("\nGenerating Target vs Target plot...")
        fig = plot_target_vs_target(y_train, y_test, target_columns)
        # The helper returns None when fewer than two targets are available
        if fig:
            figures["Target vs Target"] = fig

    if plot_features_vs_targets_enabled:
        print("\nGenerating Feature vs Target scatter plots...")
        # X_train already holds exactly the non-target columns, so it is passed as is
        feature_vs_target_figs = plot_features_vs_targets(X_train, y_train, target_columns)
        figures.update({f"Features vs {target}": fig for target, fig in feature_vs_target_figs.items()})

    if plot_boxplots_enabled:
        print("\nGenerating Box plots...")
        fig = plot_boxplots(df, target_columns)
        if fig:
            figures["Boxplots"] = fig

    dist_corr_df = None
    if plot_distance_corr_enabled:
        print("\nGenerating Distance Correlation Matrix...")
        # Computed on the feature columns of the full dataset, plus a random dummy column
        dist_corr_df, fig = plot_distance_correlation_matrix(df.drop(columns=target_columns), dummy=True)
        if fig:
            figures["Distance Correlation"] = fig

    return {
        "df": df,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "X_train_scaled": X_train_scaled,
        "X_test_scaled": X_test_scaled,
        "scaler": scaler,
        "target_columns": target_columns,
        "feature_names": feature_names,
        "distance_corr_matrix": dist_corr_df,
        "figures": figures,
        "meta": meta,
    }
|