phoenix-ml-workflow 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
phoenix_ml/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ '''
2
+ phoenix_ml: A Physics and Hybrid Optimised ENgine for Interpretability and eXplainability for Machine Learning.
3
+
4
+ '''
5
+
6
+ __version__ = "1.0.0"
@@ -0,0 +1,335 @@
1
+ # data_preprocessing.py
2
+ # This module analyses the provided dataset before it undergoes any model evaluation.
3
+ # This includes the test/train split, features-target scatter plots, boxplots, and distance correlation.
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+
9
+ from sklearn.model_selection import train_test_split
10
+ from sklearn.preprocessing import StandardScaler
11
+
12
+ import matplotlib.lines as mlines
13
+ import matplotlib.patches as mpatches
14
+
15
+ import seaborn as sns
16
+ import dcor
17
+
18
def load_and_preprocess_data(filepath, test_size, split_method="random", target_columns=None,
                             random_state=None):
    """
    Load a CSV, choose targets, split into train/test by a chosen method, and standardize features.

    Args:
        filepath (str): Path to CSV.
        test_size (float): Proportion of rows in the test split (0–1).
        split_method (str): 'random' | 'first' | 'last' (case-insensitive).
        target_columns (list[str] | str | None): Columns to treat as targets. A single column
            name given as a string is wrapped in a list. If None, uses the last column.
        random_state (int | None): Forwarded to scikit-learn's train_test_split so a 'random'
            split can be reproduced. Ignored by the 'first' and 'last' methods.

    Returns:
        tuple: (df, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler,
                target_columns, feature_names)

    Raises:
        ValueError: If split_method is not 'random', 'first', or 'last', or if a 'first'/'last'
            split would leave the train or the test set empty.
    """
    df = pd.read_csv(filepath)

    # Automatically make the last column the target variable if none are specified
    if target_columns is None:
        target_columns = list(df.columns[-1:])
    elif isinstance(target_columns, str):
        # A bare string would make df[target_columns] a Series instead of a DataFrame,
        # and len(target_columns) would count characters rather than columns.
        target_columns = [target_columns]

    # Split into features (X) and target variables (y)
    X = df.drop(columns=target_columns)
    y = df[target_columns]
    feature_names = X.columns.tolist()

    # Split data based on the chosen method
    method = split_method.lower()
    if method == "random":
        # Random split using scikit-learn's train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
    elif method in ("first", "last"):
        n_rows = len(X)
        test_count = int(np.ceil(test_size * n_rows))
        # Reject degenerate splits explicitly: with test_count == 0 the slice
        # X.iloc[-0:] would silently select every row as the test set.
        if not 0 < test_count < n_rows:
            raise ValueError(
                f"test_size={test_size} gives {test_count} test rows out of {n_rows}; "
                "both the train and test splits must be non-empty."
            )
        if method == "first":
            # The first 'test_size' proportion of rows is the test set
            test_rows = slice(None, test_count)
            train_rows = slice(test_count, None)
        else:
            # The last 'test_size' proportion of rows is the test set
            boundary = n_rows - test_count
            train_rows = slice(None, boundary)
            test_rows = slice(boundary, None)
        X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
        y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]
    else:
        raise ValueError("split_method must be 'random', 'first', or 'last'.")

    # Standardise features (mean = 0, variance = 1); the scaler is fitted on the
    # training rows only and then applied to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return (df, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler,
            target_columns, feature_names)
70
+
71
def plot_target_vs_target(y_train, y_test, target_columns):
    """
    Plot the first two target variables against each other, one colour per split.

    Training rows are drawn in black and testing rows in red. When fewer than two
    target columns are supplied a notice is printed and nothing is returned.
    """
    if len(target_columns) < 2:
        print("Not enough target variables specified to plot graph of target variables.")
        return None

    # Only the first two targets are used for the axes
    x_name, y_name = target_columns[:2]

    series_specs = (
        (y_train, 'black', f'Training Data (n={len(y_train)})'),
        (y_test, 'red', f'Testing Data (n={len(y_test)})'),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    for frame, colour, caption in series_specs:
        ax.scatter(frame[x_name], frame[y_name], color=colour, alpha=0.5, label=caption)

    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title(f'Train/Test Split of {x_name} vs {y_name}')
    ax.legend()
    fig.tight_layout()
    return fig
90
+
91
def plot_features_vs_targets(X_train, y_train, target_columns):
    """
    For each target, create a grid of scatter plots of every feature vs the target,
    with a simple fitted line (np.polyfit).

    Args:
        X_train (pd.DataFrame): Feature columns to plot on the x-axes.
        y_train (pd.DataFrame): Target columns; indexed by each name in target_columns.
        target_columns (list[str]): Targets to produce one figure for.

    Returns:
        dict: Maps each target name to its matplotlib figure. Empty when X_train has
        no columns (a notice is printed in that case).
    """
    figs = {}

    feature_columns = list(X_train.columns)
    num_features = len(feature_columns)
    if num_features == 0:
        # plt.subplots cannot build a grid with zero rows, so there is nothing to draw.
        print("No feature columns available to plot against the target variables.")
        return figs

    # The grid layout depends only on the feature count, so compute it once.
    num_cols = 3  # Fixed number of columns
    num_rows = (num_features + num_cols - 1) // num_cols  # Ceiling division
    fig_width = 15  # Fixed width
    row_height = 4  # Height per row
    fig_height = min(row_height * num_rows, 50)  # Prevent excessive figure size

    for target_var in target_columns:
        # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(fig_width, fig_height), squeeze=False)
        fig.suptitle(f"Scatter Plots of features against {target_var}", fontsize=16)
        axes = axes.flatten()

        target_values = y_train[target_var]
        y_values = np.asarray(target_values, dtype=float)

        for ax, column in zip(axes, feature_columns):
            feature_values = X_train[column]
            ax.scatter(feature_values, target_values, alpha=0.5)

            # Fit a regression line on the finite points only; skip it when the
            # fit is undefined (fewer than two points or a constant feature).
            x_values = np.asarray(feature_values, dtype=float)
            finite = np.isfinite(x_values) & np.isfinite(y_values)
            if finite.sum() >= 2:
                x_min = x_values[finite].min()
                x_max = x_values[finite].max()
                if x_max > x_min:
                    slope, intercept = np.polyfit(x_values[finite], y_values[finite], 1)
                    x_span = np.array([x_min, x_max])
                    ax.plot(x_span, slope * x_span + intercept, color='red')

            ax.set_xlabel(column)
            ax.set_ylabel(target_var)
            ax.set_title(f'{column} vs {target_var}')

        # Remove empty subplots
        for unused_ax in axes[num_features:]:
            fig.delaxes(unused_ax)

        fig.tight_layout(rect=[0, 0, 1, 0.96])
        figs[target_var] = fig  # Store the figure with key = target name

    return figs
132
+
133
def plot_boxplots(df, target_columns):
    """
    Boxplots for all features + targets, with overlayed mean/median/percentiles legends.

    Args:
        df (pd.DataFrame): Full dataset containing both features and targets.
        target_columns (list[str]): Target column names; they are plotted after the features.

    Returns:
        matplotlib.figure.Figure | None: The figure, or None (after printing a notice)
        when there are no columns to plot.
    """
    # Separate features and targets, then recombine so targets come last
    features = df.drop(columns=target_columns)
    targets = df[target_columns]
    combined_df = pd.concat([features, targets], axis=1)

    all_columns = combined_df.columns.tolist()
    num_columns = len(all_columns)
    if num_columns == 0:
        # plt.subplots cannot build a grid with zero rows, so there is nothing to draw.
        print("No columns available to draw box plots.")
        return None

    # Dynamic layout
    num_cols = 3  # Number of columns for subplots
    num_rows = (num_columns + num_cols - 1) // num_cols  # Ceiling division
    max_height_per_row = 5  # Maximum height for each row of subplots
    fig_height = min(max_height_per_row * num_rows, 50)  # Limit overall height

    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, fig_height), sharey=True, squeeze=False)
    fig.suptitle("Box Plots of Features and Target Variables", fontsize=16)
    axes = axes.flatten()

    for ax, column in zip(axes, all_columns):
        # Drop missing values so the drawn box is built from the same values as
        # the pandas statistics below, which skip NaN by default.
        data = combined_df[column].dropna()

        # Plot the box plot
        ax.boxplot(data, vert=False, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', color='black'),
                   flierprops=dict(marker='o', color='red', alpha=0.5))

        # Statistical information shown in the legend
        median = data.median()
        mean = data.mean()
        q1, q3 = data.quantile([0.25, 0.75])
        p5, p95 = data.quantile([0.05, 0.95])
        min_val, max_val = data.min(), data.max()

        # Draw vertical lines for the mean, median, and percentiles
        ax.axvline(mean, color='red', linestyle='-', linewidth=1.5)
        ax.axvline(median, color='orange', linestyle='-', linewidth=1.5)
        ax.axvline(p5, color='green', linestyle='--', linewidth=1)
        ax.axvline(p95, color='green', linestyle='--', linewidth=1)

        ax.set_title(column, fontsize=12)
        ax.set_xlabel("Value")
        ax.set_yticks([])

        # Custom legend handles (using matplotlib.lines and patches)
        legend_handles = [
            mlines.Line2D([], [], color='red', linestyle='-', linewidth=1.5, label=f"Mean: {mean:.2f}"),
            mlines.Line2D([], [], color='orange', linestyle='-', linewidth=1.5, label=f"Median: {median:.2f}"),
            mpatches.Patch(facecolor='lightblue', edgecolor='black', label=f"IQR: {q1:.2f} to {q3:.2f}"),
            mlines.Line2D([], [], color='green', linestyle='--', linewidth=1, label=f"5th/95th: {p5:.2f}, {p95:.2f}"),
            mlines.Line2D([], [], color='black', linestyle='-', linewidth=1, label=f"Min/Max: {min_val:.2f}, {max_val:.2f}"),
        ]
        ax.legend(handles=legend_handles, loc="upper right", fontsize=8, title="Statistics", title_fontsize='9')

    # Remove any unused subplots
    for unused_ax in axes[num_columns:]:
        fig.delaxes(unused_ax)

    # Lay out this figure explicitly rather than whichever figure pyplot considers current
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    return fig
201
+
202
def plot_distance_correlation_matrix(df, title="Distance Correlation Matrix", cmap='RdYlGn', dummy=False, annotate=True):
    """
    Compute & plot a distance-correlation heatmap for numeric columns.

    Args:
        df (pd.DataFrame): Numeric columns only.
        title (str): Figure title.
        cmap (str): Matplotlib colormap.
        dummy (bool): If True, append a random dummy column to illustrate.
        annotate (bool): Show numeric values on the heatmap.

    Returns:
        (pd.DataFrame, matplotlib.figure.Figure): distance corr matrix and the figure.

    Raises:
        ValueError: If any column is not numeric (boolean columns are rejected too).
    """
    if dummy:
        # Work on a copy so the caller's DataFrame is not given an extra column
        df = df.copy()
        df["Dummy"] = np.random.normal(size=len(df))

    def _is_supported(dtype):
        # pandas' dtype helpers accept extension dtypes (categorical, string, ...),
        # which np.issubdtype cannot interpret and rejects with a TypeError.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if not all(_is_supported(dtype) for dtype in df.dtypes):
        raise ValueError("All columns in the dataset must be numeric for distance correlation calculation.")

    features = df.columns
    n = len(features)
    dist_corr_matrix = np.zeros((n, n))

    # Distance correlation is symmetric in its two arguments, so evaluate the
    # upper triangle (including the diagonal) once and mirror each value.
    for i in range(n):
        column_i = df[features[i]]
        for j in range(i, n):
            value = dcor.distance_correlation(column_i, df[features[j]])
            dist_corr_matrix[i, j] = value
            dist_corr_matrix[j, i] = value

    dist_corr_df = pd.DataFrame(dist_corr_matrix, index=features, columns=features)

    # Grow the figure with the number of columns, with a minimum size
    fig_width = max(12, n * 0.5)
    fig_height = max(10, n * 0.5)

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    sns.heatmap(dist_corr_df, annot=annotate, cmap=cmap, square=True, linewidths=0.5, fmt=".4f",
                annot_kws={"size": 8}, cbar_kws={"shrink": 0.8}, ax=ax)
    # Style the tick labels on the heatmap axes directly instead of relying on
    # pyplot's notion of the current axes.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=8)
    plt.setp(ax.get_yticklabels(), rotation=0, fontsize=8)
    ax.set_title(title, fontsize=14)

    fig.tight_layout()

    return dist_corr_df, fig
247
+
248
+ # Function to actually run the program
249
def run_preprocessing_workflow(
    file_path,
    test_size=0.2,
    split_method="random",
    target_columns=None,
    plot_target_vs_target_enabled=True,
    plot_features_vs_targets_enabled=True,
    plot_boxplots_enabled=True,
    plot_distance_corr_enabled=True,
    figures=None,
):
    """
    Run the full preprocessing workflow: load and split the data, then build the enabled plots.

    Args:
        file_path (str): Path to the CSV dataset.
        test_size (float): Proportion of rows in the test split.
        split_method (str): 'random' | 'first' | 'last'.
        target_columns (list[str] | None): Target columns. If None, the last two columns are used.
        plot_target_vs_target_enabled (bool): Build the target-vs-target scatter plot.
        plot_features_vs_targets_enabled (bool): Build the feature-vs-target scatter grids.
        plot_boxplots_enabled (bool): Build the box plots.
        plot_distance_corr_enabled (bool): Build the distance correlation heatmap
            (computed on the feature columns plus a random dummy column).
        figures (dict | None): Optional dict to add the generated figures to. A new dict
            is created for each call when omitted.

    Returns:
        dict: The dataset, the train/test splits, the scaled features, the scaler, target and
        feature names, the distance correlation matrix (None if disabled), the figures, and
        a "meta" dict describing the dataset and the split.
    """
    # A fresh dict per call: a mutable default would be shared between calls and
    # keep accumulating figures from earlier runs.
    if figures is None:
        figures = {}

    print("\nAvailable columns in the dataset:")
    df_preview = pd.read_csv(file_path)
    print(df_preview.columns.tolist())

    # Default to last 2 columns if not specified
    if target_columns is None:
        target_columns = df_preview.columns[-2:].tolist()

    (
        df, X_train, X_test, y_train, y_test,
        X_train_scaled, X_test_scaled, scaler,
        target_columns, feature_names
    ) = load_and_preprocess_data(
        file_path, test_size=test_size, split_method=split_method, target_columns=target_columns
    )

    # Dataset metadata for the report
    n_rows, n_cols = df.shape
    features = [c for c in df.columns if c not in target_columns]
    train_n, test_n = len(X_train), len(X_test)
    meta = {
        "dataset_path": file_path,
        "n_rows": n_rows,
        "n_cols": n_cols,
        "targets": list(target_columns),
        "features": features,
        "n_features": len(features),
        "split_method": split_method,
        "test_size_param": test_size,
        "train_count": train_n,
        "test_count": test_n,
        "train_prop": train_n / n_rows if n_rows else 0.0,
        "test_prop": test_n / n_rows if n_rows else 0.0,
    }

    print(f"\nDataset has {n_rows} rows and {n_cols} columns.")
    print(f"Using the following target columns: {target_columns}")

    if plot_target_vs_target_enabled:
        print("\nGenerating Target vs Target plot...")
        fig = plot_target_vs_target(y_train, y_test, target_columns)
        # The helper returns None when fewer than two targets are available
        if fig is not None:
            figures["Target vs Target"] = fig

    if plot_features_vs_targets_enabled:
        print("\nGenerating Feature vs Target scatter plots...")
        # X_train is already a DataFrame holding exactly the feature columns
        feature_vs_target_figs = plot_features_vs_targets(X_train, y_train, target_columns)
        figures.update({f"Features vs {target}": fig for target, fig in feature_vs_target_figs.items()})

    if plot_boxplots_enabled:
        print("\nGenerating Box plots...")
        fig = plot_boxplots(df, target_columns)
        if fig is not None:
            figures["Boxplots"] = fig

    dist_corr_df = None
    if plot_distance_corr_enabled:
        print("\nGenerating Distance Correlation Matrix...")
        dist_corr_df, fig = plot_distance_correlation_matrix(df.drop(columns=target_columns), dummy=True)
        if fig is not None:
            figures["Distance Correlation"] = fig

    return {
        "df": df,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "X_train_scaled": X_train_scaled,
        "X_test_scaled": X_test_scaled,
        "scaler": scaler,
        "target_columns": target_columns,
        "feature_names": feature_names,
        "distance_corr_matrix": dist_corr_df,
        "figures": figures,
        "meta": meta,
    }