trainedml 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
trainedml/__init__.py ADDED
@@ -0,0 +1,178 @@
1
+
2
+ """
3
+ Module principal du package trainedml.
4
+
5
+ Ce module expose la classe centrale `Trainer` qui permet de gérer tout le workflow de machine learning :
6
+ chargement de données, séparation train/test, entraînement, évaluation et prédiction.
7
+ Il sert aussi de point d'entrée pour la CLI (ligne de commande).
8
+
9
+ Fonctionnalités principales
10
+ --------------------------
11
+ - API haut niveau pour entraîner, évaluer et prédire avec un modèle ML
12
+ - Supporte les datasets publics (Iris, Wine, etc.) ou des CSV distants
13
+ - Séparation automatique train/test
14
+ - Gestion de plusieurs modèles (KNN, Logistic, Random Forest, etc.)
15
+ - Évaluation standard (accuracy, precision, recall, f1)
16
+ - Peut être utilisé en script, API, ou webapp
17
+
18
+ Exemple
19
+ -------
20
+ >>> from trainedml import Trainer
21
+ >>> trainer = Trainer(dataset="iris", model="knn")
22
+ >>> trainer.fit()
23
+ >>> results = trainer.evaluate()
24
+ >>> print(results)
25
+ >>> preds = trainer.predict([[5.1, 3.5, 1.4, 0.2]])
26
+ >>> print(preds)
27
+ """
28
+
29
+ # Ce fichier permet d'importer le package trainedml
30
+
31
+ # Classe Trainer pour usage API et webapp
32
+ from .data.loader import DataLoader
33
+ from .models import KNNModel, LogisticModel, RandomForestModel, MODEL_MAP, get_model
34
+ from .evaluation import Evaluator
35
+ from sklearn.model_selection import train_test_split
36
+
37
+
38
+
39
class Trainer:
    """
    High-level helper to train, evaluate and predict with a machine-learning model.

    The class bundles the whole workflow: loading the data, splitting it into
    train and test sets, fitting the model, evaluating it and predicting on
    new samples. It is meant to be usable from an API, a web app or a plain
    Python script.

    Parameters
    ----------
    dataset : str, optional
        Name of a known dataset ("iris", "wine", etc.).
    model : str
        Name of the model to use ("random_forest", "knn", "logistic").
        It is used as a key into ``MODEL_MAP``.
    url : str, optional
        URL of a remote CSV file to load.
    target : str, optional
        Name of the target column (when ``url`` is used).
    test_size : float
        Proportion of the data kept for testing (between 0 and 1).
    seed : int
        Random seed used for a reproducible split.

    Attributes
    ----------
    model : BaseModel
        Instance of the ML model in use.
    X_train, X_test, y_train, y_test : array-like
        Train/test split of the data (``None`` until the data is loaded).
    is_fitted : bool
        Whether the model has been trained.

    Examples
    --------
    >>> trainer = Trainer(dataset="iris", model="knn")
    >>> trainer.fit()
    >>> results = trainer.evaluate()
    >>> print(results)
    >>> preds = trainer.predict([[5.1, 3.5, 1.4, 0.2]])
    >>> print(preds)
    """

    def __init__(self, dataset=None, model='random_forest', url=None, target=None, test_size=0.2, seed=42):
        # Where the data comes from; forwarded to the loader by load_data().
        self.dataset = dataset
        self.url = url
        self.target = target

        # How the data is split; forwarded to train_test_split by load_data().
        self.test_size = test_size
        self.seed = seed

        # MODEL_MAP is indexed by the model name; the entry is called with
        # no arguments to build the model instance.
        self.model_name = model
        model_factory = MODEL_MAP[model]
        self.model = model_factory()

        # Nothing is loaded or trained at construction time.
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.is_fitted = False

    def load_data(self):
        """
        Load the data, split it into train/test sets and store the split.

        Returns
        -------
        tuple
            (X_train, X_test, y_train, y_test)

        Raises
        ------
        ValueError
            If the dataset or the target is not specified correctly
            (presumably raised by ``DataLoader.load_dataset``, which is not
            visible from this module).
        """
        features, labels = DataLoader().load_dataset(
            name=self.dataset, url=self.url, target=self.target)
        split = train_test_split(
            features, labels, test_size=self.test_size, random_state=self.seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split
        return self.X_train, self.X_test, self.y_train, self.y_test

    def fit(self):
        """
        Train the model on the training data.

        The data is loaded first when no training set is stored yet.

        Returns
        -------
        self : Trainer
            The current instance (allows call chaining).
        """
        data_missing = self.X_train is None
        if data_missing:
            self.load_data()
        self.model.fit(self.X_train, self.y_train)
        self.is_fitted = True
        return self

    def evaluate(self):
        """
        Evaluate the trained model on the test data.

        Returns
        -------
        dict
            Classification scores (accuracy, precision, recall, f1), as
            produced by ``Evaluator.evaluate_all``.

        Raises
        ------
        RuntimeError
            If the model has not been trained.
        """
        if self.is_fitted:
            predictions = self.model.predict(self.X_test)
            return Evaluator.evaluate_all(self.y_test, predictions)
        raise RuntimeError("Le modèle doit être entraîné avant l'évaluation.")

    def predict(self, X):
        """
        Predict the target for new input data ``X``.

        Parameters
        ----------
        X : array-like
            Input data (same features as the training data).

        Returns
        -------
        array
            Model predictions.

        Raises
        ------
        RuntimeError
            If the model has not been trained.
        """
        if not self.is_fitted:
            raise RuntimeError("Le modèle doit être entraîné avant la prédiction.")
        # numpy is imported at call time; the input is converted to an
        # ndarray before being handed to the model.
        import numpy as np
        return self.model.predict(np.array(X))
170
+
171
+
172
def main():
    """
    CLI entry point of the trainedml package.

    Delegates to ``main`` from the package's ``cli`` module
    (see src/trainedml/cli.py).
    """
    # The CLI module is imported at call time rather than at package import.
    from .cli import main as run_cli
    run_cli()
trainedml/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ """
2
+ Point d'entrée pour l'exécution du package via python -m trainedml.
3
+ """
4
from .cli import main

# Dispatch to the CLI only when this module is the program entry point.
if __name__ == "__main__":
    main()
trainedml/analyzer.py ADDED
@@ -0,0 +1,481 @@
1
+ """
2
+ Data analysis and exploratory statistics for trainedml.
3
+
4
+ This module provides the DataAnalyzer class, which offers a suite of methods for
5
+ descriptive statistics, distribution analysis, correlation, missing values, outliers,
6
+ target analysis, boxplots, bivariate analysis, normality, multicollinearity, and profiling.
7
+
8
+ Mathematical context
9
+ --------------------
10
+ - Correlation: Pearson, Spearman, Kendall
11
+ - Outlier detection: IQR, Z-score
12
+ - Normality: Shapiro-Wilk, D'Agostino, Anderson-Darling
13
+ - Multicollinearity: Variance Inflation Factor (VIF)
14
+
15
+ Examples
16
+ --------
17
+ >>> from trainedml.analyzer import DataAnalyzer
18
+ >>> analyzer = DataAnalyzer(df)
19
+ >>> analyzer.correlation()
20
+ >>> analyzer.outliers()
21
+ """
22
+
23
+ import pandas as pd
24
+ import numpy as np
25
+ import matplotlib.pyplot as plt
26
+ from scipy import stats
27
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
28
+
29
+
30
class DataAnalyzer:
    r"""
    Exploratory data analysis and statistics.

    Provides a suite of methods for descriptive statistics, distribution analysis, correlation,
    missing values, outliers, target analysis, boxplots, bivariate analysis, normality,
    multicollinearity, and profiling.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to analyze.

    Attributes
    ----------
    data : pandas.DataFrame
        The underlying data (stored as given, not copied).

    Examples
    --------
    Basic usage:
    >>> from trainedml.analyzer import DataAnalyzer
    >>> analyzer = DataAnalyzer(df)
    >>> stats = analyzer.distribution()
    >>> print(stats)

    Correlation matrix:
    >>> corr = analyzer.correlation(method='spearman')
    >>> print(corr)

    Outlier detection:
    >>> out = analyzer.outliers(method='zscore', threshold=3)
    >>> print(out)

    Normality tests:
    >>> norm = analyzer.normality()
    >>> print(norm)

    Profiling report:
    >>> report = analyzer.profiling()
    >>> print(report['describe'])

    Notes
    -----
    - All methods return pandas objects or dicts for easy integration with pandas workflows.
    - For plotting, returned objects are matplotlib figures.
    """

    # Below this many non-missing observations the Shapiro-Wilk and
    # Anderson-Darling tests are skipped and reported as None.
    _MIN_NORMALITY_OBS = 3

    def __init__(self, data):
        self.data = data

    def _select_numeric(self, columns='all'):
        """
        Return the numeric columns of the DataFrame.

        Parameters
        ----------
        columns : 'all', str or list-like, default='all'
            'all' selects from every column; a single column name or a
            list-like of names restricts the selection to those columns.

        Returns
        -------
        pandas.DataFrame
            The requested columns whose dtype is numeric.
        """
        # Compare against 'all' only for real strings: on a pandas Index or
        # numpy array `columns == 'all'` is element-wise and cannot be used
        # as an `if` condition.
        if isinstance(columns, str):
            if columns == 'all':
                return self.data.select_dtypes(include=[np.number])
            columns = [columns]
        return self.data[columns].select_dtypes(include=[np.number])

    def distribution(self, columns='all', **kwargs):
        """
        Compute and plot the distribution of variables.

        Parameters
        ----------
        columns : 'all' or list, default='all'
            Columns to analyze.
        **kwargs :
            Additional arguments for plotting. Only ``bins`` (default 30)
            is used.

        Returns
        -------
        dict
            Dictionary with keys 'describe' (summary stats) and 'figure' (histogram grid).
            With no numeric column, 'describe' is an empty DataFrame and
            'figure' is None.

        Examples
        --------
        >>> stats = analyzer.distribution()
        >>> print(stats['describe'])
        >>> stats['figure'].show()
        """
        df_num = self._select_numeric(columns)
        cols = df_num.columns.tolist()
        n = len(cols)
        if n == 0:
            return {'describe': pd.DataFrame(), 'figure': None}

        # Grid of at most 3 histograms per row.
        ncols_plot = min(n, 3)
        nrows_plot = (n + ncols_plot - 1) // ncols_plot
        fig, axes = plt.subplots(nrows_plot, ncols_plot, figsize=(5 * ncols_plot, 4 * nrows_plot))
        # plt.subplots returns a bare Axes for a 1x1 grid; normalise to 1-D.
        axes = np.atleast_1d(axes).flatten()

        bins = kwargs.get('bins', 30)
        for ax, col in zip(axes, cols):
            ax.hist(df_num[col].dropna(), bins=bins, edgecolor='black', alpha=0.7)
            ax.set_title(col)
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
        # Hide unused axes
        for spare in axes[n:]:
            spare.set_visible(False)
        fig.tight_layout()

        return {'describe': df_num.describe(), 'figure': fig}

    def correlation(self, features='all', method='pearson', mask=True, **kwargs):
        r"""
        Compute the correlation matrix between features.

        Parameters
        ----------
        features : 'all' or list, default='all'
            Features to include.
        method : str, default='pearson'
            Correlation method ('pearson', 'spearman', 'kendall').
        mask : bool, default=True
            Whether to mask the upper triangle (strictly above the diagonal)
            with NaN.
        **kwargs :
            Additional arguments for plotting (currently unused).

        Returns
        -------
        pandas.DataFrame
            Correlation matrix.

        Notes
        -----
        Pearson correlation:
        $r_{xy} = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum (x_i - \bar{x})^2 \sum (y_i - \bar{y})^2}}$

        Examples
        --------
        >>> corr = analyzer.correlation()
        >>> print(corr)
        >>> corr = analyzer.correlation(features=['A', 'B'], method='kendall')
        >>> print(corr)
        """
        df_num = self._select_numeric(features)
        corr = df_num.corr(method=method)

        if mask:
            # k=1 keeps the diagonal visible; only entries above it become NaN.
            mask_arr = np.triu(np.ones_like(corr, dtype=bool), k=1)
            corr = corr.where(~mask_arr)

        return corr

    def missing(self, **kwargs):
        """
        Analyze missing values in the dataset.

        Returns
        -------
        pandas.DataFrame
            DataFrame with columns 'count', 'percent' for each column with missing values,
            sorted by descending count.

        Examples
        --------
        >>> missing = analyzer.missing()
        >>> print(missing)
        """
        total = self.data.isnull().sum()
        percent = (total / len(self.data)) * 100
        result = pd.DataFrame({'count': total, 'percent': percent})
        result = result[result['count'] > 0].sort_values('count', ascending=False)
        return result

    def outliers(self, method='iqr', threshold=1.5, **kwargs):
        r"""
        Detect outliers in the numeric columns of the dataset.

        Parameters
        ----------
        method : str, default='iqr'
            Outlier detection method ('iqr', 'zscore').
        threshold : float, default=1.5
            Threshold for outlier detection (IQR multiplier or Z-score cutoff).
        **kwargs :
            Additional arguments (currently unused).

        Returns
        -------
        dict
            Dictionary per column with keys 'count', 'indices', 'lower_bound', 'upper_bound'.

        Raises
        ------
        ValueError
            If ``method`` is neither 'iqr' nor 'zscore'.

        Notes
        -----
        IQR method:
        $Q_1 = 25\%$ percentile, $Q_3 = 75\%$ percentile
        $IQR = Q_3 - Q_1$
        Outlier if $x < Q_1 - k \cdot IQR$ or $x > Q_3 + k \cdot IQR$

        Missing values are dropped per column before detection.

        Examples
        --------
        >>> out = analyzer.outliers()
        >>> print(out)
        >>> out = analyzer.outliers(method='zscore', threshold=3)
        >>> print(out)
        """
        # Validate once, up front, so a bad method is reported even when the
        # data has no numeric column to iterate over.
        if method not in ('iqr', 'zscore'):
            raise ValueError(f"Unknown method: {method}. Use 'iqr' or 'zscore'.")

        df_num = self._select_numeric()
        results = {}

        for col in df_num.columns:
            x = df_num[col].dropna()
            if method == 'iqr':
                q1 = x.quantile(0.25)
                q3 = x.quantile(0.75)
                iqr = q3 - q1
                lower = q1 - threshold * iqr
                upper = q3 + threshold * iqr
                mask = (x < lower) | (x > upper)
            else:
                # Z-score: distance from the mean in units of the sample std.
                mean = x.mean()
                std = x.std()
                lower = mean - threshold * std
                upper = mean + threshold * std
                mask = np.abs((x - mean) / std) > threshold

            results[col] = {
                'count': int(mask.sum()),
                'indices': x[mask].index.tolist(),
                'lower_bound': float(lower),
                'upper_bound': float(upper),
            }

        return results

    def target(self, target_column, **kwargs):
        """
        Analyze the target variable (distribution, imbalance, etc.).

        Parameters
        ----------
        target_column : str
            Name of the target column.
        **kwargs :
            Additional arguments (currently unused).

        Returns
        -------
        dict
            Target analysis summary with keys 'value_counts', 'percent',
            'n_unique', 'dtype' and 'missing'. 'percent' is computed against
            the total number of rows of the column.

        Examples
        --------
        >>> target = analyzer.target(target_column='species')
        >>> print(target)
        """
        y = self.data[target_column]
        counts = y.value_counts()
        percent = (counts / len(y)) * 100
        return {
            'value_counts': counts,
            'percent': percent,
            'n_unique': int(y.nunique()),
            'dtype': str(y.dtype),
            'missing': int(y.isnull().sum()),
        }

    def boxplot(self, columns='all', by=None, **kwargs):
        """
        Generate boxplots for selected columns.

        Parameters
        ----------
        columns : 'all' or list, default='all'
            Columns to plot.
        by : str or None, default=None
            Grouping variable. Ignored when it is not a column of the data.
        **kwargs :
            Additional arguments for plotting. Only ``figsize``
            (default (8, 6)) is used.

        Returns
        -------
        matplotlib.figure.Figure
            The generated boxplot figure.

        Examples
        --------
        >>> fig = analyzer.boxplot(columns=['A', 'B'], by='Group')
        >>> fig.show()
        """
        df_num = self._select_numeric(columns)
        cols = df_num.columns.tolist()

        fig, ax = plt.subplots(figsize=kwargs.get('figsize', (8, 6)))
        if by is not None and by in self.data.columns:
            # Grouped plot is drawn from the full data so `by` is available.
            self.data.boxplot(column=cols, by=by, ax=ax)
            ax.set_title(f'Boxplot grouped by {by}')
        else:
            df_num[cols].boxplot(ax=ax)
            ax.set_title('Boxplot')
        ax.set_ylabel('Value')
        fig.tight_layout()
        return fig

    def bivariate(self, x, y, **kwargs):
        """
        Bivariate analysis between two variables (scatter plot + regression line).

        Parameters
        ----------
        x : str
            First variable.
        y : str
            Second variable.
        **kwargs :
            Additional arguments for plotting. Only ``figsize``
            (default (8, 6)) is used.

        Returns
        -------
        matplotlib.figure.Figure
            The generated bivariate plot.

        Examples
        --------
        >>> fig = analyzer.bivariate(x='A', y='B')
        >>> fig.show()
        """
        fig, ax = plt.subplots(figsize=kwargs.get('figsize', (8, 6)))
        x_data = self.data[x].dropna()
        y_data = self.data[y].dropna()
        # Align indices: keep only rows where both variables are present.
        common = x_data.index.intersection(y_data.index)
        x_data = x_data.loc[common]
        y_data = y_data.loc[common]

        ax.scatter(x_data, y_data, alpha=0.6, edgecolors='k', linewidths=0.5)

        # Linear regression line, drawn only when both variables are numeric.
        if np.issubdtype(x_data.dtype, np.number) and np.issubdtype(y_data.dtype, np.number):
            slope, intercept, r_value, _, _ = stats.linregress(x_data, y_data)
            x_line = np.linspace(x_data.min(), x_data.max(), 100)
            ax.plot(x_line, slope * x_line + intercept, 'r--',
                    label=f'y={slope:.3f}x+{intercept:.3f} (R²={r_value**2:.3f})')
            ax.legend()

        ax.set_xlabel(x)
        ax.set_ylabel(y)
        ax.set_title(f'{x} vs {y}')
        fig.tight_layout()
        return fig

    @staticmethod
    def _anderson_darling(x):
        """Run the Anderson-Darling normality test and return its summary dict."""
        try:
            ad_result = stats.anderson(x, dist='norm', method='interpolate')
        except TypeError:
            # SciPy releases whose `anderson` has no `method` keyword reject
            # the call above; fall back to the plain call, which yields the
            # statistic but no p-value.
            ad_result = stats.anderson(x, dist='norm')
        p_value = getattr(ad_result, 'pvalue', None)
        return {
            'statistic': float(ad_result.statistic),
            'p_value': None if p_value is None else float(p_value),
        }

    def normality(self, columns='all', **kwargs):
        """
        Test normality of variables (Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling).

        Parameters
        ----------
        columns : 'all' or list, default='all'
            Columns to test.
        **kwargs :
            Additional arguments (currently unused).

        Returns
        -------
        dict
            Per column: 'shapiro', 'dagostino' and 'anderson' (each a dict
            with 'statistic' and 'p_value'), plus 'skewness' and 'kurtosis'.
            A test that is not run on a column is reported with both values
            set to None: D'Agostino-Pearson below 20 observations, the other
            two below 3 observations. The Anderson-Darling 'p_value' is also
            None when the installed SciPy does not provide one.

        Examples
        --------
        >>> norm = analyzer.normality()
        >>> print(norm)
        """
        df_num = self._select_numeric(columns)
        results = {}

        for col in df_num.columns:
            x = df_num[col].dropna()
            n_obs = len(x)
            enough_data = n_obs >= self._MIN_NORMALITY_OBS
            col_result = {}

            # Shapiro-Wilk (max 5000 samples; larger columns are subsampled
            # with a fixed seed so the result is reproducible)
            if enough_data:
                sample = x if n_obs <= 5000 else x.sample(5000, random_state=42)
                stat_sw, p_sw = stats.shapiro(sample)
                col_result['shapiro'] = {'statistic': float(stat_sw), 'p_value': float(p_sw)}
            else:
                col_result['shapiro'] = {'statistic': None, 'p_value': None}

            # D'Agostino-Pearson (requires n >= 20)
            if n_obs >= 20:
                stat_da, p_da = stats.normaltest(x)
                col_result['dagostino'] = {'statistic': float(stat_da), 'p_value': float(p_da)}
            else:
                col_result['dagostino'] = {'statistic': None, 'p_value': None}

            # Anderson-Darling
            if enough_data:
                col_result['anderson'] = self._anderson_darling(x)
            else:
                col_result['anderson'] = {'statistic': None, 'p_value': None}

            # Skewness & kurtosis
            col_result['skewness'] = float(x.skew())
            col_result['kurtosis'] = float(x.kurtosis())

            results[col] = col_result

        return results

    def multicollinearity(self, **kwargs):
        r"""
        Analyze multicollinearity using Variance Inflation Factor (VIF).

        Rows containing missing values are dropped before the computation.

        Returns
        -------
        pandas.DataFrame
            VIF per feature with columns 'feature' and 'VIF', sorted by
            descending VIF. Empty when fewer than two numeric columns exist.

        Notes
        -----
        $VIF_j = \frac{1}{1 - R_j^2}$
        where $R_j^2$ is the $R^2$ of regressing feature $j$ on all others.

        Examples
        --------
        >>> vif = analyzer.multicollinearity()
        >>> print(vif)
        """
        df_num = self._select_numeric().dropna()
        if df_num.shape[1] < 2:
            return pd.DataFrame(columns=['feature', 'VIF'])

        # NOTE(review): the raw feature matrix is passed without an added
        # intercept column; confirm against the statsmodels documentation
        # whether a constant should be included for the intended VIF.
        X = df_num.values
        vif_data = pd.DataFrame({
            'feature': df_num.columns,
            'VIF': [variance_inflation_factor(X, i) for i in range(X.shape[1])]
        })
        return vif_data.sort_values('VIF', ascending=False).reset_index(drop=True)

    def profiling(self, **kwargs):
        """
        Generate a global profiling report (summary statistics, missing, outliers, etc.).

        Returns
        -------
        dict
            Profiling report with keys 'describe', 'dtypes', 'missing', 'outliers',
            'correlation', 'shape'. 'missing', 'outliers' and 'correlation'
            are produced by the methods of the same name with their defaults.

        Examples
        --------
        >>> report = analyzer.profiling()
        >>> print(report['describe'])
        """
        return {
            'shape': self.data.shape,
            'dtypes': self.data.dtypes,
            'describe': self.data.describe(include='all'),
            'missing': self.missing(),
            'outliers': self.outliers(),
            'correlation': self.correlation(),
        }