trainedml 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trainedml/__init__.py +178 -0
- trainedml/__main__.py +7 -0
- trainedml/analyzer.py +481 -0
- trainedml/benchmark.py +251 -0
- trainedml/cli.py +183 -0
- trainedml/data/__init__.py +6 -0
- trainedml/data/loader.py +206 -0
- trainedml/evaluation.py +172 -0
- trainedml/figure.py +209 -0
- trainedml/models/__init__.py +84 -0
- trainedml/models/base.py +182 -0
- trainedml/models/factory.py +54 -0
- trainedml/models/knn.py +93 -0
- trainedml/models/logistic.py +102 -0
- trainedml/models/random_forest.py +101 -0
- trainedml/models/regressors.py +132 -0
- trainedml/utils/__init__.py +5 -0
- trainedml/utils/factory.py +13 -0
- trainedml/visualization.py +487 -0
- trainedml/viz/__init__.py +20 -0
- trainedml/viz/bivariate.py +68 -0
- trainedml/viz/boxplot.py +64 -0
- trainedml/viz/correlation.py +102 -0
- trainedml/viz/distribution.py +63 -0
- trainedml/viz/heatmap.py +102 -0
- trainedml/viz/histogram.py +98 -0
- trainedml/viz/line.py +56 -0
- trainedml/viz/missing.py +57 -0
- trainedml/viz/multicollinearity.py +70 -0
- trainedml/viz/normality.py +81 -0
- trainedml/viz/outliers.py +92 -0
- trainedml/viz/profiling.py +59 -0
- trainedml/viz/target.py +26 -0
- trainedml/viz/vizs.py +109 -0
- trainedml-0.1.1.dist-info/METADATA +145 -0
- trainedml-0.1.1.dist-info/RECORD +40 -0
- trainedml-0.1.1.dist-info/WHEEL +5 -0
- trainedml-0.1.1.dist-info/entry_points.txt +2 -0
- trainedml-0.1.1.dist-info/licenses/LICENCE +21 -0
- trainedml-0.1.1.dist-info/top_level.txt +1 -0
trainedml/__init__.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Module principal du package trainedml.
|
|
4
|
+
|
|
5
|
+
Ce module expose la classe centrale `Trainer` qui permet de gérer tout le workflow de machine learning :
|
|
6
|
+
chargement de données, séparation train/test, entraînement, évaluation et prédiction.
|
|
7
|
+
Il sert aussi de point d'entrée pour la CLI (ligne de commande).
|
|
8
|
+
|
|
9
|
+
Fonctionnalités principales
|
|
10
|
+
--------------------------
|
|
11
|
+
- API haut niveau pour entraîner, évaluer et prédire avec un modèle ML
|
|
12
|
+
- Supporte les datasets publics (Iris, Wine, etc.) ou des CSV distants
|
|
13
|
+
- Séparation automatique train/test
|
|
14
|
+
- Gestion de plusieurs modèles (KNN, Logistic, Random Forest, etc.)
|
|
15
|
+
- Évaluation standard (accuracy, precision, recall, f1)
|
|
16
|
+
- Peut être utilisé en script, API, ou webapp
|
|
17
|
+
|
|
18
|
+
Exemple
|
|
19
|
+
-------
|
|
20
|
+
>>> from trainedml import Trainer
|
|
21
|
+
>>> trainer = Trainer(dataset="iris", model="knn")
|
|
22
|
+
>>> trainer.fit()
|
|
23
|
+
>>> results = trainer.evaluate()
|
|
24
|
+
>>> print(results)
|
|
25
|
+
>>> preds = trainer.predict([[5.1, 3.5, 1.4, 0.2]])
|
|
26
|
+
>>> print(preds)
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
# Ce fichier permet d'importer le package trainedml
|
|
30
|
+
|
|
31
|
+
# Classe Trainer pour usage API et webapp
|
|
32
|
+
from .data.loader import DataLoader
|
|
33
|
+
from .models import KNNModel, LogisticModel, RandomForestModel, MODEL_MAP, get_model
|
|
34
|
+
from .evaluation import Evaluator
|
|
35
|
+
from sklearn.model_selection import train_test_split
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Trainer:
    r"""
    High-level facade to train, evaluate and predict with a machine-learning model.

    The class bundles the whole workflow in one object: loading the data,
    splitting it into train and test parts, fitting the model, scoring it and
    predicting on new samples. It is meant to be usable from an API, a web
    application or a plain Python script.

    Parameters
    ----------
    dataset : str, optional
        Name of a known dataset ("iris", "wine", etc.).
    model : str
        Key of the model to instantiate ("random_forest", "knn", "logistic").
        It is looked up in ``MODEL_MAP``.
    url : str, optional
        URL of a remote CSV file to load.
    target : str, optional
        Name of the target column (when ``url`` is used).
    test_size : float
        Share of the data kept for testing (between 0 and 1).
    seed : int
        Random seed forwarded to the train/test split for reproducibility.

    Attributes
    ----------
    model : BaseModel
        Instance of the selected model.
    X_train, X_test, y_train, y_test : array-like
        The split data; ``None`` until ``load_data`` has run.
    is_fitted : bool
        Whether ``fit`` has completed.

    Examples
    --------
    >>> trainer = Trainer(dataset="iris", model="knn")
    >>> trainer.fit()
    >>> results = trainer.evaluate()
    >>> print(results)
    >>> preds = trainer.predict([[5.1, 3.5, 1.4, 0.2]])
    >>> print(preds)
    """

    def __init__(self, dataset=None, model='random_forest', url=None, target=None, test_size=0.2, seed=42):
        # Data source description.
        self.dataset = dataset
        self.url = url
        self.target = target

        # Split configuration.
        self.test_size = test_size
        self.seed = seed

        # Model: keep the requested key and build an instance from the registry.
        self.model_name = model
        model_factory = MODEL_MAP[model]
        self.model = model_factory()

        # Nothing is loaded or trained yet.
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.is_fitted = False

    def load_data(self):
        """
        Load the data, split it into train and test parts and store the parts.

        Returns
        -------
        tuple
            (X_train, X_test, y_train, y_test)

        Raises
        ------
        ValueError
            If the dataset or the target is not specified correctly
            (presumably raised by the loader; confirm against DataLoader).
        """
        features, labels = DataLoader().load_dataset(
            name=self.dataset,
            url=self.url,
            target=self.target,
        )
        parts = train_test_split(
            features,
            labels,
            test_size=self.test_size,
            random_state=self.seed,
        )
        self.X_train, self.X_test, self.y_train, self.y_test = parts
        return self.X_train, self.X_test, self.y_train, self.y_test

    def fit(self):
        """
        Train the model on the training data, loading the data first if needed.

        Returns
        -------
        self : Trainer
            The current instance, so calls can be chained.
        """
        data_missing = self.X_train is None
        if data_missing:
            self.load_data()

        self.model.fit(self.X_train, self.y_train)
        self.is_fitted = True
        return self

    def evaluate(self):
        """
        Score the trained model on the test data.

        Returns
        -------
        dict
            Whatever ``Evaluator.evaluate_all`` returns for the test labels and
            the predictions (documented upstream as accuracy, precision,
            recall and f1).

        Raises
        ------
        RuntimeError
            If the model has not been trained.
        """
        if self.is_fitted:
            predictions = self.model.predict(self.X_test)
            return Evaluator.evaluate_all(self.y_test, predictions)
        raise RuntimeError("Le modèle doit être entraîné avant l'évaluation.")

    def predict(self, X):
        """
        Predict the target for new samples.

        Parameters
        ----------
        X : array-like
            Input data (same features as used for training). It is converted
            to a NumPy array before being handed to the model.

        Returns
        -------
        array
            Predictions returned by the model.

        Raises
        ------
        RuntimeError
            If the model has not been trained.
        """
        if self.is_fitted:
            import numpy as np

            samples = np.array(X)
            return self.model.predict(samples)
        raise RuntimeError("Le modèle doit être entraîné avant la prédiction.")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def main():
    """
    Command-line entry point of the trainedml package.

    Delegates to the ``main`` function of the package's ``cli`` module. The
    import is done inside the function, so the CLI module is only loaded
    when this entry point is actually called.
    """
    from .cli import main as run_cli

    run_cli()
|
trainedml/__main__.py
ADDED
trainedml/analyzer.py
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data analysis and exploratory statistics for trainedml.
|
|
3
|
+
|
|
4
|
+
This module provides the DataAnalyzer class, which offers a suite of methods for
|
|
5
|
+
descriptive statistics, distribution analysis, correlation, missing values, outliers,
|
|
6
|
+
target analysis, boxplots, bivariate analysis, normality, multicollinearity, and profiling.
|
|
7
|
+
|
|
8
|
+
Mathematical context
|
|
9
|
+
--------------------
|
|
10
|
+
- Correlation: Pearson, Spearman, Kendall
|
|
11
|
+
- Outlier detection: IQR, Z-score
|
|
12
|
+
- Normality: Shapiro-Wilk, D'Agostino, Anderson-Darling
|
|
13
|
+
- Multicollinearity: Variance Inflation Factor (VIF)
|
|
14
|
+
|
|
15
|
+
Examples
|
|
16
|
+
--------
|
|
17
|
+
>>> from trainedml.analyzer import DataAnalyzer
|
|
18
|
+
>>> analyzer = DataAnalyzer(df)
|
|
19
|
+
>>> analyzer.correlation()
|
|
20
|
+
>>> analyzer.outliers()
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import pandas as pd
|
|
24
|
+
import numpy as np
|
|
25
|
+
import matplotlib.pyplot as plt
|
|
26
|
+
from scipy import stats
|
|
27
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DataAnalyzer:
    r"""
    Exploratory data analysis and statistics.

    Provides a suite of methods for descriptive statistics, distribution analysis, correlation,
    missing values, outliers, target analysis, boxplots, bivariate analysis, normality,
    multicollinearity, and profiling.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to analyze.

    Attributes
    ----------
    data : pandas.DataFrame
        The underlying data (stored by reference, not copied).

    Examples
    --------
    Basic usage:
    >>> from trainedml.analyzer import DataAnalyzer
    >>> analyzer = DataAnalyzer(df)
    >>> stats = analyzer.distribution()
    >>> print(stats)

    Correlation matrix:
    >>> corr = analyzer.correlation(method='spearman')
    >>> print(corr)

    Outlier detection:
    >>> out = analyzer.outliers(method='zscore', threshold=3)
    >>> print(out)

    Normality tests:
    >>> norm = analyzer.normality()
    >>> print(norm)

    Profiling report:
    >>> report = analyzer.profiling()
    >>> print(report['describe'])

    Notes
    -----
    - All methods return pandas objects or dicts for easy integration with pandas workflows.
    - For plotting, returned objects are matplotlib figures.
    """

    def __init__(self, data):
        self.data = data

    def _select_numeric(self, columns='all'):
        """
        Return the numeric columns of the DataFrame.

        Parameters
        ----------
        columns : 'all', str or list-like, default='all'
            ``'all'`` keeps every numeric column; a single column name or a
            list-like of names restricts the selection first.

        Returns
        -------
        pandas.DataFrame
            The selected columns, non-numeric ones removed.
        """
        # Only compare against 'all' when `columns` really is a string:
        # `array == 'all'` is element-wise and has no single truth value.
        if isinstance(columns, str):
            if columns == 'all':
                return self.data.select_dtypes(include=[np.number])
            columns = [columns]
        return self.data[columns].select_dtypes(include=[np.number])

    def distribution(self, columns='all', **kwargs):
        """
        Compute and plot the distribution of variables.

        Parameters
        ----------
        columns : 'all' or list, default='all'
            Columns to analyze.
        **kwargs :
            Additional arguments for plotting. Only ``bins`` (default 30) is read.

        Returns
        -------
        dict
            Dictionary with keys 'describe' (summary stats) and 'figure' (histogram grid).
            When no numeric column is selected, 'describe' is an empty DataFrame
            and 'figure' is None.

        Examples
        --------
        >>> stats = analyzer.distribution()
        >>> print(stats['describe'])
        >>> stats['figure'].show()
        """
        df_num = self._select_numeric(columns)
        cols = df_num.columns.tolist()
        n = len(cols)
        if n == 0:
            return {'describe': pd.DataFrame(), 'figure': None}

        # Grid of at most three histograms per row.
        ncols_plot = min(n, 3)
        nrows_plot = (n + ncols_plot - 1) // ncols_plot
        fig, axes = plt.subplots(nrows_plot, ncols_plot, figsize=(5 * ncols_plot, 4 * nrows_plot))
        # plt.subplots returns a bare Axes for a 1x1 grid; normalise to a flat array.
        axes = np.atleast_1d(axes).flatten()

        bins = kwargs.get('bins', 30)
        for ax, col in zip(axes, cols):
            ax.hist(df_num[col].dropna(), bins=bins, edgecolor='black', alpha=0.7)
            ax.set_title(col)
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
        # Hide the grid cells left over when n is not a multiple of the row width.
        for spare in axes[n:]:
            spare.set_visible(False)
        fig.tight_layout()

        return {'describe': df_num.describe(), 'figure': fig}

    def correlation(self, features='all', method='pearson', mask=True, **kwargs):
        r"""
        Compute the correlation matrix between features.

        Parameters
        ----------
        features : 'all' or list, default='all'
            Features to include.
        method : str, default='pearson'
            Correlation method ('pearson', 'spearman', 'kendall').
        mask : bool, default=True
            Whether to mask the upper triangle. Masked cells (strictly above
            the diagonal) are returned as NaN.
        **kwargs :
            Accepted for interface compatibility; currently unused.

        Returns
        -------
        pandas.DataFrame
            Correlation matrix.

        Notes
        -----
        Pearson correlation:
        $r_{xy} = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum (x_i - \bar{x})^2 \sum (y_i - \bar{y})^2}}$

        Examples
        --------
        >>> corr = analyzer.correlation()
        >>> print(corr)
        >>> corr = analyzer.correlation(features=['A', 'B'], method='kendall')
        >>> print(corr)
        """
        df_num = self._select_numeric(features)
        corr = df_num.corr(method=method)

        if mask:
            # k=1 keeps the diagonal and blanks only the redundant upper half.
            upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
            corr = corr.where(~upper)

        return corr

    def missing(self, **kwargs):
        """
        Analyze missing values in the dataset.

        Returns
        -------
        pandas.DataFrame
            DataFrame with columns 'count', 'percent' for each column with missing values,
            sorted by 'count' in descending order.

        Examples
        --------
        >>> missing = analyzer.missing()
        >>> print(missing)
        """
        total = self.data.isnull().sum()
        percent = (total / len(self.data)) * 100
        result = pd.DataFrame({'count': total, 'percent': percent})
        result = result[result['count'] > 0].sort_values('count', ascending=False)
        return result

    def outliers(self, method='iqr', threshold=1.5, **kwargs):
        r"""
        Detect outliers in the numeric columns of the dataset.

        Parameters
        ----------
        method : str, default='iqr'
            Outlier detection method ('iqr', 'zscore').
        threshold : float, default=1.5
            Threshold for outlier detection (IQR multiplier or Z-score cutoff).
        **kwargs :
            Accepted for interface compatibility; currently unused.

        Returns
        -------
        dict
            Dictionary per column with keys 'count', 'indices', 'lower_bound', 'upper_bound'.

        Raises
        ------
        ValueError
            If ``method`` is neither 'iqr' nor 'zscore'.

        Notes
        -----
        IQR method:
        $Q_1 = 25\%$ percentile, $Q_3 = 75\%$ percentile
        $IQR = Q_3 - Q_1$
        Outlier if $x < Q_1 - k \cdot IQR$ or $x > Q_3 + k \cdot IQR$

        Examples
        --------
        >>> out = analyzer.outliers()
        >>> print(out)
        >>> out = analyzer.outliers(method='zscore', threshold=3)
        >>> print(out)
        """
        # Validate once, up front, so a bad method is reported even when the
        # frame has no numeric column to iterate over.
        if method not in ('iqr', 'zscore'):
            raise ValueError(f"Unknown method: {method}. Use 'iqr' or 'zscore'.")

        df_num = self._select_numeric()
        results = {}

        for col in df_num.columns:
            x = df_num[col].dropna()
            if method == 'iqr':
                q1 = x.quantile(0.25)
                q3 = x.quantile(0.75)
                iqr = q3 - q1
                lower = q1 - threshold * iqr
                upper = q3 + threshold * iqr
                flagged = (x < lower) | (x > upper)
            else:
                mean = x.mean()
                std = x.std()
                # A zero std yields NaN z-scores, which compare False below.
                z = np.abs((x - mean) / std)
                lower = mean - threshold * std
                upper = mean + threshold * std
                flagged = z > threshold

            results[col] = {
                'count': int(flagged.sum()),
                'indices': x[flagged].index.tolist(),
                'lower_bound': float(lower),
                'upper_bound': float(upper),
            }

        return results

    def target(self, target_column, **kwargs):
        """
        Analyze the target variable (distribution, imbalance, etc.).

        Parameters
        ----------
        target_column : str
            Name of the target column.
        **kwargs :
            Accepted for interface compatibility; currently unused.

        Returns
        -------
        dict
            Target analysis summary with keys 'value_counts', 'percent',
            'n_unique', 'dtype' and 'missing'. 'percent' is relative to the
            full column length, missing values included.

        Examples
        --------
        >>> target = analyzer.target(target_column='species')
        >>> print(target)
        """
        y = self.data[target_column]
        counts = y.value_counts()
        percent = (counts / len(y)) * 100
        return {
            'value_counts': counts,
            'percent': percent,
            'n_unique': int(y.nunique()),
            'dtype': str(y.dtype),
            'missing': int(y.isnull().sum()),
        }

    def boxplot(self, columns='all', by=None, **kwargs):
        """
        Generate boxplots for selected columns.

        Parameters
        ----------
        columns : 'all' or list, default='all'
            Columns to plot.
        by : str or None, default=None
            Grouping variable. Ignored when it is not a column of the data.
        **kwargs :
            Additional arguments for plotting. Only ``figsize`` (default (8, 6)) is read.

        Returns
        -------
        matplotlib.figure.Figure
            The generated boxplot figure.

        Examples
        --------
        >>> fig = analyzer.boxplot(columns=['A', 'B'], by='Group')
        >>> fig.show()
        """
        df_num = self._select_numeric(columns)
        cols = df_num.columns.tolist()

        fig, ax = plt.subplots(figsize=kwargs.get('figsize', (8, 6)))
        if by is not None and by in self.data.columns:
            # NOTE(review): with several columns and `by`, pandas may rebuild
            # the axes of the figure, in which case the title/label set on
            # `ax` below would not be visible - confirm with a multi-column plot.
            self.data.boxplot(column=cols, by=by, ax=ax)
            ax.set_title(f'Boxplot grouped by {by}')
        else:
            df_num[cols].boxplot(ax=ax)
            ax.set_title('Boxplot')
        ax.set_ylabel('Value')
        fig.tight_layout()
        return fig

    def bivariate(self, x, y, **kwargs):
        """
        Bivariate analysis between two variables (scatter plot + regression line).

        Rows where either variable is missing are dropped. The regression line
        is drawn only when both variables are numeric and the data allows a
        fit (at least two points and a non-constant x).

        Parameters
        ----------
        x : str
            First variable.
        y : str
            Second variable.
        **kwargs :
            Additional arguments for plotting. Only ``figsize`` (default (8, 6)) is read.

        Returns
        -------
        matplotlib.figure.Figure
            The generated bivariate plot.

        Examples
        --------
        >>> fig = analyzer.bivariate(x='A', y='B')
        >>> fig.show()
        """
        # Look the columns up before opening a figure, so a bad column name
        # does not leave an orphaned figure behind.
        x_col = self.data[x]
        y_col = self.data[y]

        # Keep complete pairs by position rather than by index label: label
        # based alignment yields unequal lengths when the index has duplicates.
        complete = x_col.notna().to_numpy() & y_col.notna().to_numpy()
        x_data = x_col[complete]
        y_data = y_col[complete]

        fig, ax = plt.subplots(figsize=kwargs.get('figsize', (8, 6)))
        ax.scatter(x_data, y_data, alpha=0.6, edgecolors='k', linewidths=0.5)

        # pandas' dtype helpers also understand extension dtypes, on which
        # np.issubdtype raises TypeError. Booleans are excluded, as before.
        dtype_checks = pd.api.types
        both_numeric = all(
            dtype_checks.is_numeric_dtype(s) and not dtype_checks.is_bool_dtype(s)
            for s in (x_data, y_data)
        )

        # Linear regression line
        if both_numeric and len(x_data) >= 2 and x_data.nunique() > 1:
            x_vals = x_data.to_numpy(dtype=float)
            y_vals = y_data.to_numpy(dtype=float)
            slope, intercept, r_value, _, _ = stats.linregress(x_vals, y_vals)
            x_line = np.linspace(x_vals.min(), x_vals.max(), 100)
            # '+.3f' prints the sign itself, avoiding "+-1.000" for negative intercepts.
            ax.plot(x_line, slope * x_line + intercept, 'r--',
                    label=f'y={slope:.3f}x{intercept:+.3f} (R²={r_value**2:.3f})')
            ax.legend()

        ax.set_xlabel(x)
        ax.set_ylabel(y)
        ax.set_title(f'{x} vs {y}')
        fig.tight_layout()
        return fig

    def normality(self, columns='all', **kwargs):
        """
        Test normality of variables (Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling).

        Parameters
        ----------
        columns : 'all' or list, default='all'
            Columns to test.
        **kwargs :
            Accepted for interface compatibility; currently unused.

        Returns
        -------
        dict
            Normality test results per column. Each column maps to a dict with
            keys 'shapiro', 'dagostino', 'anderson' (each a dict with
            'statistic' and 'p_value') plus 'skewness' and 'kurtosis'.
            A test that cannot run on the available sample reports None for
            both 'statistic' and 'p_value': fewer than 3 observations for
            Shapiro-Wilk and Anderson-Darling, fewer than 20 for D'Agostino-Pearson.

        Examples
        --------
        >>> norm = analyzer.normality()
        >>> print(norm)
        """
        df_num = self._select_numeric(columns)
        results = {}

        for col in df_num.columns:
            x = df_num[col].dropna()
            n_obs = len(x)
            col_result = {}

            # Shapiro-Wilk (needs at least 3 samples, max 5000 samples)
            if n_obs >= 3:
                sample = x if n_obs <= 5000 else x.sample(5000, random_state=42)
                stat_sw, p_sw = stats.shapiro(sample)
                col_result['shapiro'] = {'statistic': float(stat_sw), 'p_value': float(p_sw)}
            else:
                col_result['shapiro'] = {'statistic': None, 'p_value': None}

            # D'Agostino-Pearson (requires n >= 20)
            if n_obs >= 20:
                stat_da, p_da = stats.normaltest(x)
                col_result['dagostino'] = {'statistic': float(stat_da), 'p_value': float(p_da)}
            else:
                col_result['dagostino'] = {'statistic': None, 'p_value': None}

            # Anderson-Darling. Skipped on the same tiny samples as
            # Shapiro-Wilk, which previously aborted the whole report.
            # NOTE(review): the `method` argument and the `pvalue` attribute
            # look like recent SciPy additions - confirm the minimum version.
            if n_obs >= 3:
                ad_result = stats.anderson(x, dist='norm', method='interpolate')
                col_result['anderson'] = {
                    'statistic': float(ad_result.statistic),
                    'p_value': float(ad_result.pvalue),
                }
            else:
                col_result['anderson'] = {'statistic': None, 'p_value': None}

            # Skewness & kurtosis
            col_result['skewness'] = float(x.skew())
            col_result['kurtosis'] = float(x.kurtosis())

            results[col] = col_result

        return results

    def multicollinearity(self, **kwargs):
        """
        Analyze multicollinearity using Variance Inflation Factor (VIF).

        Rows with a missing value in any numeric column are dropped first.

        Returns
        -------
        pandas.DataFrame
            VIF per feature with columns 'feature' and 'VIF', sorted by VIF in
            descending order. Empty when fewer than two numeric columns exist.

        Notes
        -----
        $VIF_j = \\frac{1}{1 - R_j^2}$
        where $R_j^2$ is the $R^2$ of regressing feature $j$ on all others.

        Examples
        --------
        >>> vif = analyzer.multicollinearity()
        >>> print(vif)
        """
        df_num = self._select_numeric().dropna()
        if df_num.shape[1] < 2:
            return pd.DataFrame(columns=['feature', 'VIF'])

        # variance_inflation_factor fits an OLS on exactly the columns it is
        # given. Without an intercept column the R^2 is uncentered, which
        # does not match the formula above, so prepend a constant and report
        # the VIF of the real features only (column 0 is the intercept).
        intercept = np.ones(len(df_num))
        design = np.column_stack([intercept, df_num.to_numpy(dtype=float)])
        vif_data = pd.DataFrame({
            'feature': df_num.columns,
            'VIF': [variance_inflation_factor(design, i) for i in range(1, design.shape[1])]
        })
        return vif_data.sort_values('VIF', ascending=False).reset_index(drop=True)

    def profiling(self, **kwargs):
        """
        Generate a global profiling report (summary statistics, missing, outliers, etc.).

        Returns
        -------
        dict
            Profiling report with keys 'describe', 'dtypes', 'missing', 'outliers',
            'correlation', 'shape'. The last three analyses use the default
            arguments of the corresponding methods.

        Examples
        --------
        >>> report = analyzer.profiling()
        >>> print(report['describe'])
        """
        return {
            'shape': self.data.shape,
            'dtypes': self.data.dtypes,
            'describe': self.data.describe(include='all'),
            'missing': self.missing(),
            'outliers': self.outliers(),
            'correlation': self.correlation(),
        }
|