trailblazer-ml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
+ Metadata-Version: 2.4
+ Name: trailblazer-ml
+ Version: 0.1.0
+ Summary: An exploratory, 'glass-box' AutoML library.
+ Home-page: https://github.com/gabsalles/trailblazer-ml
+ Author: Gabriel Sales
+ Author-email: ggcs10@gmail.com
+ Keywords: automl,data-science,preprocessing,cleaning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.8, <4
+ Description-Content-Type: text/markdown
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: pandas>=1.5.0
+ Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: lightgbm>=3.3.0
+ Requires-Dist: rapidfuzz>=3.0.0
+ Requires-Dist: networkx>=3.0
+ Requires-Dist: plotly>=5.0.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # ScoutML 🚀
+
+ A "glass-box" AutoML library focused on exploration, cleaning, and transparent code generation.
+
+ ## Installation
+
+ ```bash
+ pip install scoutml
@@ -0,0 +1,8 @@
+ # ScoutML 🚀
+
+ A "glass-box" AutoML library focused on exploration, cleaning, and transparent code generation.
+
+ ## Installation
+
+ ```bash
+ pip install scoutml
@@ -0,0 +1,29 @@
+ """
+ ScoutML: Exploratory AutoML & Code Synthesis Library.
+ Version: 0.1.0-alpha
+ """
+
+ import logging
+ import sys
+
+ # User-facing logging configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - [ScoutML] - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ logger = logging.getLogger("scoutml")
+
+ # Expose the main classes for convenient access
+ from .preprocessing import AutoCleaner, SmartImputer, AdaptiveEncoder
+ from .analysis import InsightEngine
+ from .transpiler import CodeSynthesizer
+
+ __all__ = [
+     "AutoCleaner",
+     "SmartImputer",
+     "AdaptiveEncoder",
+     "InsightEngine",
+     "CodeSynthesizer",
+ ]
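
For orientation, the five exported classes compose into a single flow: clean, impute, encode, analyze, then synthesize a script. A minimal sketch of that flow, assuming a CSV with a numeric column named `target` (file name and column are hypothetical):

```python
import pandas as pd
from scoutml import (
    AutoCleaner, SmartImputer, AdaptiveEncoder, InsightEngine, CodeSynthesizer,
)

df = pd.read_csv("data.csv")        # hypothetical input file
y = df.pop("target")                # hypothetical target column

cleaner = AutoCleaner().fit(df, y)  # drop IDs/constants, flag leakage
X = cleaner.transform(df)
imputer = SmartImputer().fit(X)     # median/mode fills + missing flags
X = imputer.transform(X)
encoder = AdaptiveEncoder()         # fuzzy match -> rare grouping -> encoding
X = encoder.fit_transform(X, y)

# Diagnostics on the processed frame, then export a standalone script
engine = InsightEngine(target_col="target").fit(
    pd.concat([X, y.rename("target")], axis=1)
)
synth = CodeSynthesizer(cleaner, imputer, encoder, insight_engine=engine)
synth.export_to_script("pipeline_production.py")
```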
@@ -0,0 +1,147 @@
+ import networkx as nx
+ import pandas as pd
+ import numpy as np
+ import plotly.graph_objects as go
+ from lightgbm import LGBMRegressor, LGBMClassifier
+ from sklearn.inspection import permutation_importance
+ from sklearn.model_selection import train_test_split
+ from typing import Tuple, List, Dict
+
+
+ class InsightEngine:
+     def __init__(self, target_col: str, task: str = "regression"):
+         self.target_col = target_col
+         self.task = task
+         self.importance_df_ = None
+         self.redundancy_clusters_ = {}
+         self.leakage_candidates_ = []
+         self.useless_features_ = []
+         self.corr_matrix_ = None
+
+     def fit(self, df: pd.DataFrame):
+         X = df.drop(columns=[self.target_col], errors="ignore")
+         y = df[self.target_col]
+
+         # 1. Permutation importance (an impartial judge)
+         X_train, X_val, y_train, y_val = train_test_split(
+             X, y, test_size=0.2, random_state=42
+         )
+
+         model = (
+             LGBMRegressor(n_estimators=100, verbose=-1)
+             if self.task == "regression"
+             else LGBMClassifier(n_estimators=100, verbose=-1)
+         )
+
+         # Simplification for speed: treat everything as numeric
+         X_train_num = X_train.select_dtypes(include=np.number).fillna(0)
+         X_val_num = X_val.select_dtypes(include=np.number).fillna(0)
+
+         model.fit(X_train_num, y_train)
+         result = permutation_importance(
+             model, X_val_num, y_val, n_repeats=5, random_state=42, n_jobs=-1
+         )
+
+         self.importance_df_ = pd.DataFrame(
+             {"feature": X_train_num.columns, "importance": result.importances_mean}
+         ).sort_values(by="importance", ascending=False)
+
+         # 2. Diagnostics
+         self.leakage_candidates_ = self.importance_df_[
+             self.importance_df_["importance"] > 0.8
+         ].feature.tolist()
+         self.useless_features_ = self.importance_df_[
+             self.importance_df_["importance"] <= 0
+         ].feature.tolist()
+
+         # 3. Correlation
+         self.corr_matrix_ = (
+             X.select_dtypes(include=np.number).corr(method="spearman").abs()
+         )
+
+         return self
+
+     def plot_correlation_network(self, threshold: float = 0.75) -> go.Figure:
+         if self.corr_matrix_ is None:
+             return go.Figure()
+
+         G = nx.Graph()
+         for col in self.corr_matrix_.columns:
+             G.add_node(col)
+
+         for i in range(len(self.corr_matrix_.columns)):
+             for j in range(i):
+                 val = self.corr_matrix_.iloc[i, j]
+                 if val > threshold:
+                     G.add_edge(
+                         self.corr_matrix_.columns[i],
+                         self.corr_matrix_.columns[j],
+                         weight=val,
+                     )
+
+         # Communities
+         communities = nx.community.greedy_modularity_communities(G)
+         node_groups = {}
+         self.redundancy_clusters_ = {}
+
+         for idx, comm in enumerate(communities):
+             members = list(comm)
+             if len(members) > 1:
+                 self.redundancy_clusters_[f"Cluster_{idx}"] = members
+             for node in members:
+                 node_groups[node] = idx
+
+         # Plotly logic
+         pos = nx.spring_layout(G, k=0.3, seed=42)
+         edge_x, edge_y = [], []
+         for edge in G.edges():
+             x0, y0 = pos[edge[0]]
+             x1, y1 = pos[edge[1]]
+             edge_x.extend([x0, x1, None])
+             edge_y.extend([y0, y1, None])
+
+         node_x, node_y, node_text, node_color = [], [], [], []
+         for node in G.nodes():
+             x, y = pos[node]
+             node_x.append(x)
+             node_y.append(y)
+             node_text.append(f"{node} (Group {node_groups.get(node, 0)})")
+             node_color.append(node_groups.get(node, 0))
+
+         return go.Figure(
+             data=[
+                 go.Scatter(
+                     x=edge_x, y=edge_y, line=dict(width=1, color="#888"), mode="lines"
+                 ),
+                 go.Scatter(
+                     x=node_x,
+                     y=node_y,
+                     mode="markers",
+                     text=node_text,
+                     marker=dict(color=node_color, size=15, colorscale="Turbo"),
+                 ),
+             ],
+             layout=go.Layout(title="Redundancy Network", showlegend=False),
+         )
+
+     def generate_curation_report(self):
+         actions = []
+         print("# ScoutML Curation Report")
+
+         if self.leakage_candidates_:
+             print(f"🔴 CRITICAL (Leakage): {self.leakage_candidates_}")
+             actions.extend([("drop", f) for f in self.leakage_candidates_])
+
+         if self.redundancy_clusters_:
+             print(f"🟡 WARNING (Redundancy): {self.redundancy_clusters_}")
+
+         if self.useless_features_:
+             print(
+                 f"⚪ INFO (Zombie Features): {len(self.useless_features_)} useless features."
+             )
+
+         return self._generate_code(actions)
+
+     def _generate_code(self, actions):
+         drops = [x[1] for x in actions if x[0] == "drop"]
+         return f"cols_to_drop = {drops}"
@@ -0,0 +1,388 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.model_selection import KFold
+ from rapidfuzz import process, fuzz
+ from typing import List, Dict, Union, Optional
+ import logging
+
+ logger = logging.getLogger("scoutml")
+
+
+ class ScoutEstimator(BaseEstimator, TransformerMixin):
+     """Base class that keeps a history of the decisions taken."""
+
+     def __init__(self):
+         self.history = []
+
+
+ class DateFeaturizer(ScoutEstimator):
+     """Detects date columns and extracts cyclic/temporal features."""
+
+     def __init__(self, date_format: Optional[str] = None):
+         super().__init__()
+         self.date_format = date_format
+         self.date_cols_ = []
+         self.generated_features_ = []
+
+     def fit(self, X: pd.DataFrame, y=None):
+         self.history = []
+         self.date_cols_ = []
+
+         # Simple heuristic: convert to datetime and check that it does not fail
+         # massively, or check whether the dtype already is datetime
+         for col in X.columns:
+             if pd.api.types.is_datetime64_any_dtype(X[col]):
+                 self.date_cols_.append(col)
+             elif X[col].dtype == "object":
+                 # Try to infer whether it is a date (sampled for performance)
+                 try:
+                     pd.to_datetime(X[col].dropna().iloc[:100], format=self.date_format)
+                     self.date_cols_.append(col)
+                 except (ValueError, TypeError):
+                     continue
+
+         if self.date_cols_:
+             self.history.append({"action": "detect_dates", "cols": self.date_cols_})
+
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         X = X.copy()
+         for col in self.date_cols_:
+             # Guarantee the conversion
+             if not pd.api.types.is_datetime64_any_dtype(X[col]):
+                 X[col] = pd.to_datetime(
+                     X[col], format=self.date_format, errors="coerce"
+                 )
+
+             # Feature extraction
+             pfx = col
+             X[f"{pfx}_year"] = X[col].dt.year
+             X[f"{pfx}_month"] = X[col].dt.month
+             X[f"{pfx}_day"] = X[col].dt.day
+             X[f"{pfx}_dayofweek"] = X[col].dt.dayofweek
+             X[f"{pfx}_is_weekend"] = (X[col].dt.dayofweek >= 5).astype(int)
+
+             # Drop the original so it does not break the encoder later
+             X.drop(columns=[col], inplace=True)
+
+         return X
+
+
+ class AutoCleaner(ScoutEstimator):
+     """Removes constant columns and IDs, and detects data leakage."""
+
+     def __init__(self, leakage_threshold: float = 0.95):
+         super().__init__()
+         self.leakage_threshold = leakage_threshold
+         self.cols_to_drop_ = []
+         self.leakage_warnings_ = []
+
+     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
+         self.cols_to_drop_ = []
+         self.leakage_warnings_ = []
+         self.history = []
+         n_rows = len(X)
+
+         for col in X.columns:
+             # 1. ID detection
+             if X[col].nunique() >= (n_rows * 0.99):
+                 self.cols_to_drop_.append(col)
+                 self.history.append(
+                     {"action": "drop", "col": col, "reason": "High Cardinality (ID)"}
+                 )
+                 continue
+
+             # 2. Constant columns
+             if X[col].nunique() <= 1:
+                 self.cols_to_drop_.append(col)
+                 self.history.append(
+                     {"action": "drop", "col": col, "reason": "Zero Variance"}
+                 )
+                 continue
+
+             # 3. Leakage hunter
+             if y is not None and pd.api.types.is_numeric_dtype(X[col]):
+                 try:
+                     corr = X[col].corr(y, method="spearman")
+                     if abs(corr) > self.leakage_threshold:
+                         self.leakage_warnings_.append((col, corr))
+                         self.history.append(
+                             {
+                                 "action": "warning",
+                                 "col": col,
+                                 "reason": f"Leakage ({corr:.2f})",
+                             }
+                         )
+                 except Exception:
+                     pass
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         if not self.cols_to_drop_:
+             return X
+         return X.drop(columns=self.cols_to_drop_, errors="ignore")
+
+
+ class SmartImputer(ScoutEstimator):
+     """Hybrid imputation: median for numeric columns, mode for categorical ones."""
+
+     def __init__(
+         self, null_threshold_drop: float = 0.60, structural_threshold: float = 0.05
+     ):
+         super().__init__()
+         self.null_threshold_drop = null_threshold_drop
+         self.structural_threshold = structural_threshold
+         self.imputation_map_ = {}
+         self.missing_indicators_ = []
+         self.cols_to_drop_ = []
+
+     def fit(self, X: pd.DataFrame, y=None):
+         # Reset learned state so refitting does not accumulate stale entries
+         self.history = []
+         self.imputation_map_ = {}
+         self.missing_indicators_ = []
+         self.cols_to_drop_ = []
+         null_percent = X.isnull().mean()
+
+         for col in X.columns:
+             pct = null_percent[col]
+             if pct == 0:
+                 continue
+
+             if pct > self.null_threshold_drop:
+                 self.cols_to_drop_.append(col)
+                 self.history.append(
+                     {"action": "drop", "col": col, "reason": f"Nulls {pct:.1%}"}
+                 )
+                 continue
+
+             if pct > self.structural_threshold:
+                 self.missing_indicators_.append(col)
+                 self.history.append({"action": "add_indicator", "col": col})
+
+             # Robust fill strategy
+             if pd.api.types.is_numeric_dtype(X[col]):
+                 fill_val = X[col].median()
+                 if pd.isna(fill_val):
+                     fill_val = 0
+                 self.imputation_map_[col] = float(fill_val)
+                 self.history.append(
+                     {"action": "impute_num", "col": col, "val": fill_val}
+                 )
+             else:
+                 # Use the mode (most frequent value), or "MISSING" when empty
+                 modes = X[col].mode()
+                 fill_val = modes[0] if not modes.empty else "MISSING"
+                 self.imputation_map_[col] = str(fill_val)
+                 self.history.append(
+                     {"action": "impute_cat", "col": col, "val": fill_val}
+                 )
+
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         X = X.copy()
+         if self.cols_to_drop_:
+             X.drop(columns=self.cols_to_drop_, inplace=True, errors="ignore")
+
+         for col in self.missing_indicators_:
+             if col in X.columns:
+                 X[f"is_missing_{col}"] = X[col].isnull().astype(int)
+
+         X.fillna(value=self.imputation_map_, inplace=True)
+         return X
+
+
+ class AdaptiveEncoder(ScoutEstimator):
+     """
+     Full pipeline: Fuzzy Match -> Rare Label Grouping -> Encoding (Target/OHE).
+     """
+
+     def __init__(
+         self,
+         card_threshold: int = 10,
+         smoothing: float = 10.0,
+         fuzzy_threshold: int = 90,
+         rare_threshold: float = 0.01,
+     ):
+         super().__init__()
+         self.card_threshold = card_threshold
+         self.smoothing = smoothing
+         self.fuzzy_threshold = fuzzy_threshold
+         self.rare_threshold = rare_threshold
+
+         # Learned state
+         self.fuzzy_maps_ = {}  # {col: {bad_val: good_val}}
+         self.rare_groups_ = {}  # {col: [list_of_kept_values]}
+         self.encoding_map_ = {}  # {col: {val: target_mean}}
+         self.global_means_ = {}  # {col: global_mean}
+         self.ohe_cols_ = []
+
+     def _learn_fuzzy_map(self, series: pd.Series) -> Dict[str, str]:
+         """Groups very similar strings (e.g. 'SP', 'sp', 'S. Paulo')."""
+         unique_vals = series.dropna().unique().astype(str)
+         if len(unique_vals) < 3:
+             return {}
+
+         mapping = {}
+         # Sort by length so the cleanest values are processed first (simple heuristic)
+         unique_vals = sorted(unique_vals, key=len)
+
+         covered = set()
+         for val in unique_vals:
+             if val in covered:
+                 continue
+
+             # Find similar values
+             matches = process.extract(
+                 val, unique_vals, limit=None, scorer=fuzz.token_sort_ratio
+             )
+             # Filter by threshold
+             group = [m[0] for m in matches if m[1] >= self.fuzzy_threshold]
+
+             # The group "leader" is the shortest or most frequent value
+             # (simplified here as the first one seen)
+             leader = val
+             for member in group:
+                 if member != leader:
+                     mapping[member] = leader
+                     covered.add(member)
+         return mapping
+
+     def _learn_rare_groups(self, series: pd.Series) -> List[str]:
+         """Returns the list of categories to KEEP. Everything else becomes 'OTHER'."""
+         freqs = series.value_counts(normalize=True)
+         kept = freqs[freqs >= self.rare_threshold].index.tolist()
+         return kept
+
+     def _get_smoothed_mean(self, df, col, y_name):
+         """Target encoding with smoothing."""
+         global_mean = df[y_name].mean()
+         agg = df.groupby(col)[y_name].agg(["count", "mean"])
+         counts = agg["count"]
+         means = agg["mean"]
+         smooth = (counts * means + self.smoothing * global_mean) / (
+             counts + self.smoothing
+         )
+         return smooth, global_mean
+
+     def fit(self, X: pd.DataFrame, y: pd.Series):
+         # Reset learned state so refitting does not accumulate stale entries
+         self.history = []
+         self.fuzzy_maps_ = {}
+         self.rare_groups_ = {}
+         self.encoding_map_ = {}
+         self.global_means_ = {}
+         self.ohe_cols_ = []
+
+         X = X.copy()
+         y_name = "target_internal"
+         X[y_name] = y.values
+
+         cat_cols = X.select_dtypes(include=["object", "category"]).columns
+
+         for col in cat_cols:
+             # 1. Fuzzy matching learning
+             f_map = self._learn_fuzzy_map(X[col])
+             if f_map:
+                 self.fuzzy_maps_[col] = f_map
+                 self.history.append(
+                     {"action": "fuzzy_map", "col": col, "count": len(f_map)}
+                 )
+                 # Apply locally so training continues on the cleaned values
+                 X[col] = X[col].replace(f_map)
+
+             # 2. Rare label learning
+             kept_vals = self._learn_rare_groups(X[col])
+             self.rare_groups_[col] = kept_vals
+             # Apply locally: anything not in kept_vals becomes "OTHER"
+             X.loc[~X[col].isin(kept_vals), col] = "OTHER"
+             self.history.append(
+                 {"action": "rare_group", "col": col, "kept": len(kept_vals)}
+             )
+
+             # 3. Decision: OHE vs target encoding
+             cardinality = X[col].nunique()
+             if cardinality <= self.card_threshold:
+                 self.ohe_cols_.append(col)
+                 self.history.append({"action": "ohe", "col": col})
+             else:
+                 smooth_map, global_m = self._get_smoothed_mean(X, col, y_name)
+                 self.encoding_map_[col] = smooth_map.to_dict()
+                 self.global_means_[col] = global_m
+                 self.history.append({"action": "target_encode", "col": col})
+
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         X = X.copy()
+
+         # 1. Apply fuzzy maps
+         for col, f_map in self.fuzzy_maps_.items():
+             if col in X.columns:
+                 X[col] = X[col].replace(f_map)
+
+         # 2. Apply rare groups
+         for col, kept_vals in self.rare_groups_.items():
+             if col in X.columns:
+                 # Anything outside the kept list becomes "OTHER"
+                 X[col] = np.where(X[col].isin(kept_vals), X[col], "OTHER")
+
+         # 3. Apply target encoding
+         for col, mapping in self.encoding_map_.items():
+             if col in X.columns:
+                 fallback = self.global_means_[col]
+                 X[col] = X[col].map(mapping).fillna(fallback)
+
+         # 4. Apply OHE
+         if self.ohe_cols_:
+             X = pd.get_dummies(X, columns=self.ohe_cols_, drop_first=True)
+
+         return X
+
+     def fit_transform(
+         self, X: pd.DataFrame, y: pd.Series = None, **fit_params
+     ) -> pd.DataFrame:
+         """K-fold target encoding to avoid data leakage."""
+         self.fit(X, y)  # learn the global maps (fuzzy, rare, encoding)
+
+         X_out = X.copy()
+
+         # First apply fuzzy and rare grouping to all of X_out for consistency,
+         # before running the K-fold target encoding
+         for col, f_map in self.fuzzy_maps_.items():
+             X_out[col] = X_out[col].replace(f_map)
+
+         for col, kept_vals in self.rare_groups_.items():
+             X_out[col] = np.where(X_out[col].isin(kept_vals), X_out[col], "OTHER")
+
+         # If there is no target encoding, just return the transformed frame
+         if not self.encoding_map_:
+             return self.transform(X)
+
+         # K-fold logic only for the target-encoded columns
+         kf = KFold(n_splits=5, shuffle=True, random_state=42)
+         y_series = y if isinstance(y, pd.Series) else pd.Series(y)
+         y_name = "target_tmp"
+
+         for train_idx, val_idx in kf.split(X_out, y_series):
+             # Training data of the fold (already cleaned of fuzzy/rare)
+             X_fold_train = X_out.iloc[train_idx].copy()
+             y_fold_train = y_series.iloc[train_idx]
+
+             # Validation data
+             X_fold_val = X_out.iloc[val_idx].copy()
+
+             df_train = X_fold_train.copy()
+             df_train[y_name] = y_fold_train.values
+
+             for col in self.encoding_map_.keys():
+                 smooth_map, global_m = self._get_smoothed_mean(df_train, col, y_name)
+                 # Apply the map learned on the train fold to the validation fold
+                 # (kf.split yields positional indices, so resolve them via the index)
+                 X_out.loc[X_out.index[val_idx], col] = (
+                     X_fold_val[col].map(smooth_map).fillna(global_m)
+                 )
+
+         # Fill any remaining holes with the global mean learned in the full fit()
+         for col in self.encoding_map_.keys():
+             X_out[col] = X_out[col].fillna(self.global_means_[col])
+
+         # Finally, OHE (post treatment)
+         if self.ohe_cols_:
+             X_out = pd.get_dummies(X_out, columns=self.ohe_cols_, drop_first=True)
+
+         return X_out
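
Two notes on the encoder above. The smoothed target encoding blends each category mean with the global mean, `(count * mean + smoothing * global_mean) / (count + smoothing)`, so sparse categories shrink toward the global average; and `fit_transform` re-learns that map per fold so a row is never encoded with its own target. A minimal end-to-end sketch of the three transformers on toy data (all names hypothetical):

```python
import pandas as pd
from scoutml import AutoCleaner, SmartImputer, AdaptiveEncoder

df = pd.DataFrame({
    "row_id": range(8),                # ID-like, should be dropped
    "city": ["SP", "sp", "S. Paulo", "Rio", "Rio", "rio", "BH", "BH"],  # messy labels
    "income": [3000, None, 4500, 5200, None, 3100, 8000, 7600],         # has nulls
})
y = pd.Series([1, 0, 1, 1, 0, 0, 1, 1])

cleaned = AutoCleaner().fit(df, y).transform(df)           # drops row_id
imputed = SmartImputer().fit(cleaned).transform(cleaned)   # median fill + is_missing_income
encoded = AdaptiveEncoder().fit_transform(imputed, y)      # fuzzy -> rare -> OHE/target enc.
print(encoded.head())
```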
@@ -0,0 +1,168 @@
+ from typing import Optional
+
+ from .analysis import InsightEngine  # imported for type hints
+ from .preprocessing import AutoCleaner, SmartImputer, AdaptiveEncoder, DateFeaturizer
+
+
+ class CodeSynthesizer:
+     def __init__(
+         self,
+         cleaner: AutoCleaner,
+         imputer: SmartImputer,
+         encoder: AdaptiveEncoder,
+         date_featurizer: Optional[DateFeaturizer] = None,
+         insight_engine: Optional[InsightEngine] = None,
+     ):
+         self.cleaner = cleaner
+         self.imputer = imputer
+         self.encoder = encoder
+         self.date_featurizer = date_featurizer
+         self.insight = insight_engine
+
+     def export_to_script(self, filepath: str = "pipeline_production.py"):
+         lines = []
+         lines.append(
+             '"""\nScoutML Generated Pipeline\nReady for Databricks/Pandas.\n"""'
+         )
+         lines.append("import pandas as pd")
+         lines.append("import numpy as np")
+         lines.append("")
+         lines.append("def run_pipeline(df: pd.DataFrame) -> pd.DataFrame:")
+         lines.append("    df = df.copy()")
+         lines.append("")
+
+         # ---------------------------------------------------------
+         # 0. Date featurizer (must run first)
+         # ---------------------------------------------------------
+         if self.date_featurizer and self.date_featurizer.date_cols_:
+             lines.append("    # [0] DateFeaturizer: Extracting time features")
+             lines.append(f"    date_cols = {self.date_featurizer.date_cols_}")
+             lines.append("    for col in date_cols:")
+             lines.append("        if col in df.columns:")
+             lines.append("            # Force datetime conversion")
+
+             # Inject the format if it exists; otherwise let pandas infer it
+             if self.date_featurizer.date_format:
+                 fmt = self.date_featurizer.date_format
+                 lines.append(
+                     f"            series = pd.to_datetime(df[col], format='{fmt}', errors='coerce')"
+                 )
+             else:
+                 lines.append(
+                     "            series = pd.to_datetime(df[col], errors='coerce')"
+                 )
+
+             lines.append("            df[f'{col}_year'] = series.dt.year")
+             lines.append("            df[f'{col}_month'] = series.dt.month")
+             lines.append("            df[f'{col}_day'] = series.dt.day")
+             lines.append("            df[f'{col}_dayofweek'] = series.dt.dayofweek")
+             lines.append(
+                 "            df[f'{col}_is_weekend'] = (series.dt.dayofweek >= 5).astype(int)"
+             )
+             lines.append("            df.drop(columns=[col], inplace=True)")
+             lines.append("")
+
+         # ---------------------------------------------------------
+         # 1. Cleaner & insight drops
+         # ---------------------------------------------------------
+         # Combine drops from AutoCleaner (technical) and InsightEngine (statistical)
+         all_drops = set(self.cleaner.cols_to_drop_)
+
+         if self.insight:
+             if self.insight.leakage_candidates_:
+                 all_drops.update(self.insight.leakage_candidates_)
+             if self.insight.useless_features_:
+                 all_drops.update(self.insight.useless_features_)
+
+         drops_list = list(all_drops)
+
+         if drops_list:
+             lines.append("    # [1] AutoCleaner & InsightEngine Drops")
+             lines.append("    # Drops: IDs, Constants, Leakage, Useless Features")
+             lines.append(f"    drop_cols = {drops_list}")
+             lines.append(
+                 "    df.drop(columns=drop_cols, errors='ignore', inplace=True)"
+             )
+             lines.append("")
+
+         # ---------------------------------------------------------
+         # 2. Imputer
+         # ---------------------------------------------------------
+         lines.append("    # [2] SmartImputer")
+         if self.imputer.missing_indicators_:
+             lines.append("    # Flagging structural missingness")
+             lines.append(f"    missing_flags = {self.imputer.missing_indicators_}")
+             lines.append("    for col in missing_flags:")
+             lines.append("        if col in df.columns:")
+             lines.append(
+                 "            df[f'is_missing_{col}'] = df[col].isnull().astype(int)"
+             )
+
+         if self.imputer.imputation_map_:
+             lines.append("    # Applying Fillna (Median/Mode)")
+             lines.append(f"    fill_values = {self.imputer.imputation_map_}")
+             lines.append("    df.fillna(value=fill_values, inplace=True)")
+         lines.append("")
+
+         # ---------------------------------------------------------
+         # 3. Encoder (fuzzy -> rare -> encode)
+         # ---------------------------------------------------------
+         lines.append("    # [3] AdaptiveEncoder")
+
+         if self.encoder.fuzzy_maps_:
+             lines.append("    # [3.1] Fuzzy Matching Correction")
+             for col, f_map in self.encoder.fuzzy_maps_.items():
+                 lines.append(f"    if '{col}' in df.columns:")
+                 lines.append(f"        f_map_{col} = {f_map}")
+                 lines.append(f"        df['{col}'] = df['{col}'].replace(f_map_{col})")
+             lines.append("")
+
+         if self.encoder.rare_groups_:
+             lines.append("    # [3.2] Rare Label Grouping (-> 'OTHER')")
+             for col, kept_vals in self.encoder.rare_groups_.items():
+                 lines.append(f"    if '{col}' in df.columns:")
+                 lines.append(f"        kept_{col} = {kept_vals}")
+                 lines.append(
+                     f"        df['{col}'] = np.where(df['{col}'].isin(kept_{col}), df['{col}'], 'OTHER')"
+                 )
+             lines.append("")
+
+         if self.encoder.encoding_map_:
+             lines.append("    # [3.3] Target Encoding (Smoothed Globals)")
+             for col, mapping in self.encoder.encoding_map_.items():
+                 fallback = self.encoder.global_means_.get(col, 0)
+                 lines.append(f"    if '{col}' in df.columns:")
+                 lines.append(f"        map_{col} = {str(mapping)}")
+                 lines.append(
+                     f"        df['{col}'] = df['{col}'].map(map_{col}).fillna({fallback})"
+                 )
+             lines.append("")
+
+         if self.encoder.ohe_cols_:
+             lines.append("    # [3.4] One-Hot Encoding")
+             lines.append(f"    ohe_cols = {self.encoder.ohe_cols_}")
+             lines.append("    # Ensure columns exist before OHE to avoid errors")
+             lines.append("    valid_ohe = [c for c in ohe_cols if c in df.columns]")
+             lines.append(
+                 "    df = pd.get_dummies(df, columns=valid_ohe, drop_first=True)"
+             )
+
+         lines.append("")
+         lines.append("    return df")
+
+         try:
+             with open(filepath, "w", encoding="utf-8") as f:
+                 f.write("\n".join(lines))
+             print(f"Pipeline successfully exported to: {filepath}")
+         except Exception as e:
+             print(f"Failed to export pipeline: {e}")
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,38 @@
+ from setuptools import setup, find_packages
+ import pathlib
+
+ # The directory where this file lives
+ here = pathlib.Path(__file__).parent.resolve()
+
+ # Read the README to use as the long description on PyPI
+ long_description = (here / "README.md").read_text(encoding="utf-8")
+
+ setup(
+     name="trailblazer-ml",  # the name used for pip install
+     version="0.1.0",
+     description="An exploratory, 'glass-box' AutoML library.",
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     url="https://github.com/gabsalles/trailblazer-ml",  # put your GitHub URL here
+     author="Gabriel Sales",
+     author_email="ggcs10@gmail.com",
+     classifiers=[
+         "Development Status :: 3 - Alpha",
+         "Intended Audience :: Developers",
+         "Topic :: Scientific/Engineering :: Artificial Intelligence",
+         "License :: OSI Approved :: MIT License",
+         "Programming Language :: Python :: 3",
+     ],
+     keywords="automl, data-science, preprocessing, cleaning",
+     packages=find_packages(),
+     python_requires=">=3.8, <4",
+     install_requires=[
+         "numpy>=1.21.0",
+         "pandas>=1.5.0",
+         "scikit-learn>=1.0.0",
+         "lightgbm>=3.3.0",
+         "rapidfuzz>=3.0.0",
+         "networkx>=3.0",
+         "plotly>=5.0.0",
+     ],
+ )
@@ -0,0 +1,41 @@
+ Metadata-Version: 2.4
+ Name: trailblazer-ml
+ Version: 0.1.0
+ Summary: An exploratory, 'glass-box' AutoML library.
+ Home-page: https://github.com/gabsalles/trailblazer-ml
+ Author: Gabriel Sales
+ Author-email: ggcs10@gmail.com
+ Keywords: automl,data-science,preprocessing,cleaning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.8, <4
+ Description-Content-Type: text/markdown
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: pandas>=1.5.0
+ Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: lightgbm>=3.3.0
+ Requires-Dist: rapidfuzz>=3.0.0
+ Requires-Dist: networkx>=3.0
+ Requires-Dist: plotly>=5.0.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # ScoutML 🚀
+
+ A "glass-box" AutoML library focused on exploration, cleaning, and transparent code generation.
+
+ ## Installation
+
+ ```bash
+ pip install scoutml
@@ -0,0 +1,11 @@
+ README.md
+ setup.py
+ scoutml/__init__.py
+ scoutml/analysis.py
+ scoutml/preprocessing.py
+ scoutml/transpiler.py
+ trailblazer_ml.egg-info/PKG-INFO
+ trailblazer_ml.egg-info/SOURCES.txt
+ trailblazer_ml.egg-info/dependency_links.txt
+ trailblazer_ml.egg-info/requires.txt
+ trailblazer_ml.egg-info/top_level.txt
@@ -0,0 +1,7 @@
+ numpy>=1.21.0
+ pandas>=1.5.0
+ scikit-learn>=1.0.0
+ lightgbm>=3.3.0
+ rapidfuzz>=3.0.0
+ networkx>=3.0
+ plotly>=5.0.0