trailblazer-ml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
+ Metadata-Version: 2.4
+ Name: trailblazer-ml
+ Version: 0.1.0
+ Summary: An exploratory, 'glass-box' AutoML library.
+ Home-page: https://github.com/gabsalles/trailblazer-ml
+ Author: Gabriel Sales
+ Author-email: ggcs10@gmail.com
+ Keywords: automl,data-science,preprocessing,cleaning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.8, <4
+ Description-Content-Type: text/markdown
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: pandas>=1.5.0
+ Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: lightgbm>=3.3.0
+ Requires-Dist: rapidfuzz>=3.0.0
+ Requires-Dist: networkx>=3.0
+ Requires-Dist: plotly>=5.0.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # ScoutML 🚀
+
+ A "glass-box" AutoML library focused on exploration, cleaning, and transparent code generation.
+
+ ## Installation
+
+ ```bash
+ pip install scoutml
@@ -0,0 +1,8 @@
+ # ScoutML 🚀
+
+ A "glass-box" AutoML library focused on exploration, cleaning, and transparent code generation.
+
+ ## Installation
+
+ ```bash
+ pip install scoutml
@@ -0,0 +1,29 @@
+ """
+ ScoutML: Exploratory AutoML & Code Synthesis Library.
+ Version: 0.1.0-alpha
+ """
+
+ import logging
+ import sys
+
+ # User-facing logging configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - [ScoutML] - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler(sys.stdout)],
+ )
+
+ logger = logging.getLogger("scoutml")
+
+ # Expose the main classes for convenient access
+ from .preprocessing import AutoCleaner, SmartImputer, AdaptiveEncoder
+ from .analysis import InsightEngine
+ from .transpiler import CodeSynthesizer
+
+ __all__ = [
+     "AutoCleaner",
+     "SmartImputer",
+     "AdaptiveEncoder",
+     "InsightEngine",
+     "CodeSynthesizer",
+ ]
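
For orientation, the five exported classes compose into a single flow: clean, impute, encode, analyze, then synthesize a script. A minimal sketch of that flow, assuming a CSV with a numeric column named `target` (file name and column are hypothetical):

```python
import pandas as pd
from scoutml import (
    AutoCleaner, SmartImputer, AdaptiveEncoder, InsightEngine, CodeSynthesizer,
)

df = pd.read_csv("data.csv")        # hypothetical input file
y = df.pop("target")                # hypothetical target column

cleaner = AutoCleaner().fit(df, y)  # drop IDs/constants, flag leakage
X = cleaner.transform(df)
imputer = SmartImputer().fit(X)     # median/mode fills + missing flags
X = imputer.transform(X)
encoder = AdaptiveEncoder()         # fuzzy match -> rare grouping -> encoding
X = encoder.fit_transform(X, y)

# Diagnostics on the processed frame, then export a standalone script
engine = InsightEngine(target_col="target").fit(
    pd.concat([X, y.rename("target")], axis=1)
)
synth = CodeSynthesizer(cleaner, imputer, encoder, insight_engine=engine)
synth.export_to_script("pipeline_production.py")
```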
@@ -0,0 +1,147 @@
+ import networkx as nx
+ import pandas as pd
+ import numpy as np
+ import plotly.graph_objects as go
+ from lightgbm import LGBMRegressor, LGBMClassifier
+ from sklearn.inspection import permutation_importance
+ from sklearn.model_selection import train_test_split
+ from typing import Tuple, List, Dict
+
+
+ class InsightEngine:
+     def __init__(self, target_col: str, task: str = "regression"):
+         self.target_col = target_col
+         self.task = task
+         self.importance_df_ = None
+         self.redundancy_clusters_ = {}
+         self.leakage_candidates_ = []
+         self.useless_features_ = []
+         self.corr_matrix_ = None
+
+     def fit(self, df: pd.DataFrame):
+         X = df.drop(columns=[self.target_col], errors="ignore")
+         y = df[self.target_col]
+
+         # 1. Permutation importance (an impartial judge)
+         X_train, X_val, y_train, y_val = train_test_split(
+             X, y, test_size=0.2, random_state=42
+         )
+
+         model = (
+             LGBMRegressor(n_estimators=100, verbose=-1)
+             if self.task == "regression"
+             else LGBMClassifier(n_estimators=100, verbose=-1)
+         )
+
+         # Simplification for speed: treat everything as numeric
+         X_train_num = X_train.select_dtypes(include=np.number).fillna(0)
+         X_val_num = X_val.select_dtypes(include=np.number).fillna(0)
+
+         model.fit(X_train_num, y_train)
+         result = permutation_importance(
+             model, X_val_num, y_val, n_repeats=5, random_state=42, n_jobs=-1
+         )
+
+         self.importance_df_ = pd.DataFrame(
+             {"feature": X_train_num.columns, "importance": result.importances_mean}
+         ).sort_values(by="importance", ascending=False)
+
+         # 2. Diagnostics
+         self.leakage_candidates_ = self.importance_df_[
+             self.importance_df_["importance"] > 0.8
+         ].feature.tolist()
+         self.useless_features_ = self.importance_df_[
+             self.importance_df_["importance"] <= 0
+         ].feature.tolist()
+
+         # 3. Correlation
+         self.corr_matrix_ = (
+             X.select_dtypes(include=np.number).corr(method="spearman").abs()
+         )
+
+         return self
+
+     def plot_correlation_network(self, threshold: float = 0.75) -> go.Figure:
+         if self.corr_matrix_ is None:
+             return go.Figure()
+
+         G = nx.Graph()
+         for col in self.corr_matrix_.columns:
+             G.add_node(col)
+
+         for i in range(len(self.corr_matrix_.columns)):
+             for j in range(i):
+                 val = self.corr_matrix_.iloc[i, j]
+                 if val > threshold:
+                     G.add_edge(
+                         self.corr_matrix_.columns[i],
+                         self.corr_matrix_.columns[j],
+                         weight=val,
+                     )
+
+         # Communities
+         communities = nx.community.greedy_modularity_communities(G)
+         node_groups = {}
+         self.redundancy_clusters_ = {}
+
+         for idx, comm in enumerate(communities):
+             members = list(comm)
+             if len(members) > 1:
+                 self.redundancy_clusters_[f"Cluster_{idx}"] = members
+             for node in members:
+                 node_groups[node] = idx
+
+         # Plotly logic
+         pos = nx.spring_layout(G, k=0.3, seed=42)
+         edge_x, edge_y = [], []
+         for edge in G.edges():
+             x0, y0 = pos[edge[0]]
+             x1, y1 = pos[edge[1]]
+             edge_x.extend([x0, x1, None])
+             edge_y.extend([y0, y1, None])
+
+         node_x, node_y, node_text, node_color = [], [], [], []
+         for node in G.nodes():
+             x, y = pos[node]
+             node_x.append(x)
+             node_y.append(y)
+             node_text.append(f"{node} (Group {node_groups.get(node, 0)})")
+             node_color.append(node_groups.get(node, 0))
+
+         return go.Figure(
+             data=[
+                 go.Scatter(
+                     x=edge_x, y=edge_y, line=dict(width=1, color="#888"), mode="lines"
+                 ),
+                 go.Scatter(
+                     x=node_x,
+                     y=node_y,
+                     mode="markers",
+                     text=node_text,
+                     marker=dict(color=node_color, size=15, colorscale="Turbo"),
+                 ),
+             ],
+             layout=go.Layout(title="Redundancy Network", showlegend=False),
+         )
+
+     def generate_curation_report(self):
+         actions = []
+         print("# ScoutML Curation Report")
+
+         if self.leakage_candidates_:
+             print(f"🔴 CRITICAL (Leakage): {self.leakage_candidates_}")
+             actions.extend([("drop", f) for f in self.leakage_candidates_])
+
+         if self.redundancy_clusters_:
+             print(f"🟡 WARNING (Redundancy): {self.redundancy_clusters_}")
+
+         if self.useless_features_:
+             print(
+                 f"⚪ INFO (Zombie Features): {len(self.useless_features_)} useless features."
+             )
+
+         return self._generate_code(actions)
+
+     def _generate_code(self, actions):
+         drops = [x[1] for x in actions if x[0] == "drop"]
+         return f"cols_to_drop = {drops}"
@@ -0,0 +1,388 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.model_selection import KFold
+ from rapidfuzz import process, fuzz
+ from typing import List, Dict, Union, Optional
+ import logging
+
+ logger = logging.getLogger("scoutml")
+
+
+ class ScoutEstimator(BaseEstimator, TransformerMixin):
+     """Base class that keeps a history of the decisions taken."""
+
+     def __init__(self):
+         self.history = []
+
+
+ class DateFeaturizer(ScoutEstimator):
+     """Detects date columns and extracts cyclic/temporal features."""
+
+     def __init__(self, date_format: Optional[str] = None):
+         super().__init__()
+         self.date_format = date_format
+         self.date_cols_ = []
+         self.generated_features_ = []
+
+     def fit(self, X: pd.DataFrame, y=None):
+         self.history = []
+         self.date_cols_ = []
+
+         # Simple heuristic: convert to datetime and check that it does not fail
+         # massively, or check whether the dtype already is datetime
+         for col in X.columns:
+             if pd.api.types.is_datetime64_any_dtype(X[col]):
+                 self.date_cols_.append(col)
+             elif X[col].dtype == "object":
+                 # Try to infer whether it is a date (sampled for performance)
+                 try:
+                     pd.to_datetime(X[col].dropna().iloc[:100], format=self.date_format)
+                     self.date_cols_.append(col)
+                 except (ValueError, TypeError):
+                     continue
+
+         if self.date_cols_:
+             self.history.append({"action": "detect_dates", "cols": self.date_cols_})
+
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         X = X.copy()
+         for col in self.date_cols_:
+             # Guarantee the conversion
+             if not pd.api.types.is_datetime64_any_dtype(X[col]):
+                 X[col] = pd.to_datetime(
+                     X[col], format=self.date_format, errors="coerce"
+                 )
+
+             # Feature extraction
+             pfx = col
+             X[f"{pfx}_year"] = X[col].dt.year
+             X[f"{pfx}_month"] = X[col].dt.month
+             X[f"{pfx}_day"] = X[col].dt.day
+             X[f"{pfx}_dayofweek"] = X[col].dt.dayofweek
+             X[f"{pfx}_is_weekend"] = (X[col].dt.dayofweek >= 5).astype(int)
+
+             # Drop the original so it does not break the encoder later
+             X.drop(columns=[col], inplace=True)
+
+         return X
+
+
+ class AutoCleaner(ScoutEstimator):
+     """Removes constant columns and IDs, and detects data leakage."""
+
+     def __init__(self, leakage_threshold: float = 0.95):
+         super().__init__()
+         self.leakage_threshold = leakage_threshold
+         self.cols_to_drop_ = []
+         self.leakage_warnings_ = []
+
+     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
+         self.cols_to_drop_ = []
+         self.leakage_warnings_ = []
+         self.history = []
+         n_rows = len(X)
+
+         for col in X.columns:
+             # 1. ID detection
+             if X[col].nunique() >= (n_rows * 0.99):
+                 self.cols_to_drop_.append(col)
+                 self.history.append(
+                     {"action": "drop", "col": col, "reason": "High Cardinality (ID)"}
+                 )
+                 continue
+
+             # 2. Constant columns
+             if X[col].nunique() <= 1:
+                 self.cols_to_drop_.append(col)
+                 self.history.append(
+                     {"action": "drop", "col": col, "reason": "Zero Variance"}
+                 )
+                 continue
+
+             # 3. Leakage hunter
+             if y is not None and pd.api.types.is_numeric_dtype(X[col]):
+                 try:
+                     corr = X[col].corr(y, method="spearman")
+                     if abs(corr) > self.leakage_threshold:
+                         self.leakage_warnings_.append((col, corr))
+                         self.history.append(
+                             {
+                                 "action": "warning",
+                                 "col": col,
+                                 "reason": f"Leakage ({corr:.2f})",
+                             }
+                         )
+                 except Exception:
+                     pass
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         if not self.cols_to_drop_:
+             return X
+         return X.drop(columns=self.cols_to_drop_, errors="ignore")
+
+
+ class SmartImputer(ScoutEstimator):
+     """Hybrid imputation: median for numeric columns, mode for categorical ones."""
+
+     def __init__(
+         self, null_threshold_drop: float = 0.60, structural_threshold: float = 0.05
+     ):
+         super().__init__()
+         self.null_threshold_drop = null_threshold_drop
+         self.structural_threshold = structural_threshold
+         self.imputation_map_ = {}
+         self.missing_indicators_ = []
+         self.cols_to_drop_ = []
+
+     def fit(self, X: pd.DataFrame, y=None):
+         # Reset learned state so refitting does not accumulate stale entries
+         self.history = []
+         self.imputation_map_ = {}
+         self.missing_indicators_ = []
+         self.cols_to_drop_ = []
+         null_percent = X.isnull().mean()
+
+         for col in X.columns:
+             pct = null_percent[col]
+             if pct == 0:
+                 continue
+
+             if pct > self.null_threshold_drop:
+                 self.cols_to_drop_.append(col)
+                 self.history.append(
+                     {"action": "drop", "col": col, "reason": f"Nulls {pct:.1%}"}
+                 )
+                 continue
+
+             if pct > self.structural_threshold:
+                 self.missing_indicators_.append(col)
+                 self.history.append({"action": "add_indicator", "col": col})
+
+             # Robust fill strategy
+             if pd.api.types.is_numeric_dtype(X[col]):
+                 fill_val = X[col].median()
+                 if pd.isna(fill_val):
+                     fill_val = 0
+                 self.imputation_map_[col] = float(fill_val)
+                 self.history.append(
+                     {"action": "impute_num", "col": col, "val": fill_val}
+                 )
+             else:
+                 # Use the mode (most frequent value), or "MISSING" when empty
+                 modes = X[col].mode()
+                 fill_val = modes[0] if not modes.empty else "MISSING"
+                 self.imputation_map_[col] = str(fill_val)
+                 self.history.append(
+                     {"action": "impute_cat", "col": col, "val": fill_val}
+                 )
+
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         X = X.copy()
+         if self.cols_to_drop_:
+             X.drop(columns=self.cols_to_drop_, inplace=True, errors="ignore")
+
+         for col in self.missing_indicators_:
+             if col in X.columns:
+                 X[f"is_missing_{col}"] = X[col].isnull().astype(int)
+
+         X.fillna(value=self.imputation_map_, inplace=True)
+         return X
+
+
+ class AdaptiveEncoder(ScoutEstimator):
+     """
+     Full pipeline: Fuzzy Match -> Rare Label Grouping -> Encoding (Target/OHE).
+     """
+
+     def __init__(
+         self,
+         card_threshold: int = 10,
+         smoothing: float = 10.0,
+         fuzzy_threshold: int = 90,
+         rare_threshold: float = 0.01,
+     ):
+         super().__init__()
+         self.card_threshold = card_threshold
+         self.smoothing = smoothing
+         self.fuzzy_threshold = fuzzy_threshold
+         self.rare_threshold = rare_threshold
+
+         # Learned state
+         self.fuzzy_maps_ = {}  # {col: {bad_val: good_val}}
+         self.rare_groups_ = {}  # {col: [list_of_kept_values]}
+         self.encoding_map_ = {}  # {col: {val: target_mean}}
+         self.global_means_ = {}  # {col: global_mean}
+         self.ohe_cols_ = []
+
+     def _learn_fuzzy_map(self, series: pd.Series) -> Dict[str, str]:
+         """Groups very similar strings (e.g. 'SP', 'sp', 'S. Paulo')."""
+         unique_vals = series.dropna().unique().astype(str)
+         if len(unique_vals) < 3:
+             return {}
+
+         mapping = {}
+         # Sort by length so the cleanest values are processed first (simple heuristic)
+         unique_vals = sorted(unique_vals, key=len)
+
+         covered = set()
+         for val in unique_vals:
+             if val in covered:
+                 continue
+
+             # Find similar values
+             matches = process.extract(
+                 val, unique_vals, limit=None, scorer=fuzz.token_sort_ratio
+             )
+             # Filter by threshold
+             group = [m[0] for m in matches if m[1] >= self.fuzzy_threshold]
+
+             # The group "leader" is the shortest or most frequent value
+             # (simplified here as the first one seen)
+             leader = val
+             for member in group:
+                 if member != leader:
+                     mapping[member] = leader
+                     covered.add(member)
+         return mapping
+
+     def _learn_rare_groups(self, series: pd.Series) -> List[str]:
+         """Returns the list of categories to KEEP. Everything else becomes 'OTHER'."""
+         freqs = series.value_counts(normalize=True)
+         kept = freqs[freqs >= self.rare_threshold].index.tolist()
+         return kept
+
+     def _get_smoothed_mean(self, df, col, y_name):
+         """Target encoding with smoothing."""
+         global_mean = df[y_name].mean()
+         agg = df.groupby(col)[y_name].agg(["count", "mean"])
+         counts = agg["count"]
+         means = agg["mean"]
+         smooth = (counts * means + self.smoothing * global_mean) / (
+             counts + self.smoothing
+         )
+         return smooth, global_mean
+
+     def fit(self, X: pd.DataFrame, y: pd.Series):
+         # Reset learned state so refitting does not accumulate stale entries
+         self.history = []
+         self.fuzzy_maps_ = {}
+         self.rare_groups_ = {}
+         self.encoding_map_ = {}
+         self.global_means_ = {}
+         self.ohe_cols_ = []
+
+         X = X.copy()
+         y_name = "target_internal"
+         X[y_name] = y.values
+
+         cat_cols = X.select_dtypes(include=["object", "category"]).columns
+
+         for col in cat_cols:
+             # 1. Fuzzy matching learning
+             f_map = self._learn_fuzzy_map(X[col])
+             if f_map:
+                 self.fuzzy_maps_[col] = f_map
+                 self.history.append(
+                     {"action": "fuzzy_map", "col": col, "count": len(f_map)}
+                 )
+                 # Apply locally so training continues on the cleaned values
+                 X[col] = X[col].replace(f_map)
+
+             # 2. Rare label learning
+             kept_vals = self._learn_rare_groups(X[col])
+             self.rare_groups_[col] = kept_vals
+             # Apply locally: anything not in kept_vals becomes "OTHER"
+             X.loc[~X[col].isin(kept_vals), col] = "OTHER"
+             self.history.append(
+                 {"action": "rare_group", "col": col, "kept": len(kept_vals)}
+             )
+
+             # 3. Decision: OHE vs target encoding
+             cardinality = X[col].nunique()
+             if cardinality <= self.card_threshold:
+                 self.ohe_cols_.append(col)
+                 self.history.append({"action": "ohe", "col": col})
+             else:
+                 smooth_map, global_m = self._get_smoothed_mean(X, col, y_name)
+                 self.encoding_map_[col] = smooth_map.to_dict()
+                 self.global_means_[col] = global_m
+                 self.history.append({"action": "target_encode", "col": col})
+
+         return self
+
+     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+         X = X.copy()
+
+         # 1. Apply fuzzy maps
+         for col, f_map in self.fuzzy_maps_.items():
+             if col in X.columns:
+                 X[col] = X[col].replace(f_map)
+
+         # 2. Apply rare groups
+         for col, kept_vals in self.rare_groups_.items():
+             if col in X.columns:
+                 # Anything outside the kept list becomes "OTHER"
+                 X[col] = np.where(X[col].isin(kept_vals), X[col], "OTHER")
+
+         # 3. Apply target encoding
+         for col, mapping in self.encoding_map_.items():
+             if col in X.columns:
+                 fallback = self.global_means_[col]
+                 X[col] = X[col].map(mapping).fillna(fallback)
+
+         # 4. Apply OHE
+         if self.ohe_cols_:
+             X = pd.get_dummies(X, columns=self.ohe_cols_, drop_first=True)
+
+         return X
+
+     def fit_transform(
+         self, X: pd.DataFrame, y: pd.Series = None, **fit_params
+     ) -> pd.DataFrame:
+         """K-fold target encoding to avoid data leakage."""
+         self.fit(X, y)  # learn the global maps (fuzzy, rare, encoding)
+
+         X_out = X.copy()
+
+         # First apply fuzzy and rare grouping to all of X_out for consistency,
+         # before running the K-fold target encoding
+         for col, f_map in self.fuzzy_maps_.items():
+             X_out[col] = X_out[col].replace(f_map)
+
+         for col, kept_vals in self.rare_groups_.items():
+             X_out[col] = np.where(X_out[col].isin(kept_vals), X_out[col], "OTHER")
+
+         # If there is no target encoding, just return the transformed frame
+         if not self.encoding_map_:
+             return self.transform(X)
+
+         # K-fold logic only for the target-encoded columns
+         kf = KFold(n_splits=5, shuffle=True, random_state=42)
+         y_series = y if isinstance(y, pd.Series) else pd.Series(y)
+         y_name = "target_tmp"
+
+         for train_idx, val_idx in kf.split(X_out, y_series):
+             # Training data of the fold (already cleaned of fuzzy/rare)
+             X_fold_train = X_out.iloc[train_idx].copy()
+             y_fold_train = y_series.iloc[train_idx]
+
+             # Validation data
+             X_fold_val = X_out.iloc[val_idx].copy()
+
+             df_train = X_fold_train.copy()
+             df_train[y_name] = y_fold_train.values
+
+             for col in self.encoding_map_.keys():
+                 smooth_map, global_m = self._get_smoothed_mean(df_train, col, y_name)
+                 # Apply the map learned on the train fold to the validation fold
+                 # (kf.split yields positional indices, so resolve them via the index)
+                 X_out.loc[X_out.index[val_idx], col] = (
+                     X_fold_val[col].map(smooth_map).fillna(global_m)
+                 )
+
+         # Fill any remaining holes with the global mean learned in the full fit()
+         for col in self.encoding_map_.keys():
+             X_out[col] = X_out[col].fillna(self.global_means_[col])
+
+         # Finally, OHE (post treatment)
+         if self.ohe_cols_:
+             X_out = pd.get_dummies(X_out, columns=self.ohe_cols_, drop_first=True)
+
+         return X_out
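
Two notes on the encoder above. The smoothed target encoding blends each category mean with the global mean, `(count * mean + smoothing * global_mean) / (count + smoothing)`, so sparse categories shrink toward the global average; and `fit_transform` re-learns that map per fold so a row is never encoded with its own target. A minimal end-to-end sketch of the three transformers on toy data (all names hypothetical):

```python
import pandas as pd
from scoutml import AutoCleaner, SmartImputer, AdaptiveEncoder

df = pd.DataFrame({
    "row_id": range(8),                # ID-like, should be dropped
    "city": ["SP", "sp", "S. Paulo", "Rio", "Rio", "rio", "BH", "BH"],  # messy labels
    "income": [3000, None, 4500, 5200, None, 3100, 8000, 7600],         # has nulls
})
y = pd.Series([1, 0, 1, 1, 0, 0, 1, 1])

cleaned = AutoCleaner().fit(df, y).transform(df)           # drops row_id
imputed = SmartImputer().fit(cleaned).transform(cleaned)   # median fill + is_missing_income
encoded = AdaptiveEncoder().fit_transform(imputed, y)      # fuzzy -> rare -> OHE/target enc.
print(encoded.head())
```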
@@ -0,0 +1,168 @@
+ from typing import Optional
+
+ from .analysis import InsightEngine  # imported for type hints
+ from .preprocessing import AutoCleaner, SmartImputer, AdaptiveEncoder, DateFeaturizer
+
+
+ class CodeSynthesizer:
+     def __init__(
+         self,
+         cleaner: AutoCleaner,
+         imputer: SmartImputer,
+         encoder: AdaptiveEncoder,
+         date_featurizer: Optional[DateFeaturizer] = None,
+         insight_engine: Optional[InsightEngine] = None,
+     ):
+         self.cleaner = cleaner
+         self.imputer = imputer
+         self.encoder = encoder
+         self.date_featurizer = date_featurizer
+         self.insight = insight_engine
+
+     def export_to_script(self, filepath: str = "pipeline_production.py"):
+         lines = []
+         lines.append(
+             '"""\nScoutML Generated Pipeline\nReady for Databricks/Pandas.\n"""'
+         )
+         lines.append("import pandas as pd")
+         lines.append("import numpy as np")
+         lines.append("")
+         lines.append("def run_pipeline(df: pd.DataFrame) -> pd.DataFrame:")
+         lines.append("    df = df.copy()")
+         lines.append("")
+
+         # ---------------------------------------------------------
+         # 0. Date featurizer (must run first)
+         # ---------------------------------------------------------
+         if self.date_featurizer and self.date_featurizer.date_cols_:
+             lines.append("    # [0] DateFeaturizer: Extracting time features")
+             lines.append(f"    date_cols = {self.date_featurizer.date_cols_}")
+             lines.append("    for col in date_cols:")
+             lines.append("        if col in df.columns:")
+             lines.append("            # Force datetime conversion")
+
+             # Inject the format if it exists; otherwise let pandas infer it
+             if self.date_featurizer.date_format:
+                 fmt = self.date_featurizer.date_format
+                 lines.append(
+                     f"            series = pd.to_datetime(df[col], format='{fmt}', errors='coerce')"
+                 )
+             else:
+                 lines.append(
+                     "            series = pd.to_datetime(df[col], errors='coerce')"
+                 )
+
+             lines.append("            df[f'{col}_year'] = series.dt.year")
+             lines.append("            df[f'{col}_month'] = series.dt.month")
+             lines.append("            df[f'{col}_day'] = series.dt.day")
+             lines.append("            df[f'{col}_dayofweek'] = series.dt.dayofweek")
+             lines.append(
+                 "            df[f'{col}_is_weekend'] = (series.dt.dayofweek >= 5).astype(int)"
+             )
+             lines.append("            df.drop(columns=[col], inplace=True)")
+             lines.append("")
+
+         # ---------------------------------------------------------
+         # 1. Cleaner & insight drops
+         # ---------------------------------------------------------
+         # Combine drops from AutoCleaner (technical) and InsightEngine (statistical)
+         all_drops = set(self.cleaner.cols_to_drop_)
+
+         if self.insight:
+             if self.insight.leakage_candidates_:
+                 all_drops.update(self.insight.leakage_candidates_)
+             if self.insight.useless_features_:
+                 all_drops.update(self.insight.useless_features_)
+
+         drops_list = list(all_drops)
+
+         if drops_list:
+             lines.append("    # [1] AutoCleaner & InsightEngine Drops")
+             lines.append("    # Drops: IDs, Constants, Leakage, Useless Features")
+             lines.append(f"    drop_cols = {drops_list}")
+             lines.append(
+                 "    df.drop(columns=drop_cols, errors='ignore', inplace=True)"
+             )
+             lines.append("")
+
+         # ---------------------------------------------------------
+         # 2. Imputer
+         # ---------------------------------------------------------
+         lines.append("    # [2] SmartImputer")
+         if self.imputer.missing_indicators_:
+             lines.append("    # Flagging structural missingness")
+             lines.append(f"    missing_flags = {self.imputer.missing_indicators_}")
+             lines.append("    for col in missing_flags:")
+             lines.append("        if col in df.columns:")
+             lines.append(
+                 "            df[f'is_missing_{col}'] = df[col].isnull().astype(int)"
+             )
+
+         if self.imputer.imputation_map_:
+             lines.append("    # Applying Fillna (Median/Mode)")
+             lines.append(f"    fill_values = {self.imputer.imputation_map_}")
+             lines.append("    df.fillna(value=fill_values, inplace=True)")
+         lines.append("")
+
+         # ---------------------------------------------------------
+         # 3. Encoder (fuzzy -> rare -> encode)
+         # ---------------------------------------------------------
+         lines.append("    # [3] AdaptiveEncoder")
+
+         if self.encoder.fuzzy_maps_:
+             lines.append("    # [3.1] Fuzzy Matching Correction")
+             for col, f_map in self.encoder.fuzzy_maps_.items():
+                 lines.append(f"    if '{col}' in df.columns:")
+                 lines.append(f"        f_map_{col} = {f_map}")
+                 lines.append(f"        df['{col}'] = df['{col}'].replace(f_map_{col})")
+             lines.append("")
+
+         if self.encoder.rare_groups_:
+             lines.append("    # [3.2] Rare Label Grouping (-> 'OTHER')")
+             for col, kept_vals in self.encoder.rare_groups_.items():
+                 lines.append(f"    if '{col}' in df.columns:")
+                 lines.append(f"        kept_{col} = {kept_vals}")
+                 lines.append(
+                     f"        df['{col}'] = np.where(df['{col}'].isin(kept_{col}), df['{col}'], 'OTHER')"
+                 )
+             lines.append("")
+
+         if self.encoder.encoding_map_:
+             lines.append("    # [3.3] Target Encoding (Smoothed Globals)")
+             for col, mapping in self.encoder.encoding_map_.items():
+                 fallback = self.encoder.global_means_.get(col, 0)
+                 lines.append(f"    if '{col}' in df.columns:")
+                 lines.append(f"        map_{col} = {str(mapping)}")
+                 lines.append(
+                     f"        df['{col}'] = df['{col}'].map(map_{col}).fillna({fallback})"
+                 )
+             lines.append("")
+
+         if self.encoder.ohe_cols_:
+             lines.append("    # [3.4] One-Hot Encoding")
+             lines.append(f"    ohe_cols = {self.encoder.ohe_cols_}")
+             lines.append("    # Ensure columns exist before OHE to avoid errors")
+             lines.append("    valid_ohe = [c for c in ohe_cols if c in df.columns]")
+             lines.append(
+                 "    df = pd.get_dummies(df, columns=valid_ohe, drop_first=True)"
+             )
+
+         lines.append("")
+         lines.append("    return df")
+
+         try:
+             with open(filepath, "w", encoding="utf-8") as f:
+                 f.write("\n".join(lines))
+             print(f"Pipeline successfully exported to: {filepath}")
+         except Exception as e:
+             print(f"Failed to export pipeline: {e}")
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,38 @@
+ from setuptools import setup, find_packages
+ import pathlib
+
+ # The directory where this file lives
+ here = pathlib.Path(__file__).parent.resolve()
+
+ # Read the README to use as the long description on PyPI
+ long_description = (here / "README.md").read_text(encoding="utf-8")
+
+ setup(
+     name="trailblazer-ml",  # the name used for pip install
+     version="0.1.0",
+     description="An exploratory, 'glass-box' AutoML library.",
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     url="https://github.com/gabsalles/trailblazer-ml",  # put your GitHub URL here
+     author="Gabriel Sales",
+     author_email="ggcs10@gmail.com",
+     classifiers=[
+         "Development Status :: 3 - Alpha",
+         "Intended Audience :: Developers",
+         "Topic :: Scientific/Engineering :: Artificial Intelligence",
+         "License :: OSI Approved :: MIT License",
+         "Programming Language :: Python :: 3",
+     ],
+     keywords="automl, data-science, preprocessing, cleaning",
+     packages=find_packages(),
+     python_requires=">=3.8, <4",
+     install_requires=[
+         "numpy>=1.21.0",
+         "pandas>=1.5.0",
+         "scikit-learn>=1.0.0",
+         "lightgbm>=3.3.0",
+         "rapidfuzz>=3.0.0",
+         "networkx>=3.0",
+         "plotly>=5.0.0",
+     ],
+ )
@@ -0,0 +1,41 @@
+ Metadata-Version: 2.4
+ Name: trailblazer-ml
+ Version: 0.1.0
+ Summary: An exploratory, 'glass-box' AutoML library.
+ Home-page: https://github.com/gabsalles/trailblazer-ml
+ Author: Gabriel Sales
+ Author-email: ggcs10@gmail.com
+ Keywords: automl,data-science,preprocessing,cleaning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.8, <4
+ Description-Content-Type: text/markdown
+ Requires-Dist: numpy>=1.21.0
+ Requires-Dist: pandas>=1.5.0
+ Requires-Dist: scikit-learn>=1.0.0
+ Requires-Dist: lightgbm>=3.3.0
+ Requires-Dist: rapidfuzz>=3.0.0
+ Requires-Dist: networkx>=3.0
+ Requires-Dist: plotly>=5.0.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # ScoutML 🚀
+
+ A "glass-box" AutoML library focused on exploration, cleaning, and transparent code generation.
+
+ ## Installation
+
+ ```bash
+ pip install scoutml
@@ -0,0 +1,11 @@
+ README.md
+ setup.py
+ scoutml/__init__.py
+ scoutml/analysis.py
+ scoutml/preprocessing.py
+ scoutml/transpiler.py
+ trailblazer_ml.egg-info/PKG-INFO
+ trailblazer_ml.egg-info/SOURCES.txt
+ trailblazer_ml.egg-info/dependency_links.txt
+ trailblazer_ml.egg-info/requires.txt
+ trailblazer_ml.egg-info/top_level.txt
@@ -0,0 +1,7 @@
+ numpy>=1.21.0
+ pandas>=1.5.0
+ scikit-learn>=1.0.0
+ lightgbm>=3.3.0
+ rapidfuzz>=3.0.0
+ networkx>=3.0
+ plotly>=5.0.0