featcopilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ """
2
+ FeatCopilot - Next-Generation LLM-Powered Auto Feature Engineering
3
+
4
+ A unified feature engineering framework combining traditional approaches
5
+ with novel LLM-powered capabilities via GitHub Copilot SDK.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "FeatCopilot Contributors"
10
+
11
+ from featcopilot.core.base import BaseEngine, BaseSelector
12
+ from featcopilot.core.feature import Feature, FeatureSet
13
+ from featcopilot.transformers.sklearn_compat import (
14
+ AutoFeatureEngineer,
15
+ FeatureEngineerTransformer,
16
+ )
17
+
18
+ __all__ = [
19
+ # Core
20
+ "BaseEngine",
21
+ "BaseSelector",
22
+ "Feature",
23
+ "FeatureSet",
24
+ # Main API
25
+ "AutoFeatureEngineer",
26
+ "FeatureEngineerTransformer",
27
+ # Version
28
+ "__version__",
29
+ ]
@@ -0,0 +1,13 @@
1
+ """Core module containing base classes and interfaces."""
2
+
3
+ from featcopilot.core.base import BaseEngine, BaseSelector
4
+ from featcopilot.core.feature import Feature, FeatureSet
5
+ from featcopilot.core.registry import FeatureRegistry
6
+
7
+ __all__ = [
8
+ "BaseEngine",
9
+ "BaseSelector",
10
+ "Feature",
11
+ "FeatureSet",
12
+ "FeatureRegistry",
13
+ ]
@@ -0,0 +1,195 @@
1
+ """Base classes for feature engineering engines and selectors."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Optional, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class EngineConfig(BaseModel):
    """Configuration for feature engineering engines.

    One config is shared by all engine kinds; when an engine is built
    without an explicit config, ``BaseEngine.__init__`` constructs one
    from its keyword arguments, using the subclass name as ``name``.
    """

    # Engine identifier; defaults to the subclass name (see BaseEngine.__init__).
    name: str = Field(description="Engine name")
    enabled: bool = Field(default=True, description="Whether engine is enabled")
    # None means "no cap" on the number of generated features.
    max_features: Optional[int] = Field(default=None, description="Max features to generate")
    verbose: bool = Field(default=False, description="Verbose output")
18
+
19
+
20
class BaseEngine(ABC):
    """Abstract base class for feature engineering engines.

    Concrete engines (tabular, timeseries, relational, llm) derive from
    this class and supply :meth:`fit` and :meth:`transform`.
    """

    def __init__(self, config: Optional[EngineConfig] = None, **kwargs):
        # Without an explicit config, build one from kwargs named after the subclass.
        if config is None:
            config = EngineConfig(name=self.__class__.__name__, **kwargs)
        self.config = config
        self._is_fitted = False
        self._feature_names: list[str] = []
        self._feature_metadata: dict[str, Any] = {}

    @property
    def is_fitted(self) -> bool:
        """True once the engine has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> "BaseEngine":
        """Fit the engine to the data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray, optional
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseEngine
            Fitted engine
        """

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Generate new features from the input data.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        DataFrame
            Transformed features
        """

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Optional[Union[pd.Series, np.ndarray]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        fitted = self.fit(X, y, **kwargs)
        return fitted.transform(X, **kwargs)

    def get_feature_names(self) -> list[str]:
        """Return a copy of the generated feature names."""
        return list(self._feature_names)

    def get_feature_metadata(self) -> dict[str, Any]:
        """Return a (shallow) copy of the per-feature metadata."""
        return dict(self._feature_metadata)

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """Coerce *X* into a DataFrame, raising for unsupported types."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, np.ndarray):
            # ndarrays get synthetic column names, one per column.
            names = [f"feature_{i}" for i in range(X.shape[1])]
            return pd.DataFrame(X, columns=names)
        raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
107
+
108
+
109
class SelectorConfig(BaseModel):
    """Configuration for feature selectors.

    Built from keyword arguments by ``BaseSelector.__init__`` when no
    explicit config is passed.
    """

    # None means "no cap" on the number of selected features.
    max_features: Optional[int] = Field(default=None, description="Max features to select")
    min_importance: float = Field(default=0.0, description="Minimum importance threshold")
    # Features more correlated than this may be dropped by correlation-based selectors.
    correlation_threshold: float = Field(default=0.95, description="Threshold for correlation-based elimination")
115
+
116
+
117
class BaseSelector(ABC):
    """Abstract base class for feature selection.

    Narrows a generated feature set down to the most important or
    relevant features.
    """

    def __init__(self, config: Optional[SelectorConfig] = None, **kwargs):
        # Without an explicit config, build one from the keyword arguments.
        if config is None:
            config = SelectorConfig(**kwargs)
        self.config = config
        self._is_fitted = False
        self._selected_features: list[str] = []
        self._feature_scores: dict[str, float] = {}

    @property
    def is_fitted(self) -> bool:
        """True once the selector has been fitted."""
        return self._is_fitted

    @abstractmethod
    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs) -> "BaseSelector":
        """Fit the selector to determine feature importance.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        y : Series or ndarray
            Target variable
        **kwargs : dict
            Additional parameters

        Returns
        -------
        self : BaseSelector
            Fitted selector
        """

    @abstractmethod
    def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
        """Keep only the selected features.

        Parameters
        ----------
        X : DataFrame or ndarray
            Input features
        **kwargs : dict
            Additional parameters

        Returns
        -------
        DataFrame
            Data restricted to the selected features
        """

    def fit_transform(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray], **kwargs
    ) -> pd.DataFrame:
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        fitted = self.fit(X, y, **kwargs)
        return fitted.transform(X, **kwargs)

    def get_selected_features(self) -> list[str]:
        """Return a copy of the selected feature names."""
        return list(self._selected_features)

    def get_feature_scores(self) -> dict[str, float]:
        """Return a copy of the importance scores for all features."""
        return dict(self._feature_scores)

    def _validate_input(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """Coerce *X* into a DataFrame, raising for unsupported types."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, np.ndarray):
            # ndarrays get synthetic column names, one per column.
            names = [f"feature_{i}" for i in range(X.shape[1])]
            return pd.DataFrame(X, columns=names)
        raise TypeError(f"Expected DataFrame or ndarray, got {type(X)}")
@@ -0,0 +1,224 @@
1
+ """Feature representation and feature sets."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import Any, Optional
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+
11
class FeatureType(Enum):
    """Enumeration of the data types a feature can carry."""

    NUMERIC = "numeric"          # continuous or integer values
    CATEGORICAL = "categorical"  # discrete, unordered labels
    DATETIME = "datetime"        # timestamps / dates
    TEXT = "text"                # free-form strings
    BOOLEAN = "boolean"          # two-valued flags
19
+
20
+
21
class FeatureOrigin(Enum):
    """Enumeration describing how a feature came to exist."""

    ORIGINAL = "original"            # present in the raw input
    POLYNOMIAL = "polynomial"        # polynomial transformation
    INTERACTION = "interaction"      # combination of two or more features
    AGGREGATION = "aggregation"      # aggregation operation
    TIMESERIES = "timeseries"        # time series extraction
    LLM_GENERATED = "llm_generated"  # produced directly by an LLM
    LLM_SUGGESTED = "llm_suggested"  # suggested by an LLM, implemented traditionally
    CUSTOM = "custom"                # user-defined
32
+
33
+
34
@dataclass
class Feature:
    """
    Represents a single feature with metadata.

    Attributes
    ----------
    name : str
        Feature name
    dtype : FeatureType
        Data type of feature
    origin : FeatureOrigin
        How the feature was created
    source_columns : list
        Original columns used to create this feature; defaults to
        ``[name]`` when left empty (see ``__post_init__``)
    transformation : str
        Description of transformation applied
    explanation : str, optional
        Human-readable explanation of the feature
    code : str, optional
        Python code that generates this feature; executed by
        :meth:`compute` and expected to assign to ``result``
    importance : float, optional
        Feature importance score
    metadata : dict
        Additional metadata
    """

    name: str
    dtype: FeatureType = FeatureType.NUMERIC
    origin: FeatureOrigin = FeatureOrigin.ORIGINAL
    source_columns: list[str] = field(default_factory=list)
    transformation: str = ""
    explanation: Optional[str] = None
    code: Optional[str] = None
    importance: Optional[float] = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # A feature with no declared sources is assumed to derive from itself.
        if not self.source_columns:
            self.source_columns = [self.name]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the feature to a plain dict (enums become their values)."""
        return {
            "name": self.name,
            "dtype": self.dtype.value,
            "origin": self.origin.value,
            "source_columns": self.source_columns,
            "transformation": self.transformation,
            "explanation": self.explanation,
            "code": self.code,
            "importance": self.importance,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Feature":
        """Create a feature from a dict; missing keys fall back to defaults."""
        return cls(
            name=data["name"],
            dtype=FeatureType(data.get("dtype", "numeric")),
            origin=FeatureOrigin(data.get("origin", "original")),
            source_columns=data.get("source_columns", []),
            transformation=data.get("transformation", ""),
            explanation=data.get("explanation"),
            code=data.get("code"),
            importance=data.get("importance"),
            metadata=data.get("metadata", {}),
        )

    def compute(self, df: pd.DataFrame) -> pd.Series:
        """
        Compute feature values from DataFrame using stored code.

        The stored code is executed with ``df``, ``np`` and ``pd`` in
        scope and must assign the computed values to a variable named
        ``result``.

        Parameters
        ----------
        df : DataFrame
            Input data

        Returns
        -------
        Series
            Computed feature values

        Raises
        ------
        ValueError
            If no code is stored, or if the code ran but did not assign
            ``result``.
        """
        if not self.code:
            raise ValueError(f"No code defined for feature {self.name}")
        # SECURITY NOTE(review): exec of stored code — only run code from
        # trusted sources. Stripping __builtins__ is not a sandbox.
        local_vars = {"df": df, "np": np, "pd": pd}
        exec(self.code, {"__builtins__": {}}, local_vars)
        # Previously a code snippet that ran but never bound `result` fell
        # through to the misleading "No code defined" error; raise a
        # distinct, accurate error instead.
        if "result" not in local_vars:
            raise ValueError(
                f"Code for feature {self.name} did not assign a 'result' variable"
            )
        return local_vars["result"]
125
+
126
+
127
class FeatureSet:
    """
    Collection of :class:`Feature` objects keyed by name.

    Provides methods for adding, removing, filtering, merging and bulk
    computation. Adding a feature whose name already exists replaces
    the previous entry.
    """

    def __init__(self, features: Optional[list[Feature]] = None):
        self._features: dict[str, Feature] = {}
        if features:
            for feature in features:
                self.add(feature)

    def __len__(self) -> int:
        return len(self._features)

    def __iter__(self):
        return iter(self._features.values())

    def __contains__(self, name: str) -> bool:
        return name in self._features

    def __getitem__(self, name: str) -> Feature:
        return self._features[name]

    def add(self, feature: Feature) -> None:
        """Add *feature*, replacing any existing feature with the same name."""
        self._features[feature.name] = feature

    def remove(self, name: str) -> Optional[Feature]:
        """Remove and return the named feature, or None if absent."""
        return self._features.pop(name, None)

    def get(self, name: str) -> Optional[Feature]:
        """Return the named feature, or None if absent."""
        return self._features.get(name)

    def get_names(self) -> list[str]:
        """Return all feature names (insertion order)."""
        return list(self._features.keys())

    def filter_by_origin(self, origin: FeatureOrigin) -> "FeatureSet":
        """Return a new set containing only features with the given origin."""
        return FeatureSet([f for f in self._features.values() if f.origin == origin])

    def filter_by_type(self, dtype: FeatureType) -> "FeatureSet":
        """Return a new set containing only features with the given dtype."""
        return FeatureSet([f for f in self._features.values() if f.dtype == dtype])

    def filter_by_importance(self, min_importance: float) -> "FeatureSet":
        """Return a new set of features with a known importance >= the threshold."""
        return FeatureSet(
            [f for f in self._features.values() if f.importance is not None and f.importance >= min_importance]
        )

    def sort_by_importance(self, descending: bool = True) -> list[Feature]:
        """Return features with a known importance, sorted by importance."""
        # Features without a score are excluded rather than sorted to the end.
        scored = [f for f in self._features.values() if f.importance is not None]
        return sorted(scored, key=lambda f: f.importance, reverse=descending)

    def merge(self, other: "FeatureSet") -> "FeatureSet":
        """Return a new set with this set's features plus *other*'s.

        On a name collision the feature from *other* wins.
        """
        merged = FeatureSet(list(self._features.values()))
        for feature in other:
            merged.add(feature)
        return merged

    def to_dataframe(self) -> pd.DataFrame:
        """Return the feature metadata as a DataFrame, one row per feature."""
        return pd.DataFrame([f.to_dict() for f in self._features.values()])

    def get_explanations(self) -> dict[str, str]:
        """Map feature name -> explanation, for features that have one."""
        return {f.name: f.explanation for f in self._features.values() if f.explanation}

    def compute_all(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute all features that have code defined.

        Parameters
        ----------
        df : DataFrame
            Input data

        Returns
        -------
        DataFrame
            Copy of *df* extended with the computed feature columns.
            Features that fail to compute are skipped with a warning
            (best-effort semantics preserved).
        """
        # Local import: library code should warn via the warnings machinery,
        # not print to stdout; kept local so the module's imports are unchanged.
        import warnings

        result = df.copy()
        for feature in self._features.values():
            # Skip features without code; never clobber an existing column.
            if not feature.code or feature.name in result.columns:
                continue
            try:
                result[feature.name] = feature.compute(df)
            except Exception as e:  # one bad feature must not abort the rest
                warnings.warn(f"Could not compute feature {feature.name}: {e}")
        return result
@@ -0,0 +1,128 @@
1
+ """Feature registry for tracking and managing features."""
2
+
3
+ from typing import Callable, Optional
4
+
5
+ from featcopilot.core.feature import Feature, FeatureOrigin
6
+
7
+
8
class FeatureRegistry:
    """
    Global registry for feature definitions and generators.

    Provides registration and lookup of:
    - Feature transformation functions
    - Feature generator classes
    - Custom feature definitions
    """

    _instance: Optional["FeatureRegistry"] = None
    # Class-level declarations. The singleton instance shadows
    # _transformations with its own dict in _init_default_transformations;
    # _generators is shared (mutated in place via the singleton).
    _transformations: dict[str, Callable] = {}
    _generators: dict[str, type] = {}

    # Code templates for the built-in transformations, keyed by name.
    # Feature.compute executes stored code with only ``df``/``np``/``pd``
    # in scope, so generated code must be expressed in terms of numpy —
    # emitting e.g. "result = log(df['x'])" would raise NameError there.
    _CODE_TEMPLATES: dict[str, str] = {
        "log": "np.log1p(np.abs({col}))",
        "log10": "np.log10(np.abs({col}) + 1)",
        "sqrt": "np.sqrt(np.abs({col}))",
        "square": "{col}**2",
        "cube": "{col}**3",
        "reciprocal": "1 / ({col} + 1e-8)",
        "abs": "np.abs({col})",
        "sign": "np.sign({col})",
        "exp": "np.exp(np.clip({col}, -50, 50))",
        "sin": "np.sin({col})",
        "cos": "np.cos({col})",
        "tanh": "np.tanh({col})",
    }

    def __new__(cls) -> "FeatureRegistry":
        """Singleton pattern for global registry."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._init_default_transformations()
        return cls._instance

    def _init_default_transformations(self) -> None:
        """Initialize default transformation functions."""
        import numpy as np

        # abs/log use np.abs guards so negative inputs do not produce NaN.
        self._transformations = {
            "log": lambda x: np.log1p(np.abs(x)),
            "log10": lambda x: np.log10(np.abs(x) + 1),
            "sqrt": lambda x: np.sqrt(np.abs(x)),
            "square": lambda x: x**2,
            "cube": lambda x: x**3,
            "reciprocal": lambda x: 1 / (x + 1e-8),
            "abs": lambda x: np.abs(x),
            "sign": lambda x: np.sign(x),
            "exp": lambda x: np.exp(np.clip(x, -50, 50)),
            "sin": lambda x: np.sin(x),
            "cos": lambda x: np.cos(x),
            "tanh": lambda x: np.tanh(x),
        }

    def register_transformation(self, name: str, func: Callable) -> None:
        """
        Register a transformation function.

        Parameters
        ----------
        name : str
            Name of transformation
        func : callable
            Function that takes array and returns transformed array
        """
        self._transformations[name] = func

    def get_transformation(self, name: str) -> Optional[Callable]:
        """Get a registered transformation by name, or None if unknown."""
        return self._transformations.get(name)

    def list_transformations(self) -> list[str]:
        """List all registered transformation names."""
        return list(self._transformations.keys())

    def register_generator(self, name: str, generator_class: type) -> None:
        """
        Register a feature generator class.

        Parameters
        ----------
        name : str
            Name of generator
        generator_class : type
            Class that generates features
        """
        self._generators[name] = generator_class

    def get_generator(self, name: str) -> Optional[type]:
        """Get a registered generator by name, or None if unknown."""
        return self._generators.get(name)

    def list_generators(self) -> list[str]:
        """List all registered generator names."""
        return list(self._generators.keys())

    def create_feature(self, name: str, transformation: str, source_columns: list[str], **kwargs) -> Feature:
        """
        Create a feature using a registered transformation.

        Parameters
        ----------
        name : str
            Feature name
        transformation : str
            Name of registered transformation
        source_columns : list
            Columns used in transformation; the first column is the one
            the generated code operates on
        **kwargs : dict
            Additional feature attributes (may include ``origin`` to
            override the default)

        Returns
        -------
        Feature
            Created feature object

        Raises
        ------
        ValueError
            If the transformation is unknown or ``source_columns`` is empty.
        """
        func = self.get_transformation(transformation)
        if func is None:
            raise ValueError(f"Unknown transformation: {transformation}")
        if not source_columns:
            raise ValueError("source_columns must contain at least one column")

        col_expr = f"df['{source_columns[0]}']"
        template = self._CODE_TEMPLATES.get(transformation)
        if template is not None:
            # Built-in: emit numpy-based code runnable by Feature.compute.
            code = f"result = {template.format(col=col_expr)}"
        else:
            # Custom transformation: fall back to calling it by name; the
            # executing namespace must then provide that name.
            code = f"result = {transformation}({col_expr})"

        # Default origin kept for backward compatibility; previously a
        # caller passing origin= hit a duplicate-keyword TypeError because
        # origin was hard-coded in the Feature(...) call.
        kwargs.setdefault("origin", FeatureOrigin.POLYNOMIAL)

        return Feature(
            name=name,
            source_columns=source_columns,
            transformation=transformation,
            code=code,
            **kwargs,
        )


# Global registry instance
registry = FeatureRegistry()
@@ -0,0 +1,13 @@
1
+ """Feature engineering engines."""
2
+
3
+ from featcopilot.engines.relational import RelationalEngine
4
+ from featcopilot.engines.tabular import TabularEngine
5
+ from featcopilot.engines.text import TextEngine
6
+ from featcopilot.engines.timeseries import TimeSeriesEngine
7
+
8
+ __all__ = [
9
+ "TabularEngine",
10
+ "TimeSeriesEngine",
11
+ "RelationalEngine",
12
+ "TextEngine",
13
+ ]