PyPI - InsideForest - Versions diffs - 0.3.2__tar.gz → 0.3.3__tar.gz - Mend

InsideForest 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/cluster_selector.py RENAMED Viewed

@@ -86,6 +86,9 @@ def select_clusters(
         ponderador = regla['ponderador']
         cluster = regla['cluster']
+        missing_cols = [col for col in variables if col not in df_datos.columns]
+        if missing_cols:
+            raise KeyError(f"Columns not found in df_datos: {missing_cols}")
         X_datos = df_datos[variables]
         condiciones = [
             (X_datos[var].to_numpy() >= linf[var]) & (X_datos[var].to_numpy() <= lsup[var])

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/inside_forest.py RENAMED Viewed

@@ -1,14 +1,93 @@
-import pandas as pd
+from typing import Optional, Dict, Any, List, Tuple
+import numpy as np
 import joblib
+try:
+    import pandas as pd
+    _HAS_PANDAS = True
+except Exception:
+    _HAS_PANDAS = False
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.exceptions import NotFittedError
 from sklearn.utils.validation import check_is_fitted
+from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression
+from sklearn.utils.multiclass import type_of_target
 from .trees import Trees
 from .regions import Regions
 from .descrip import get_frontiers
+# ---------- FAST helpers ----------
+def _size_bucket(n: int, d: int) -> str:
+    prod = n * d
+    if prod <= 50_000:
+        return "small"
+    elif prod <= 200_000:
+        return "medium"
+    elif prod <= 1_000_000:
+        return "large"
+    else:
+        return "huge"
+def _choose_k_features(n: int, d: int) -> int:
+    bucket = _size_bucket(n, d)
+    if bucket == "small":
+        k = min(d, 64)
+    elif bucket == "medium":
+        k = min(d, 48)
+    elif bucket == "large":
+        k = min(d, 32)
+    else:
+        k = min(d, 24)
+    return max(8, k)
+def _choose_fast_params(n: int, d: int) -> Dict[str, Any]:
+    bucket = _size_bucket(n, d)
+    if bucket == "small":
+        rf_params = dict(n_estimators=80, max_depth=12, min_samples_leaf=3, n_jobs=-1, random_state=42)
+        tree_params = dict(percentil=98, low_frac=0.02)
+        divide = 3
+        method = "menu"
+    elif bucket == "medium":
+        rf_params = dict(n_estimators=60, max_depth=10, min_samples_leaf=5, n_jobs=-1, random_state=42)
+        tree_params = dict(percentil=99, low_frac=0.01)
+        divide = 3
+        method = "menu"
+    elif bucket == "large":
+        rf_params = dict(n_estimators=40, max_depth=9, min_samples_leaf=8, n_jobs=-1, random_state=42)
+        tree_params = dict(percentil=99.5, low_frac=0.0075)
+        divide = 3
+        method = "menu"
+    else:
+        rf_params = dict(n_estimators=30, max_depth=8, min_samples_leaf=10, n_jobs=-1, random_state=42)
+        tree_params = dict(percentil=99.7, low_frac=0.005)
+        divide = 3
+        method = "menu"
+    return dict(
+        rf_params=rf_params,
+        tree_params=tree_params,
+        divide=divide,
+        method=method,
+        get_detail=False,
+    )
+def _merge_dicts(base: Dict[str, Any], override: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+    if not override:
+        return base
+    out = dict(base)
+    for k, v in override.items():
+        if isinstance(v, dict) and isinstance(out.get(k), dict):
+            out[k] = {**out[k], **v}
+        else:
+            out[k] = v
+    return out
 class _BaseInsideForest:
     """Internal base class handling shared ``fit`` and ``predict`` logic.
@@ -67,7 +146,12 @@ class _BaseInsideForest:
         get_detail=False,
         leaf_percentile=95,
         low_leaf_fraction=0.05,
+        auto_fast=False,
+        auto_feature_reduce=False,
+        explicit_k_features: Optional[int] = None,
+        fast_overrides: Optional[Dict[str, Any]] = None,
     ):
+        self.rf_cls = rf_cls
         self.rf_params = rf_params or {}
         self.tree_params = tree_params or {}
         self.var_obj = var_obj
@@ -79,6 +163,19 @@ class _BaseInsideForest:
         self.leaf_percentile = leaf_percentile
         self.low_leaf_fraction = low_leaf_fraction
+        # FAST knobs
+        self.auto_fast = auto_fast
+        self.auto_feature_reduce = auto_feature_reduce
+        self.explicit_k_features = explicit_k_features
+        self.fast_overrides = fast_overrides or {}
+        # FAST bookkeeping
+        self._feature_mask_: Optional[np.ndarray] = None
+        self.feature_names_in_: Optional[List[str]] = None
+        self.feature_names_out_: Optional[List[str]] = None
+        self._size_bucket_: Optional[str] = None
+        self._fast_params_used_: Optional[Dict[str, Any]] = None
         # Ensure tree parameters include the percentile settings
         self.tree_params.setdefault("percentil", leaf_percentile)
         self.tree_params.setdefault("low_frac", low_leaf_fraction)
@@ -121,6 +218,10 @@ class _BaseInsideForest:
             "get_detail": self.get_detail,
             "leaf_percentile": self.leaf_percentile,
             "low_leaf_fraction": self.low_leaf_fraction,
+            "auto_fast": self.auto_fast,
+            "auto_feature_reduce": self.auto_feature_reduce,
+            "explicit_k_features": self.explicit_k_features,
+            "fast_overrides": self.fast_overrides,
         }
     def set_params(self, **params):
@@ -164,6 +265,10 @@ class _BaseInsideForest:
                 "get_detail",
                 "leaf_percentile",
                 "low_leaf_fraction",
+                "auto_fast",
+                "auto_feature_reduce",
+                "explicit_k_features",
+                "fast_overrides",
             }:
                 setattr(self, key, value)
                 if key == "leaf_percentile":
@@ -177,6 +282,59 @@ class _BaseInsideForest:
         return self
+    def _maybe_reduce_features(self, X, y=None):
+        """Optionally reduce features; preserve original column names if DataFrame."""
+        if not self.auto_feature_reduce:
+            if _HAS_PANDAS and isinstance(X, pd.DataFrame):
+                self.feature_names_in_ = list(X.columns)
+                self.feature_names_out_ = list(X.columns)
+            else:
+                self.feature_names_in_ = None
+                self.feature_names_out_ = None
+            self._feature_mask_ = None
+            return X
+        n, d = X.shape
+        k = (
+            self.explicit_k_features
+            if self.explicit_k_features is not None
+            else _choose_k_features(n, d)
+        )
+        k = min(k, d)
+        is_df = _HAS_PANDAS and isinstance(X, pd.DataFrame)
+        self.feature_names_in_ = list(X.columns) if is_df else None
+        X_arr = X.values if is_df else np.asarray(X)
+        support = None
+        if y is not None:
+            try:
+                ytype = type_of_target(y)
+            except Exception:
+                ytype = None
+            if ytype in {"binary", "multiclass"}:
+                sel = SelectKBest(mutual_info_classif, k=k).fit(X_arr, y)
+                support = sel.get_support()
+            elif ytype in {"continuous", "continuous-multioutput"}:
+                sel = SelectKBest(mutual_info_regression, k=k).fit(X_arr, y)
+                support = sel.get_support()
+        if support is None:
+            variances = X_arr.var(axis=0)
+            idx_sorted = np.argsort(-variances)[:k]
+            support = np.zeros(X_arr.shape[1], dtype=bool)
+            support[idx_sorted] = True
+        self._feature_mask_ = support
+        if is_df:
+            cols = np.array(self.feature_names_in_)
+            keep_cols = cols[support].tolist()
+            self.feature_names_out_ = keep_cols
+            return X[keep_cols]
+        else:
+            self.feature_names_out_ = None
+            return X_arr[:, support]
     def fit(self, X, y=None, rf=None):
         """Fit the internal random forest and compute cluster labels.
@@ -220,13 +378,53 @@ class _BaseInsideForest:
         else:
             if isinstance(X, pd.DataFrame):
                 X_df = X.copy()
+                if self.var_obj in X_df.columns:
+                    X_df = X_df.drop(columns=[self.var_obj])
             else:
                 X_df = pd.DataFrame(data=X)
         # Replace spaces with underscores to keep compatibility with Trees
         X_df.columns = [str(c).replace(" ", "_") for c in X_df.columns]
+        # 0) Feature reduction (optional)
+        Xr = self._maybe_reduce_features(X_df, y)
+        if _HAS_PANDAS and isinstance(Xr, pd.DataFrame):
+            X_df = Xr
+        else:
+            X_df = pd.DataFrame(Xr)
         self.feature_names_ = list(X_df.columns)
+        # 1) Fast preset (optional)
+        if self.auto_fast:
+            n, d = X_df.shape
+            auto = _choose_fast_params(n, d)
+            combined = dict(auto)
+            if isinstance(self.rf_params, dict):
+                combined["rf_params"] = {**auto["rf_params"], **self.rf_params}
+            if isinstance(self.tree_params, dict):
+                combined["tree_params"] = {**auto["tree_params"], **self.tree_params}
+            if hasattr(self, "divide"):
+                combined["divide"] = getattr(self, "divide", auto["divide"])
+            if hasattr(self, "method"):
+                combined["method"] = "menu" if y is not None else getattr(self, "method", auto["method"])
+            if hasattr(self, "get_detail"):
+                combined["get_detail"] = getattr(self, "get_detail", auto["get_detail"])
+            combined = _merge_dicts(combined, self.fast_overrides)
+            self._fast_params_used_ = combined
+            self._size_bucket_ = _size_bucket(n, d)
+            self.rf_params = combined.get("rf_params", self.rf_params)
+            self.tree_params = combined.get("tree_params", self.tree_params)
+            self.divide = combined.get("divide", self.divide)
+            self.method = combined.get("method", self.method)
+            self.get_detail = combined.get("get_detail", self.get_detail)
+            self.rf = self.rf_cls(**self.rf_params)
+            self.trees = Trees(**self.tree_params)
         # Allow passing a custom random forest estimator
         if rf is not None:
             self.rf = rf
@@ -518,6 +716,10 @@ class InsideForestClassifier(_BaseInsideForest):
         get_detail=False,
         leaf_percentile=95,
         low_leaf_fraction=0.05,
+        auto_fast=False,
+        auto_feature_reduce=False,
+        explicit_k_features: Optional[int] = None,
+        fast_overrides: Optional[Dict[str, Any]] = None,
     ):
         super().__init__(
             RandomForestClassifier,
@@ -531,6 +733,10 @@ class InsideForestClassifier(_BaseInsideForest):
             get_detail=get_detail,
             leaf_percentile=leaf_percentile,
             low_leaf_fraction=low_leaf_fraction,
+            auto_fast=auto_fast,
+            auto_feature_reduce=auto_feature_reduce,
+            explicit_k_features=explicit_k_features,
+            fast_overrides=fast_overrides,
         )
@@ -550,6 +756,10 @@ class InsideForestRegressor(_BaseInsideForest):
         get_detail=False,
         leaf_percentile=95,
         low_leaf_fraction=0.05,
+        auto_fast=False,
+        auto_feature_reduce=False,
+        explicit_k_features: Optional[int] = None,
+        fast_overrides: Optional[Dict[str, Any]] = None,
     ):
         super().__init__(
             RandomForestRegressor,
@@ -563,6 +773,10 @@ class InsideForestRegressor(_BaseInsideForest):
             get_detail=get_detail,
             leaf_percentile=leaf_percentile,
             low_leaf_fraction=low_leaf_fraction,
+            auto_fast=auto_fast,
+            auto_feature_reduce=auto_feature_reduce,
+            explicit_k_features=explicit_k_features,
+            fast_overrides=fast_overrides,
         )

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/labels.py RENAMED Viewed

@@ -129,6 +129,12 @@ class Labels:
         upper_bounds = sub_df.loc[row_index, 'lsup'].copy()
         variables = list(upper_bounds.index)
+        # Early exit when there are no variables to filter on.
+        # Returning an empty DataFrame prevents index errors when constructing
+        # boolean conditions on an empty list of variables.
+        if len(variables) == 0:
+            return df.iloc[0:0]
         conditions = [
             (df[var] <= upper_bounds[var]) & (df[var] > lower_bounds[var])
             for var in variables

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/metadata.py RENAMED Viewed

@@ -297,7 +297,12 @@ def conditions_to_tokens(conds: list[str]) -> set[str]:
 # 2. GENERADOR DE EXPERIMENTOS PARA UN SOLO Df2
 # ------------------------------------------------------------------ #
-def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
+def experiments_from_df2(
+    df2: pd.DataFrame,
+    meta: pd.DataFrame,
+    df_data: pd.DataFrame | None = None,
+    var_obj: str = "target",
+) -> pd.DataFrame:
     """Generate pairwise cluster comparisons for a single Df2.
     Parameters
@@ -307,13 +312,20 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
         ``cluster_ef_sample`` and ``cluster_n_sample`` columns.
     meta : pd.DataFrame
         Metadata indexed by ``rule_token`` providing actionability metrics.
+    df_data : pd.DataFrame, optional
+        Raw dataset used to compute cluster descriptions. If provided, the
+        effectiveness and number of observations for the intersection between
+        clusters is computed by applying the shared rules to ``df_data``.
+    var_obj : str, default "target"
+        Name of the target column inside ``df_data`` used to estimate
+        effectiveness.
     Returns
     -------
     pd.DataFrame
         Each row contains the comparison between two clusters along with the
-        exclusive variables and a score penalizing difficult actions and
-        rewarding overlap.
+        exclusive variables, intersection statistics and a score penalizing
+        difficult actions and rewarding overlap.
     """
     # --- action table -----------------------------
     meta_idx = meta.set_index('rule_token')
@@ -328,6 +340,28 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
         only_a     = sorted(conds_a - conds_b)
         only_b     = sorted(conds_b - conds_a)
+        # -------------------- intersection stats -----------------------
+        inter_ef = None
+        inter_n = None
+        if df_data is not None and inters:
+            def _apply_conditions(data, conds):
+                for cond in conds:
+                    match = re.match(
+                        r"\s*([-\d\.eE]+)\s*<=\s*([A-Za-z_][A-Za-z0-9_]*)\s*<=\s*([-\d\.eE]+)",
+                        str(cond),
+                    )
+                    if match and match.group(2) in data.columns:
+                        low = float(match.group(1))
+                        high = float(match.group(3))
+                        var = match.group(2)
+                        data = data[(data[var] >= low) & (data[var] <= high)]
+                return data
+            df_inter = _apply_conditions(df_data.copy(), inters)
+            inter_n = int(df_inter.shape[0])
+            if var_obj in df_inter.columns and inter_n > 0:
+                inter_ef = float(df_inter[var_obj].mean())
         # Determine which cluster has lower effectiveness
         delta_ef   = row_a['cluster_ef_sample'] - row_b['cluster_ef_sample']
         row_a_subset, row_b_subset = (row_a, row_b) if delta_ef < 0 else (row_b, row_a)
@@ -378,6 +412,8 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
             'n_only_a'           : n_only_a,
             'n_only_b'           : n_only_b,
             'intersection'       : inters,
+            'intersection_ef_sample': inter_ef,
+            'intersection_n_sample' : inter_n,
             'only_cluster_a'     : only_subset_a,
             'only_cluster_b'     : only_subset_b,
             'score'              : score,
@@ -390,7 +426,11 @@ def experiments_from_df2(df2: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
 # ------------------------------------------------------------------ #
 # 3. PIPELINE GENERAL PARA «n» Df2
 # ------------------------------------------------------------------ #
-def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
+def run_experiments(
+    mx,
+    df2_dict: dict[str, pd.DataFrame],
+    data_dict: dict[str, pd.DataFrame] | None = None,
+) -> pd.DataFrame:
     """Generate and consolidate hypotheses for multiple datasets.
     Parameters
@@ -399,6 +439,10 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
         Instance used to extract metadata from ``cluster_descripcion`` fields.
     df2_dict : dict[str, pd.DataFrame]
         Mapping of dataset name to its corresponding Df2 table.
+    data_dict : dict[str, pd.DataFrame], optional
+        Optional mapping of dataset name to the raw data used to create each
+        Df2. When provided, intersection effectiveness and observation counts
+        are computed using these DataFrames.
     Returns
     -------
@@ -410,7 +454,8 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
     for name, df2 in df2_dict.items():
         df1  = mx.extract(df2)
-        hypo = experiments_from_df2(df2, df1)
+        df_raw = data_dict.get(name) if data_dict else None
+        hypo = experiments_from_df2(df2, df1, df_raw, var_obj=getattr(mx, "var_obj", "target"))
         if not hypo.empty:
             hypo['dataset'] = name
@@ -427,6 +472,7 @@ def run_experiments(mx, df2_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
                 'variables_a', 'variables_b', 'variables_intersection',
                 'difficulty_a', 'difficulty_b', 'n_intersection',
                 'n_only_a', 'n_only_b', 'intersection',
+                'intersection_ef_sample', 'intersection_n_sample',
                 'only_cluster_a', 'only_cluster_b', 'score']
         return pd.DataFrame(columns=cols)

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/models.py RENAMED Viewed

@@ -44,7 +44,9 @@ class Models:
     X = df.drop(columns=[target_col]).values
     y = df.loc[:, target_col].values
-    for k in range(1,int(len(df))):
+    fp = fn = 0
+    y_pred = None
+    for k in range(1, int(len(df))):
       try:
         knn = KNeighborsClassifier(n_neighbors=k)
         knn.fit(X, y)
@@ -55,17 +57,20 @@ class Models:
         break
       tn, fp, fn, tp = cm.ravel()
       if criterio_fp:
-        if fp>min_obs:
+        if fp > min_obs:
           break
       else:
-        if fn>min_obs:
+        if fn > min_obs:
           break
-    if fn>0:
+    if y_pred is None:
+      return df.iloc[0:0], df
+    if fn > 0:
       false_negatives = (y == 1) & (y_pred == 0)
       return df[false_negatives], df[~false_negatives]
-    if fp>0:
+    if fp > 0:
       false_positives = (y == 0) & (y_pred == 1)
       return df[false_positives], df[~false_positives]
+    return df.iloc[0:0], df
   def get_cvRF(self, X_train, y_train, param_grid):
     """Grid-search a RandomForest classifier.

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: InsideForest
-Version: 0.3.2
+Version: 0.3.3
 Summary: A comprehensive library for describing and analyzing data insights via AI
 Home-page: https://github.com/jcval94/InsideForest.git
 Author: Jose Carlos Del Valle

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/SOURCES.txt RENAMED Viewed

@@ -18,6 +18,7 @@ experiments/__init__.py
 experiments/benchmark.py
 experiments/benchmark_get_rangos.py
 experiments/rf_param_benchmark.py
+experiments/select_clusters_hyperparam.py
 experiments/summary_benchmark.py
 tests/test_chimera_values_selector.py
 tests/test_cluster_selector.py
@@ -28,4 +29,6 @@ tests/test_inside_forest_fit_predict.py
 tests/test_inside_forest_params.py
 tests/test_inside_forest_regressor_fit_predict.py
 tests/test_iou_equivalence.py
+tests/test_metadata_run_experiments.py
+tests/test_models.py
 tests/test_trees.py

{insideforest-0.3.2 → insideforest-0.3.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: InsideForest
-Version: 0.3.2
+Version: 0.3.3
 Summary: A comprehensive library for describing and analyzing data insights via AI
 Home-page: https://github.com/jcval94/InsideForest.git
 Author: Jose Carlos Del Valle

{insideforest-0.3.2 → insideforest-0.3.3}/README.md RENAMED Viewed

@@ -74,6 +74,21 @@ pred_labels = in_f.predict(X_rest)  # cluster labels for the remaining data
 training_labels = in_f.labels_  # labels for the training subset
 ```
+### FAST presets and feature reduction
+InsideForest can automatically pick faster training parameters and reduce
+features based on dataset size:
+```python
+in_f = InsideForestClassifier(auto_fast=True, auto_feature_reduce=True)
+in_f.fit(X_train, y_train)
+```
+Use `explicit_k_features` to fix the number of retained features and
+`fast_overrides` to tweak the automatic presets. After fitting, the
+attributes `_feature_mask_`, `feature_names_in_`, `feature_names_out_`,
+`_size_bucket_`, and `_fast_params_used_` reveal the applied settings.
 You can control how final cluster labels are consolidated through the
 `method` parameter. Available strategies are:
@@ -221,12 +236,25 @@ Compares clusters A and B using the rules provided by a row from the experiments
 ## Experiments
 The `experiments/benchmark.py` module runs supervised clustering
-benchmarks on a medium sized dataset (`Digits`) and on a synthetically
-generated large dataset. It compares `InsideForest` with traditional
-baselines like KMeans and DBSCAN, reporting purity, macro F1-score and
-runtime for each method. It also performs a basic sensitivity analysis
-on key hyperparameters: `K` for KMeans and `eps`/`min_samples` for
-DBSCAN.
+benchmarks on datasets such as `Digits`, `Iris` and `Wine`. It compares
+`InsideForest` with traditional baselines like KMeans and DBSCAN,
+reporting purity, macro F1-score, accuracy, information-theoretic
+metrics and runtime. A basic sensitivity analysis is also provided for
+key hyperparameters: `K` for KMeans and `eps`/`min_samples` for DBSCAN.
+Recent results are summarized below:
+| Dataset | Algorithm | Purity | Macro F1 | Accuracy | NMI | AMI | ARI | Bcubed F1 | Divergence | Time (s) |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| Digits | InsideForest | 0.783 | 0.362 | 0.261 | 0.501 | 0.339 | 0.169 | 0.218 | 0.789 | 39.570 |
+| Digits | KMeans(k=10) | 0.673 | 0.620 | 0.666 | 0.672 | 0.669 | 0.531 | 0.633 | 0.711 | 0.047 |
+| Digits | DBSCAN(eps=0.5,min=5) | 0.102 | 0.018 | 0.102 | 0.000 | 0.000 | 0.000 | 0.182 | 0.000 | 0.014 |
+| Iris | InsideForest | 0.714 | 0.581 | 0.673 | 0.511 | 0.481 | 0.445 | 0.680 | 0.388 | 0.990 |
+| Iris | KMeans(k=3) | 0.667 | 0.531 | 0.580 | 0.590 | 0.584 | 0.433 | 0.710 | 0.427 | 0.002 |
+| Iris | DBSCAN(eps=0.5,min=5) | 0.680 | 0.674 | 0.680 | 0.511 | 0.505 | 0.442 | 0.651 | 0.402 | 0.002 |
+| Wine | InsideForest | 0.810 | 0.511 | 0.422 | 0.398 | 0.285 | 0.248 | 0.484 | 0.495 | 3.308 |
+| Wine | KMeans(k=3) | 0.966 | 0.967 | 0.966 | 0.876 | 0.875 | 0.897 | 0.937 | 0.628 | 0.004 |
+| Wine | DBSCAN(eps=0.5,min=5) | 0.399 | 0.190 | 0.399 | 0.000 | 0.000 | 0.000 | 0.509 | 0.000 | 0.002 |
 Execute the script with:
@@ -254,6 +282,29 @@ os.environ["OPENAI_API_KEY"] = "sk-your-key"
 res = generate_descriptions(iris_conds, OPENAI_API_KEY=os.getenv("OPENAI_API_KEY"))
 ```
+You can also interact with the OpenAI API directly:
+```python
+from openai import OpenAI
+import os
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+response = client.chat.completions.create(
+    model="gpt-4o-mini",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {
+            "role": "user",
+            "content": (
+                "Summarize: 4.3 <= sepal length (cm) <= 5.8 and "
+                "1.0 <= petal width (cm) <= 1.8"
+            ),
+        },
+    ],
+)
+print(response.choices[0].message.content)
+```
 ### `categorize_conditions`
 ```python

insideforest-0.3.3/experiments/select_clusters_hyperparam.py ADDED Viewed

@@ -0,0 +1,86 @@
+import itertools
+import time
+from typing import Dict, List
+import pandas as pd
+from sklearn.datasets import load_iris, load_wine
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from InsideForest import InsideForestClassifier
+from experiments.benchmark import _evaluate
+def _prepare_data(loader):
+    X, y = loader(return_X_y=True)
+    X = StandardScaler().fit_transform(X)
+    return train_test_split(X, y, train_size=0.35, stratify=y, random_state=42)
+def run_experiments() -> pd.DataFrame:
+    datasets = {
+        "iris": load_iris,
+        "wine": load_wine,
+    }
+    param_grid = {
+        "divide": [3, 5, 7],
+        "leaf_percentile": [85, 90, 95],
+        "low_leaf_fraction": [0.01, 0.03, 0.05],
+    }
+    rows: List[Dict] = []
+    for ds_name, loader in datasets.items():
+        X_train, X_test, y_train, y_test = _prepare_data(loader)
+        for divide, leaf, low_frac in itertools.product(
+            param_grid["divide"],
+            param_grid["leaf_percentile"],
+            param_grid["low_leaf_fraction"],
+        ):
+            clf = InsideForestClassifier(
+                method="select_clusters",
+                divide=divide,
+                get_detail=False,
+                leaf_percentile=leaf,
+                low_leaf_fraction=low_frac,
+            )
+            start = time.time()
+            clf.fit(X_train, y_train)
+            preds = clf.predict(X_test)
+            runtime = time.time() - start
+            name = f"{ds_name}_div{divide}_leaf{leaf}_low{low_frac}"
+            metrics = _evaluate(y_test, preds, runtime, name).as_dict()
+            metrics.update(
+                {
+                    "dataset": ds_name,
+                    "divide": divide,
+                    "leaf_percentile": leaf,
+                    "low_leaf_fraction": low_frac,
+                }
+            )
+            rows.append(metrics)
+    return pd.DataFrame(rows)
+def main() -> None:
+    df = run_experiments()
+    cols = [
+        "dataset",
+        "divide",
+        "leaf_percentile",
+        "low_leaf_fraction",
+        "purity",
+        "macro_f1",
+        "accuracy",
+        "nmi",
+        "bcubed_f1",
+        "runtime",
+    ]
+    print(df[cols].to_string(index=False))
+if __name__ == "__main__":
+    main()

{insideforest-0.3.2 → insideforest-0.3.3}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name='InsideForest',
-    version='0.3.2',
+    version='0.3.3',
     packages=find_packages(),
     license='MIT',
     license_files=['LICENSE'],

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_cluster_selector.py RENAMED Viewed

@@ -2,6 +2,7 @@ import os, sys
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 import pandas as pd
+import pytest
 from InsideForest.cluster_selector import select_clusters
@@ -23,3 +24,18 @@ def test_fallback_cluster_assignment():
     assert clusters[1] == 99
     assert clusters_all[1] == [99]
     assert ponderadores_all[1] == [0.0]
+def test_missing_column_in_rule_raises_error():
+    df_datos = pd.DataFrame({'x': [0.5]})
+    cols = pd.MultiIndex.from_tuples([
+        ('linf', 'y'),
+        ('lsup', 'y'),
+        ('metrics', 'ponderador'),
+    ])
+    df_reglas = pd.DataFrame([[0.0, 1.0, 1.0]], columns=cols)
+    df_reglas['cluster'] = [1.0]
+    with pytest.raises(KeyError) as excinfo:
+        select_clusters(df_datos, df_reglas)
+    assert 'y' in str(excinfo.value)

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_descrip_helpers.py RENAMED Viewed

@@ -9,6 +9,7 @@ from InsideForest.descrip import (
     _scale_clusters,
     _compute_inflection_points,
     _merge_outputs,
+    _list_rules_to_text,
 )
@@ -87,3 +88,8 @@ def test_merge_outputs():
     }
     assert expected_cols.issubset(final_df.columns)
     assert "cluster_ponderador" not in final_df.columns
+def test_list_rules_to_text_empty_rule_set_returns_placeholder():
+    meta_df = pd.DataFrame()
+    assert _list_rules_to_text([], meta_df, lang="en") == "—"

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_fit_predict.py RENAMED Viewed

@@ -49,6 +49,22 @@ def test_fit_accepts_df_with_target_column():
     assert model.labels_.shape[0] == len(df)
+def test_fit_with_y_and_df_includes_target_column():
+    df = pd.DataFrame(
+        data={
+            'feat1': [0, 1, 2, 3],
+            'feat2': [3, 2, 1, 0],
+            'target': [0, 1, 0, 1],
+        }
+    )
+    y = df['target'].to_numpy()
+    model = InsideForestClassifier(rf_params={'n_estimators': 5, 'random_state': 0})
+    model.fit(X=df, y=y)
+    assert 'target' not in model.feature_names_
+    preds = model.predict(df[['feat1', 'feat2']])
+    assert preds.shape == (4,)
 def test_fit_df_missing_target_raises():
     df = pd.DataFrame(data={'feat1': [0, 1], 'feat2': [1, 0]})
     model = InsideForestClassifier()

insideforest-0.3.3/tests/test_metadata_run_experiments.py ADDED Viewed

@@ -0,0 +1,40 @@
+import pandas as pd
+import pytest
+from InsideForest.metadata import MetaExtractor, run_experiments
+def test_run_experiments_includes_intersection_stats():
+    # dataset with simple target
+    df = pd.DataFrame({
+        'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        'target': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+    })
+    # cluster descriptions with a shared rule on x
+    df2 = pd.DataFrame({
+        'cluster': [0, 1],
+        'cluster_descripcion': [
+            '0 <= x <= 5 AND 0 <= y <= 5',
+            '0 <= x <= 5 AND 5 <= y <= 10',
+        ],
+        'cluster_ef_sample': [0.2, 1.0],
+        'cluster_n_sample': [5, 1],
+    })
+    # minimal metadata for variables x and y
+    meta_df = pd.DataFrame({
+        'actionability.increase_difficulty': [1, 1],
+        'actionability.decrease_difficulty': [1, 1],
+    }, index=['x', 'y'])
+    mx = MetaExtractor(meta_df, var_obj='target')
+    result = run_experiments(mx, {'ds': df2}, data_dict={'ds': df})
+    assert 'intersection_n_sample' in result.columns
+    assert 'intersection_ef_sample' in result.columns
+    row = result.iloc[0]
+    assert row['intersection_n_sample'] == 5
+    assert row['intersection_ef_sample'] == pytest.approx(0.2)

insideforest-0.3.3/tests/test_models.py ADDED Viewed

@@ -0,0 +1,30 @@
+import pandas as pd
+from InsideForest.models import Models
+def test_get_knn_rows_success():
+    df = pd.DataFrame({'feature': [0, 1, 2, 3, 4, 5],
+                       'target': [0, 0, 0, 1, 1, 1]})
+    models = Models()
+    mis_df, rest_df = models.get_knn_rows(df, 'target', criterio_fp=False, min_obs=0)
+    assert not mis_df.empty
+    assert len(mis_df) + len(rest_df) == len(df)
+    assert rest_df.equals(df.drop(mis_df.index))
+def test_get_knn_rows_no_misclassification():
+    df = pd.DataFrame({'feature': [0, 1, 2, 3, 4, 5],
+                       'target': [0, 0, 0, 1, 1, 1]})
+    models = Models()
+    mis_df, rest_df = models.get_knn_rows(df, 'target', min_obs=10)
+    assert mis_df.empty
+    assert rest_df.equals(df)
+def test_get_knn_rows_training_error():
+    df = pd.DataFrame({'feature': ['a', 'b', 'c'],
+                       'target': [0, 1, 0]})
+    models = Models()
+    mis_df, rest_df = models.get_knn_rows(df, 'target')
+    assert mis_df.empty
+    assert rest_df.equals(df)

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/__init__.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/descrip.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/regions.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest/trees.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/InsideForest.egg-info/top_level.txt RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/LICENSE RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/experiments/__init__.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/experiments/benchmark.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/experiments/benchmark_get_rangos.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/experiments/rf_param_benchmark.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/experiments/summary_benchmark.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/setup.cfg RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_chimera_values_selector.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_descrip.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_eps_search_perf.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_params.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_inside_forest_regressor_fit_predict.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_iou_equivalence.py RENAMED Viewed

File without changes

{insideforest-0.3.2 → insideforest-0.3.3}/tests/test_trees.py RENAMED Viewed

File without changes

InsideForest 0.3.2__tar.gz → 0.3.3__tar.gz

InsideForest 0.3.2__tar.gz → 0.3.3__tar.gz