PyPI - dataforge-ml - Versions diffs - 2.0.0__tar.gz → 2.0.1__tar.gz - Mend

dataforge-ml 2.0.0.tar.gz → 2.0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

{dataforge_ml-2.0.0/src/dataforge_ml.egg-info → dataforge_ml-2.0.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 2.0.0
+Version: 2.0.1
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "2.0.0"
+version = "2.0.1"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">3.10"

{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_categorical.py RENAMED Viewed

@@ -11,9 +11,9 @@ Per-column metrics (opt-in via ProfileConfig.categorical_columns):
   7. Free-text / natural-language flag
         (avg word count >5 OR avg char length >50 OR avg token count >10)
   8. Imbalance metrics
-        – class ratio  (max_freq / min_freq)
-        – Shannon entropy
-        – Gini impurity
+        – dominant class ratio (max_freq / second_max_freq)
+        – normalized Shannon entropy
+        – normalized Gini impurity
 Integration
 -----------
@@ -221,24 +221,36 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
         )
         # --- Imbalance metrics ---
-        # Class Ratio -> raw distribution
-        # Entropy -> randomness / information content
-        # Gini -> impurity / misclassification risk
+        # Dominant Class Ratio -> max_freq / second_max_freq
+        # Normalized Entropy -> scaled to [0, 1]
+        # Normalized Gini -> scaled to [0, 1]
         counts = vc["count"].cast(pl.Float64)
         total = float(counts.sum())
-        if total > 0:
+        card = len(counts)
+        if total > 0 and card >= 2:
             probs = counts / total
-            max_freq = float(probs.max())  # type: ignore[arg-type]
-            min_freq = float(probs.min())  # type: ignore[arg-type]
+            max_freq = float(probs[0])
+            second_max_freq = float(probs[1])
-            class_ratio = max_freq / min_freq if min_freq > 0 else float("inf")
+            dominant_class_ratio = max_freq / second_max_freq if second_max_freq > 0 else float("inf")
             entropy = float(-(probs * probs.log(base=2)).fill_nan(0.0).sum())
+            normalized_entropy = entropy / math.log2(card)
             gini = float(1.0 - (probs**2).sum())
+            normalized_gini = gini / (1.0 - 1.0 / card)
             profile.imbalance = ImbalanceMetrics(
-                class_ratio=class_ratio,
-                shannon_entropy=entropy,
-                gini_impurity=gini,
+                dominant_class_ratio=dominant_class_ratio,
+                normalized_shannon_entropy=normalized_entropy,
+                normalized_gini=normalized_gini,
+            )
+        else:
+            profile.imbalance = ImbalanceMetrics(
+                dominant_class_ratio=None,
+                normalized_shannon_entropy=None,
+                normalized_gini=None,
             )
         return vc

{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_categorical_config.py RENAMED Viewed

@@ -194,20 +194,20 @@ class ImbalanceMetrics:
     Attributes
     ----------
-    class_ratio : float
-        Ratio of the least-frequent to most-frequent class, in [0, 1].
-        A value of 1.0 indicates perfect balance.
-    shannon_entropy : float
-        Shannon entropy of the class distribution (nats).  Higher values
-        indicate more uniform distributions.
-    gini_impurity : float
-        Gini impurity of the class distribution, in [0, 1].  Zero means
-        a single class dominates entirely.
+    dominant_class_ratio : float, optional
+        Ratio of the most-frequent to second-most-frequent class frequency.
+        None when cardinality < 2.
+    normalized_shannon_entropy : float, optional
+        Shannon entropy of the class distribution scaled to [0, 1] by dividing
+        by log2(cardinality). None when cardinality < 2.
+    normalized_gini : float, optional
+        Gini impurity of the class distribution scaled to [0, 1] by dividing
+        by (1 - 1/cardinality). None when cardinality < 2.
     """
-    class_ratio: float = 0.0
-    shannon_entropy: float = 0.0
-    gini_impurity: float = 0.0
+    dominant_class_ratio: float | None = None
+    normalized_shannon_entropy: float | None = None
+    normalized_gini: float | None = None
     def to_dict(self) -> dict:
         """
@@ -216,12 +216,12 @@ class ImbalanceMetrics:
         Returns
         -------
         dict
-            Keys: ``class_ratio``, ``shannon_entropy``, ``gini_impurity``.
+            Keys: ``dominant_class_ratio``, ``normalized_shannon_entropy``, ``normalized_gini``.
         """
         return {
-            "class_ratio": self.class_ratio,
-            "shannon_entropy": self.shannon_entropy,
-            "gini_impurity": self.gini_impurity,
+            "dominant_class_ratio": self.dominant_class_ratio,
+            "normalized_shannon_entropy": self.normalized_shannon_entropy,
+            "normalized_gini": self.normalized_gini,
         }

{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_config.py RENAMED Viewed

@@ -236,6 +236,79 @@ class StructuralProfileResult:
         """
         return json.dumps(self.to_dict(), indent=indent, default=str)
+    def to_markdown(self) -> str:
+        """
+        Produce a complete, lossless Markdown representation of the profiling result.
+        Returns
+        -------
+        str
+            Markdown string containing a summary table followed by per-column detail
+            sections and dataset-level statistics.
+        """
+        data = self.to_dict()
+        lines = ["# Structural Profile Report\n"]
+        # 1. Summary navigation table
+        lines.append("## Summary\n")
+        lines.append("| Column | Semantic Type | Missing % | Severity | Key Flags |")
+        lines.append("|---|---|---|---|---|")
+        for col_name, col_data in data.get("columns", {}).items():
+            sem_type = col_data.get("semantic_type") or "None"
+            missingness = col_data.get("missingness") or {}
+            missing_pct = missingness.get("effective_null_ratio", 0.0) * 100
+            missing_str = f"{missing_pct:.2f}%"
+            severity = missingness.get("severity") or "None"
+            flags = ", ".join(col_data.get("type_flags", [])) or "None"
+            lines.append(f"| {col_name} | {sem_type} | {missing_str} | {severity} | {flags} |")
+        lines.append("\n## Column Details\n")
+        def _format_dict(d: dict, indent: int = 0) -> list[str]:
+            out = []
+            prefix = "  " * indent
+            for k, v in d.items():
+                if isinstance(v, dict):
+                    if not v:
+                        out.append(f"{prefix}- **{k}**: (empty)")
+                    else:
+                        out.append(f"{prefix}- **{k}**:")
+                        out.extend(_format_dict(v, indent + 1))
+                elif isinstance(v, list):
+                    if not v:
+                        out.append(f"{prefix}- **{k}**: (empty)")
+                    else:
+                        out.append(f"{prefix}- **{k}**:")
+                        for item in v:
+                            out.append(f"{prefix}  - {item}")
+                else:
+                    out.append(f"{prefix}- **{k}**: {v}")
+            return out
+        for col_name, col_data in data.get("columns", {}).items():
+            lines.append(f"### `{col_name}`\n")
+            lines.extend(_format_dict(col_data))
+            lines.append("")
+        lines.append("## Dataset\n")
+        lines.extend(_format_dict(data.get("dataset", {})))
+        lines.append("")
+        lines.append("## Targets\n")
+        lines.extend(_format_dict(data.get("targets", {})))
+        lines.append("")
+        lines.append("## Numeric Sentinels\n")
+        lines.extend(_format_dict(data.get("numeric_sentinels", {})))
+        lines.append("")
+        lines.append("## String Sentinels\n")
+        lines.extend(_format_dict(data.get("string_sentinels", {})))
+        lines.append("")
+        return "\n".join(lines).strip() + "\n"
 # ---------------------------------------------------------------------------
 # ProfileConfig — clean break from per-profiler column lists

{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_numeric_profiler.py RENAMED Viewed

@@ -74,6 +74,22 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         data: pl.DataFrame,
         columns: list[str],
     ) -> NumericProfileResult:
+        """
+        Profile the specified numeric columns in a DataFrame.
+        Parameters
+        ----------
+        data : pl.DataFrame
+            The input Polars DataFrame containing the columns to profile.
+        columns : list[str]
+            A list of column names to profile. Non-numeric columns in this list
+            are skipped.
+        Returns
+        -------
+        NumericProfileResult
+            A result object containing distribution statistics for the profiled columns.
+        """
         return self._run(data, columns)
     # ------------------------------------------------------------------
@@ -128,7 +144,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
             profile.mean = mean
             profile.median = median
             if median == 0.0:
-                profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
+                profile.mean_median_ratio = None if mean != 0.0 else 1.0
             else:
                 profile.mean_median_ratio = mean / median

{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_target_profiler.py RENAMED Viewed

@@ -137,11 +137,12 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
         result.categorical_profile = cat_profile
         # Flag Imbalances
-        ratio = cat_profile.imbalance.class_ratio
-        if ratio > 20.0:
-            result.flags.append(TargetFlag.SevereImbalance)
-        elif ratio > 5.0:
-            result.flags.append(TargetFlag.HighImbalance)
+        ratio = cat_profile.imbalance.dominant_class_ratio
+        if ratio is not None:
+            if ratio > 20.0:
+                result.flags.append(TargetFlag.SevereImbalance)
+            elif ratio > 5.0:
+                result.flags.append(TargetFlag.HighImbalance)
     def _profile_regression(
         self, series: pl.Series, n_rows: int, result: TargetProfileResult

{dataforge_ml-2.0.0 → dataforge_ml-2.0.1/src/dataforge_ml.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 2.0.0
+Version: 2.0.1
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License