PyPI - dataforge-ml - Versions diffs - 0.6.0__tar.gz → 0.8.0__tar.gz - Mend

dataforge-ml 0.6.0__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.6.0
+Version: 0.8.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "0.6.0"
+version = "0.8.0"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">=3.10"

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_profiler.py RENAMED Viewed

@@ -400,7 +400,13 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
                 phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
                 r_corr = r - (r - 1) ** 2 / (n - 1)
                 c_corr = c - (c - 1) ** 2 / (n - 1)
-                v = float(np.sqrt(phi2_corr / min(r_corr - 1, c_corr - 1)))
+                denom = min(r_corr - 1, c_corr - 1)
+                if denom <= 0:
+                    # Near-saturated contingency table (n_unique ≈ n_rows):
+                    # bias correction collapses denominator; skip the pair.
+                    pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                    continue
+                v = float(np.sqrt(phi2_corr / denom))
                 v = max(0.0, min(1.0, v))
             except Exception as exc:
                 warnings.warn(

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_profiler.py RENAMED Viewed

@@ -208,7 +208,9 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
         r = profile.effective_null_ratio
-        if r < _SEVERITY_MINOR and r != 0:
+        if r == 0.0:
+            profile.severity = None
+        elif r < _SEVERITY_MINOR:
             profile.severity = MissingSeverity.Minor
         elif r < _SEVERITY_MODERATE:
             profile.severity = MissingSeverity.Moderate

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_numeric_profiler.py RENAMED Viewed

@@ -254,16 +254,17 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
                 for i in range(top_rows)
             ]
         else:
-            # --- 20-Bin Histogram Distribution (Continuous) ---
+            # --- Histogram Distribution (Continuous) ---
             import numpy as np
             counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
+            n_clean = clean_f64.len()
             profile.histogram = [
                 HistogramBin(
                     lower_bound=float(bin_edges[i]),
                     upper_bound=float(bin_edges[i + 1]),
                     count=int(counts[i]),
-                    percentage=int(counts[i]) / n_rows if n_rows > 0 else 0.0,
+                    percentage=int(counts[i]) / n_clean if n_clean > 0 else 0.0,
                 )
                 for i in range(len(counts))
             ]

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_type_detector.py RENAMED Viewed

@@ -35,10 +35,11 @@ _IDENTIFIER_UNIQUE_RATIO = 0.99  # >99 % unique → identifier
 _IDENTIFIER_MAX_MEDIAN_LENGTH = 40
 _DISCRETE_NUNIQUE_THRESHOLD = 20  # numeric with <20 unique values → discrete
-_FREE_TEXT_AVG_WORDS: int = 5  # avg word count above which → Text
-_FREE_TEXT_MEDIAN_CHARS: int = 35
-_FREE_TEXT_P90_CHARS: int = 60
+_FREE_TEXT_AVG_WORDS: int = 3
+_FREE_TEXT_MEDIAN_CHARS: int = 20
+_FREE_TEXT_P90_CHARS: int = 35
 _FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40
+_FREE_TEXT_HIGH_UNIQUE_WITH_SPACES: float = 0.70  # unique ratio above which multi-token strings → Text
 # Common boolean string values (lowercased)
@@ -77,115 +78,87 @@ class TypeDetector:
                 original_dtype=original_dtype,
                 inferred_dtype=original_dtype,
             )
-            # Work with a copy that may be re-assigned after coercion
             working = series
             # 1 & 2: Coercion for string columns
-            if series.dtype == pl.Utf8 or series.dtype == pl.String:
+            if series.dtype in (pl.Utf8, pl.String):
                 coerced, flag = self._try_numeric_coerce(series, n_rows)
                 if coerced is not None:
                     info.inferred_dtype = str(coerced.dtype)
                     info.flags.append(flag)  # type: ignore[arg-type]
                     working = coerced
-                    self._check_coerced_encoded_category(working, info, n_rows)
+                    self._check_coerced_encoded_category(working, info)
                 else:
                     coerced_dt, flag_dt = self._try_datetime_coerce(
-                        series, col_name, n_rows
+                        series, n_rows
                     )
                     if coerced_dt is not None:
                         info.inferred_dtype = str(coerced_dt.dtype)
                         info.flags.append(flag_dt)  # type: ignore[arg-type]
-                        working = coerced_dt
                         info.semantic_type = SemanticType.Datetime
                         results[col_name] = info
                         continue
             # 3: Boolean candidate
             self._check_boolean_candidate(working, info)
-            # Work only on numeric-ish columns for the remaining checks
+            if TypeFlag.BooleanCandidate in info.flags:
+                info.semantic_type = SemanticType.Boolean
+                results[col_name] = info
+                continue
+            # Native datetime types
+            if working.dtype in (pl.Date, pl.Datetime, pl.Duration, pl.Time) or isinstance(
+                working.dtype, pl.Datetime
+            ):
+                info.semantic_type = SemanticType.Datetime
+                results[col_name] = info
+                continue
+            # 4–7: Numeric path
             if working.dtype in _NUMERIC_DTYPES:
-                # 4 & 5: Encoded category and identifier checks — integers only.
-                # Continuous floats have high cardinality by nature and are never
-                # identifiers; restricting these checks prevents false Identifier
-                # classification of genuine numeric features.
                 if working.dtype in _INT_DTYPES:
-                    self._check_encoded_category(working, info, n_rows)
-                    self._check_identifier(working, info, n_rows)
-                # 6: Sequential index (integers only)
-                if working.dtype in _INT_DTYPES or working.dtype in (
-                    pl.Float32,
-                    pl.Float64,
-                ):
-                    self._check_sequential_index(working, info, n_rows)
-                # 7: Numeric kind (skip for identifiers / sequential indices)
-                if not any(
-                    info.has_flag(f)
-                    for f in (
-                        TypeFlag.IdentifierColumn,
-                        TypeFlag.SequentialIndex,
-                        TypeFlag.FloatSequentialIndex,
-                    )
-                ):
+                    # EncodedCategory and IdentifierColumn are mutually exclusive:
+                    # low-cardinality and near-unique cannot both be true.
+                    # Check encoded category first; skip identifier if it matches.
+                    self._check_encoded_category(working, info)
+                    if TypeFlag.EncodedCategory not in info.flags:
+                        self._check_identifier(working, info, n_rows)
+                        if TypeFlag.IdentifierColumn in info.flags:
+                            self._check_sequential_index(working, info, n_rows)
+                if TypeFlag.EncodedCategory in info.flags:
+                    info.semantic_type = SemanticType.Categorical
+                elif TypeFlag.IdentifierColumn in info.flags:
+                    info.semantic_type = SemanticType.Identifier
+                else:
                     self._classify_numeric_kind(working, info)
+                    info.semantic_type = SemanticType.Numeric
-            elif working.dtype == pl.Utf8 or working.dtype == pl.String:
-                # String identifier check
-                self._check_identifier(working, info, n_rows)
+                results[col_name] = info
+                continue
+            # String path
+            if working.dtype in (pl.Utf8, pl.String):
                 self._check_free_text(working, info, n_rows)
-            info.semantic_type = self._derive_semantic_type(
-                info,
-                working,
-                n_rows,
-            )
+                if TypeFlag.FreeTextCandidate in info.flags:
+                    info.semantic_type = SemanticType.Text
+                    results[col_name] = info
+                    continue
+                self._check_identifier(working, info, n_rows)
+                info.semantic_type = (
+                    SemanticType.Identifier
+                    if TypeFlag.IdentifierColumn in info.flags
+                    else SemanticType.Categorical
+                )
+                results[col_name] = info
+                continue
+            # Fallback
+            info.semantic_type = SemanticType.Text
             results[col_name] = info
         return results
-    @staticmethod
-    def _derive_semantic_type(
-        info: ColumnTypeInfo,
-        working: pl.Series,
-        n_rows: int,
-    ) -> SemanticType:
-        if TypeFlag.IdentifierColumn in info.flags:
-            return SemanticType.Identifier
-        if TypeFlag.BooleanCandidate in info.flags:
-            return SemanticType.Boolean
-        is_native_datetime = working.dtype in (
-            pl.Date,
-            pl.Datetime,
-            pl.Duration,
-            pl.Time,
-        ) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
-        if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
-            return SemanticType.Datetime
-        if TypeFlag.EncodedCategory in info.flags:
-            return SemanticType.Categorical
-        if working.dtype in (pl.Utf8, pl.String):
-            if TypeFlag.FreeTextCandidate in info.flags:
-                return SemanticType.Text
-            return SemanticType.Categorical
-        if working.dtype in _NUMERIC_DTYPES:
-            return SemanticType.Numeric
-        return SemanticType.Categorical
     # ------------------------------------------------------------------
     # Step 1: Numeric coercion
     # ------------------------------------------------------------------
@@ -221,7 +194,7 @@ class TypeDetector:
     @staticmethod
     def _try_datetime_coerce(
-        series: pl.Series, col_name: str, n_rows: int
+        series: pl.Series, n_rows: int
     ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
         """
         Attempt datetime coercion if the column name looks date-like.
@@ -269,7 +242,7 @@ class TypeDetector:
     @staticmethod
     def _check_coerced_encoded_category(
-        series: pl.Series, info: ColumnTypeInfo, n_rows: int
+        series: pl.Series, info: ColumnTypeInfo
     ) -> None:
         """
         Post-coercion low-cardinality check for Float64 series that originated
@@ -312,9 +285,8 @@ class TypeDetector:
     @staticmethod
     def _check_encoded_category(
-        series: pl.Series, info: ColumnTypeInfo, n_rows: int
+        series: pl.Series, info: ColumnTypeInfo
     ) -> None:
-        # Skip if already flagged as boolean candidate (subset of {0,1})
         if TypeFlag.BooleanCandidate in info.flags:
             return
@@ -441,24 +413,27 @@ class TypeDetector:
         char_lengths = non_null.str.len_chars()
         median_chars = float(char_lengths.median() or 0.0)
+        space_counts = non_null.str.count_matches(r"\s+")
+        median_spaces = float(space_counts.median() or 0.0)
+        median_words = median_spaces + 1.0
+        unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
-        if median_chars > _FREE_TEXT_MEDIAN_CHARS:
+        # Multi-word strings of medium length: names, addresses, short descriptions
+        if median_chars > _FREE_TEXT_MEDIAN_CHARS and median_spaces >= 1.0:
             info.flags.append(TypeFlag.FreeTextCandidate)
             return
-        space_counts = non_null.str.count_matches(r"\s+")
-        median_words = float(space_counts.median() or 0.0) + 1.0
+        # Long average word count: sentences, paragraphs
         if median_words > _FREE_TEXT_AVG_WORDS:
             info.flags.append(TypeFlag.FreeTextCandidate)
             return
         p90_chars = float(char_lengths.quantile(0.9) or 0.0)
+        if p90_chars > _FREE_TEXT_P90_CHARS and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO:
+            info.flags.append(TypeFlag.FreeTextCandidate)
+            return
-        unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
-        if (
-            p90_chars > _FREE_TEXT_P90_CHARS
-            and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
-        ):
+        # High-cardinality multi-token strings that don't meet char thresholds:
+        # e.g. short full names like "John Smith", compound tokens
+        if unique_ratio >= _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES and median_spaces >= 1.0:
             info.flags.append(TypeFlag.FreeTextCandidate)

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/config.py RENAMED Viewed

@@ -71,6 +71,7 @@ class TypeFlag(StrEnum):
     SequentialIndex = "sequential_index"
     FloatSequentialIndex = "float_sequential_index"
     FreeTextCandidate = "free_text_candidate"
+    UserOverride = "user_override"
 # ---------------------------------------------------------------------------
@@ -240,6 +241,34 @@ class ProfileConfig:
     memory_threshold_mb: float = 500.0
     chunk_size: int = 100_000
+    def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
+        """
+        Explicitly set the semantic type for a column, overriding auto-detection.
+        The override is the sole source of truth for that column's type — the
+        type detector's verdict is ignored during profiling.  Calling this method
+        multiple times on the same column is valid; the last call wins.
+        Parameters
+        ----------
+        column : str
+            Name of the column to override.
+        semantic_type : str | SemanticType
+            Target semantic type.  Accepts a plain string (e.g. ``"numeric"``,
+            ``"categorical"``) or a ``SemanticType`` enum value.  Invalid strings
+            raise ``ValueError``.
+        """
+        if isinstance(semantic_type, str):
+            try:
+                semantic_type = SemanticType(semantic_type)
+            except ValueError:
+                valid = [e.value for e in SemanticType]
+                raise ValueError(
+                    f"Unknown semantic type {semantic_type!r}. "
+                    f"Valid values: {valid}"
+                )
+        self.column_overrides[column] = semantic_type
     def to_dict(self) -> dict:
         return {
             "modality": str(self.modality),

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/structural.py RENAMED Viewed

@@ -40,6 +40,7 @@ from .config import (
     StructuralProfileResult,
     RowMissingnessDistribution,
     SemanticType,
+    TypeFlag,
     Modality,
 )
@@ -130,7 +131,10 @@ class StructuralProfiler:
         # Overrides for excluded / non-existent columns are silently ignored.
         for col_name, override_type in self.config.column_overrides.items():
             if col_name in result.columns:
-                result.columns[col_name].semantic_type = override_type
+                cp = result.columns[col_name]
+                cp.semantic_type = override_type
+                if TypeFlag.UserOverride not in cp.type_flags:
+                    cp.type_flags.append(TypeFlag.UserOverride)
         # ── 6. Per-column profiling routed by SemanticType ───────────────
         # Batch all columns of the same SemanticType together and call each

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/utils/data_loader.py RENAMED Viewed

@@ -82,8 +82,6 @@ _EXT_LOADERS: dict[str, callable] = {
 class DataLoader:
-    def __init__(self, fmt: str | None = None) -> None:
-        self._fmt_override = fmt.lower() if fmt else None
     def load(
         self,
@@ -92,7 +90,7 @@ class DataLoader:
     ) -> pl.DataFrame:
         raw, ext_from_path = _read_raw(source)
-        resolved_fmt = (fmt or self._fmt_override or ext_from_path or "").lower()
+        resolved_fmt = (ext_from_path or "").lower()
         if resolved_fmt not in _EXT_LOADERS:
             label = resolved_fmt if resolved_fmt else "<unknown>"

{dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.6.0
+Version: 0.8.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License