PyPI - freshdata-cleaner - Versions diffs - 0.1.0__py3-none-any.whl - Mend

freshdata-cleaner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

freshdata/__init__.py +39 -0
freshdata/_sentinels.py +50 -0
freshdata/_util.py +59 -0
freshdata/api.py +88 -0
freshdata/cleaner.py +127 -0
freshdata/config.py +133 -0
freshdata/profile.py +219 -0
freshdata/py.typed +0 -0
freshdata/report.py +135 -0
freshdata/steps/__init__.py +8 -0
freshdata/steps/columns.py +68 -0
freshdata/steps/dtypes.py +240 -0
freshdata/steps/duplicates.py +43 -0
freshdata/steps/memory.py +89 -0
freshdata/steps/missing.py +78 -0
freshdata/steps/outliers.py +72 -0
freshdata/steps/prune.py +58 -0
freshdata/steps/strings.py +90 -0
freshdata_cleaner-0.1.0.dist-info/METADATA +205 -0
freshdata_cleaner-0.1.0.dist-info/RECORD +22 -0
freshdata_cleaner-0.1.0.dist-info/WHEEL +4 -0
freshdata_cleaner-0.1.0.dist-info/licenses/LICENSE +21 -0

freshdata/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""freshdata — fast, safe, automatic data cleaning for real-world tabular data.
+>>> import freshdata as fd
+>>> cleaned = fd.clean(df)
+>>> cleaned, report = fd.clean(df, report=True)
+>>> print(fd.profile(df))
+
+Design principles
+-----------------
+- **No surprises.** Defaults only fix representation (whitespace, sentinel
+  strings, wrong dtypes, exact duplicates, empty rows/columns). Anything that
+  changes your data's statistics is opt-in.
+- **Everything is reported.** Each transformation is recorded with the column
+  and the number of affected cells.
+- **Never mutates input.** ``clean`` returns a new frame; profiling is
+  read-only.
+- **Fast by construction.** Vectorized pandas operations only, with
+  sample-based pre-screening so type inference stays cheap on large frames.
+"""
+from .api import clean, profile
+from .cleaner import Cleaner
+from .config import CleanConfig
+from .profile import ColumnProfile, Profile
+from .report import Action, CleanReport
+#: Version string of the freshdata package; also listed in ``__all__`` below.
+__version__ = "0.1.0"
+# Public API: the names bound by ``from freshdata import *``. Every entry
+# except ``__version__`` is imported from a submodule by the import lines
+# directly above.
+__all__ = [
+    "Action",
+    "CleanConfig",
+    "CleanReport",
+    "Cleaner",
+    "ColumnProfile",
+    "Profile",
+    "__version__",
+    "clean",
+    "profile",
+]

freshdata/_sentinels.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Registry of string values that conventionally mean "missing".
+All entries are stored casefolded; matching is case-insensitive and happens
+after whitespace stripping, so ``" N/A "`` and ``"n/a"`` both match.
+"""
+from __future__ import annotations
+#: Values commonly used in CSV / Excel / SQL exports to denote a missing cell.
+#: Deliberately conservative: entries here are near-certain to mean "missing"
+#: when they appear as the entire cell value. Domain words that merely *might*
+#: mean missing (e.g. ``"unknown"``) are excluded; pass them via the
+#: ``extra_sentinels`` option instead.
+#:
+#: Every entry is written in lower case (the module docstring states entries
+#: are stored casefolded), so new entries must follow the same form. The set
+#: is a ``frozenset`` and therefore cannot be mutated after import.
+DEFAULT_SENTINELS: frozenset[str] = frozenset(
+    {
+        # empty / placeholder punctuation
+        "",
+        "-",
+        "--",
+        "---",
+        "?",
+        "??",
+        # spelled-out missing markers
+        "na",
+        "n/a",
+        # escaped backslash: the three-character cell value n\a
+        "n\\a",
+        "n.a",
+        "n.a.",
+        "nan",
+        "null",
+        "none",
+        "nil",
+        "missing",
+        "(null)",
+        "(none)",
+        "(blank)",
+        "(empty)",
+        "(missing)",
+        # Excel error codes — never legitimate data
+        "#n/a",
+        "#n/a n/a",
+        "#na",
+        "#null!",
+        "#div/0!",
+        "#ref!",
+        "#value!",
+        "#name?",
+        "#num!",
+    }
+)

freshdata/_util.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Small shared helpers. Internal — no stability guarantees."""
+from __future__ import annotations
+import pandas as pd
+#: Major version of the installed pandas, for the few places behavior differs.
+#: Parsed from the text before the first "." of ``pd.__version__``.
+PANDAS_MAJOR: int = int(pd.__version__.split(".")[0])
+#: Above this many rows, object payloads are estimated from a sample instead
+#: of measured cell by cell, keeping report bookkeeping ~free on tall frames.
+_MEMORY_SAMPLE_THRESHOLD = 200_000
+#: Number of rows drawn from each text column by ``memory_bytes`` when a frame
+#: is taller than ``_MEMORY_SAMPLE_THRESHOLD``.
+_MEMORY_SAMPLE_SIZE = 20_000
+def memory_bytes(df: pd.DataFrame) -> int:
+    """Return the memory footprint of *df* in bytes, object payloads included.
+
+    Frames with at most ``_MEMORY_SAMPLE_THRESHOLD`` rows are measured exactly
+    with a deep ``memory_usage``. Taller frames start from the shallow total
+    and, for each object/string column, add a payload estimate extrapolated
+    from a seeded random sample of ``_MEMORY_SAMPLE_SIZE`` rows.
+    """
+    row_count = len(df)
+    if row_count > _MEMORY_SAMPLE_THRESHOLD:
+        estimate = int(df.memory_usage(deep=False).sum())
+        # Columns are addressed by position so duplicate labels stay unambiguous.
+        for position, column_dtype in enumerate(df.dtypes):
+            if _is_stringlike_dtype(column_dtype):
+                probe = df.iloc[:, position].sample(_MEMORY_SAMPLE_SIZE, random_state=0)
+                deep_size = probe.memory_usage(deep=True)
+                shallow_size = probe.memory_usage(deep=False)
+                # NOTE(review): Series.memory_usage counts the index by default,
+                # so for an object-dtype index this difference presumably also
+                # carries the index payload — confirm that is intended.
+                estimate += int((deep_size - shallow_size) / len(probe) * row_count)
+        return estimate
+    return int(df.memory_usage(deep=True).sum())
+def format_bytes(n: float) -> str:
+    """Format a byte count for display, e.g. ``format_bytes(2048) == '2.0 KB'``.
+
+    Values below 1024 in magnitude are shown as a truncated integer with the
+    ``B`` unit; larger values are divided by 1024 per step and shown with one
+    decimal in KB, MB, GB, or finally TB.
+    """
+    if abs(n) < 1024.0:
+        return f"{int(n)} B"
+    scaled = n / 1024.0
+    for suffix in ("KB", "MB", "GB"):
+        if abs(scaled) < 1024.0:
+            return f"{scaled:.1f} {suffix}"
+        scaled /= 1024.0
+    # Anything that is still >= 1024 GB is reported in terabytes.
+    return f"{scaled:.1f} TB"
+def sample_series(s: pd.Series, size: int, random_state: int) -> pd.Series:
+    """Return *s* unchanged when it has at most *size* rows, else a seeded sample.
+
+    The sample holds *size* rows drawn with ``Series.sample`` using
+    *random_state*, so repeated calls give the same rows.
+    """
+    needs_sampling = len(s) > size
+    return s.sample(size, random_state=random_state) if needs_sampling else s
+def stringlike_columns(df: pd.DataFrame) -> list:
+    """List the labels of columns whose dtype is object or a pandas string dtype."""
+    dtype_is_text = [_is_stringlike_dtype(column_dtype) for column_dtype in df.dtypes]
+    # Boolean-mask the column index so labels keep their original order.
+    text_labels = df.columns[dtype_is_text]
+    return list(text_labels)
+def _is_stringlike_dtype(dtype: object) -> bool:
+    """Tell whether *dtype* is a ``pd.StringDtype`` instance or an object dtype."""
+    if isinstance(dtype, pd.StringDtype):
+        return True
+    return pd.api.types.is_object_dtype(dtype)

freshdata/api.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""Top-level convenience functions: ``fd.clean(df)`` and ``fd.profile(df)``."""
+from __future__ import annotations
+import pandas as pd
+from .cleaner import Cleaner
+from .config import CleanConfig, merge_options
+from .profile import Profile, build_profile
+from .report import CleanReport
+def clean(
+    df: pd.DataFrame,
+    *,
+    report: bool = False,
+    config: CleanConfig | None = None,
+    **options: object,
+) -> pd.DataFrame | tuple[pd.DataFrame, CleanReport]:
+    """Return a cleaned copy of *df*, leaving the input untouched.
+
+    Only *representation* problems are repaired by default. Steps that would
+    alter the statistics of the data (imputation, outlier handling, lossy
+    downcasting) must be switched on explicitly.
+
+    Steps enabled by default, in execution order:
+
+    1.  ``column_names`` — snake_case column names, deduplicate collisions.
+    2.  ``strip_whitespace`` — trim surrounding whitespace in text cells.
+    3.  ``normalize_sentinels`` — turn "N/A", "null", "-", "" … into missing.
+    4.  ``drop_empty_columns`` / ``drop_empty_rows`` — remove all-missing ones.
+    5.  ``fix_dtypes`` — text that is really numeric / datetime / boolean gets
+        the right dtype (validated; ``numeric_threshold`` of values must parse).
+    6.  ``drop_duplicates`` — drop exact duplicate rows (keep first).
+
+    Steps that are off unless requested: ``drop_constant_columns``, ``impute``
+    ("auto", "mean", "median", "mode"), ``outliers`` ("clip" or "flag", method
+    "iqr"/"zscore"), ``optimize_memory`` (downcast numerics, categorize
+    low-cardinality text), ``reset_index``. :class:`freshdata.CleanConfig`
+    lists every option together with its default.
+
+    Parameters
+    ----------
+    df:
+        The DataFrame to clean.
+    report:
+        When True the result is ``(cleaned_df, CleanReport)``; the report
+        lists every action taken with affected counts.
+    config:
+        An existing :class:`~freshdata.CleanConfig` used as the starting point.
+    **options:
+        Keyword overrides, one per :class:`~freshdata.CleanConfig` field.
+        Unknown names raise :class:`TypeError` immediately.
+
+    Examples
+    --------
+    >>> import freshdata as fd
+    >>> cleaned = fd.clean(df)
+    >>> cleaned, rep = fd.clean(df, report=True)
+    >>> print(rep.summary())
+    >>> fd.clean(df, impute="median", outliers="clip", optimize_memory=True)
+    """
+    # A throwaway Cleaner resolves config + overrides and runs the pipeline.
+    pipeline = Cleaner(config=config, **options)
+    return pipeline.clean(df, report=report)
+def profile(
+    df: pd.DataFrame,
+    *,
+    config: CleanConfig | None = None,
+    **options: object,
+) -> Profile:
+    """Describe *df* without modifying it.
+
+    The returned :class:`~freshdata.Profile` covers shape, memory, missing
+    data, duplicates, and per-column issues. It also previews the dtype
+    conversions :func:`clean` would perform, computed by the same inference
+    code.
+
+    *config* and ``**options`` are combined the same way as in :func:`clean`.
+
+    Examples
+    --------
+    >>> import freshdata as fd
+    >>> p = fd.profile(df)
+    >>> print(p)             # human-readable issue table
+    >>> p.to_frame()         # one row per column, sortable in a notebook
+    >>> p.to_dict()          # JSON-friendly
+    """
+    effective_config = merge_options(config, **options)
+    return build_profile(df, effective_config)

freshdata/cleaner.py ADDED Viewed

@@ -0,0 +1,127 @@
+"""The cleaning pipeline and the reusable :class:`Cleaner` front-end."""
+from __future__ import annotations
+import dataclasses
+import time
+import pandas as pd
+from ._util import memory_bytes
+from .config import CleanConfig, merge_options
+from .report import CleanReport
+from .steps.columns import normalize_column_names
+from .steps.dtypes import fix_dtypes
+from .steps.duplicates import drop_duplicate_rows
+from .steps.memory import optimize_memory
+from .steps.missing import impute_missing
+from .steps.outliers import handle_outliers
+from .steps.prune import drop_constant_columns, drop_empty_columns, drop_empty_rows
+from .steps.strings import clean_strings
+def _validate_input(df: object, config: CleanConfig) -> pd.DataFrame:
+    """Check that *df* is a DataFrame the pipeline can process and return it.
+
+    Raises
+    ------
+    TypeError
+        If *df* is a Series or any other non-DataFrame object.
+    ValueError
+        If *df* has duplicate column labels while ``config.column_names`` is
+        off, since nothing would deduplicate them.
+    """
+    if isinstance(df, pd.Series):
+        raise TypeError(
+            "freshdata works on DataFrames; got a Series. "
+            "Convert it first with s.to_frame()."
+        )
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError(f"expected a pandas DataFrame, got {type(df).__name__}")
+    repeated = df.columns.duplicated()
+    # Duplicates are tolerated when the column-name step will rename them.
+    if not repeated.any() or config.column_names:
+        return df
+    dupes = sorted({str(label) for label in df.columns[repeated]})
+    raise ValueError(
+        f"DataFrame has duplicate column labels {dupes}, which makes "
+        "column-wise cleaning ambiguous. Rename them, or leave "
+        "column_names=True to deduplicate automatically."
+    )
+def run_pipeline(df: pd.DataFrame, config: CleanConfig) -> tuple[pd.DataFrame, CleanReport]:
+    """Run every enabled step, in a fixed and documented order.
+    The input frame is never mutated: the pipeline works on a shallow copy and
+    steps only rebind whole columns or build new frames, so the only extra
+    memory used is for the columns that actually change.
+    Returns ``(cleaned_frame, report)``; the report carries row, column and
+    memory figures from before and after, plus the elapsed time.
+    Raises whatever ``_validate_input`` raises for an unusable *df*.
+    """
+    df = _validate_input(df, config)
+    # Record the "before" figures ahead of any transformation.
+    report = CleanReport(
+        rows_before=len(df),
+        cols_before=df.shape[1],
+        memory_before=memory_bytes(df),
+    )
+    # The timer starts after the initial memory measurement, so that call is
+    # not part of duration_seconds.
+    started = time.perf_counter()
+    out = df.copy(deep=False)
+    if config.column_names:
+        out = normalize_column_names(out, report)
+    # Called unconditionally with the config; presumably it checks
+    # strip_whitespace / normalize_sentinels itself — verify in steps/strings.py.
+    out = clean_strings(out, config, report)
+    if config.drop_empty_columns:
+        out = drop_empty_columns(out, report)
+    if config.drop_empty_rows:
+        out = drop_empty_rows(out, report)
+    if config.fix_dtypes:
+        out = fix_dtypes(out, config, report)
+    if config.drop_constant_columns:
+        out = drop_constant_columns(out, config, report)
+    if config.drop_duplicates:
+        out = drop_duplicate_rows(out, config, report)
+    # These three are also called unconditionally; presumably each is a no-op
+    # when its option (impute / outliers / optimize_memory) is off — confirm
+    # in the step modules.
+    out = impute_missing(out, config, report)
+    out = handle_outliers(out, config, report)
+    out = optimize_memory(out, config, report)
+    if config.reset_index:
+        out = out.reset_index(drop=True)
+    # Record the "after" figures; the final memory measurement is included in
+    # the reported duration.
+    report.rows_after = len(out)
+    report.cols_after = out.shape[1]
+    report.memory_after = memory_bytes(out)
+    report.duration_seconds = time.perf_counter() - started
+    return out, report
+class Cleaner:
+    """A cleaning pipeline bound to one configuration, reusable across frames.
+
+    Handy when many frames share the same settings (e.g. every file in a
+    directory), or when the report should be read after the call::
+
+        cleaner = fd.Cleaner(impute="median", drop_constant_columns=True)
+        for path in paths:
+            cleaned = cleaner.clean(pd.read_csv(path))
+            print(cleaner.report_.summary())
+
+    Attributes
+    ----------
+    config:
+        The immutable :class:`~freshdata.CleanConfig` in effect.
+    report_:
+        The :class:`~freshdata.CleanReport` produced by the latest
+        :meth:`clean` call (``None`` until the first call).
+    """
+    def __init__(self, config: CleanConfig | None = None, **options: object) -> None:
+        # Keyword overrides are layered on top of *config* (or the defaults).
+        self.config: CleanConfig = merge_options(config, **options)
+        self.report_: CleanReport | None = None
+    def clean(
+        self, df: pd.DataFrame, *, report: bool = False
+    ) -> pd.DataFrame | tuple[pd.DataFrame, CleanReport]:
+        """Run the pipeline on *df* and return the cleaned frame.
+
+        Pass ``report=True`` to get ``(cleaned_df, CleanReport)`` instead.
+        Either way the report is stored on :attr:`report_`.
+        """
+        cleaned, latest = run_pipeline(df, self.config)
+        self.report_ = latest
+        if report:
+            return cleaned, latest
+        return cleaned
+    def __repr__(self) -> str:
+        # Show only the options that differ from a default CleanConfig.
+        baseline = CleanConfig()
+        parts = []
+        for field in dataclasses.fields(CleanConfig):
+            current = getattr(self.config, field.name)
+            if current != getattr(baseline, field.name):
+                parts.append(f"{field.name}={current!r}")
+        return "Cleaner(" + ", ".join(parts) + ")"

freshdata/config.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""Configuration for the cleaning pipeline.
+:class:`CleanConfig` is the single source of truth for every option accepted
+by :func:`freshdata.clean`, :func:`freshdata.profile`, and
+:class:`freshdata.Cleaner`. It is frozen (hashable, safely shareable) and
+validates itself on construction so that bad options fail loudly and early.
+"""
+from __future__ import annotations
+import dataclasses
+import difflib
+from dataclasses import dataclass
+#: Accepted values for ``CleanConfig.impute`` (``None`` turns imputation off).
+_IMPUTE_CHOICES = (None, "auto", "mean", "median", "mode")
+#: Accepted values for ``CleanConfig.outliers`` (``None`` turns handling off).
+_OUTLIER_CHOICES = (None, "clip", "flag")
+#: Accepted values for ``CleanConfig.outlier_method``.
+_OUTLIER_METHODS = ("iqr", "zscore")
+#: Default outlier factor per method: 1.5×IQR (Tukey) or 3.0 standard deviations.
+_DEFAULT_FACTOR = {"iqr": 1.5, "zscore": 3.0}
+@dataclass(frozen=True)
+class CleanConfig:
+    """Options controlling what :func:`freshdata.clean` does.
+
+    Defaults are conservative: steps that only repair representation
+    (whitespace, sentinel strings, wrong dtypes, exact duplicate rows,
+    structurally empty rows/columns) are on; steps that change the *statistics*
+    of the data (imputation, outlier handling, lossy downcasting) are opt-in.
+
+    The instance is frozen. Construction validates every option and raises
+    :class:`ValueError` for out-of-range or unknown choices and
+    :class:`TypeError` for non-string sentinels. ``extra_sentinels`` and
+    ``duplicate_subset`` accept any iterable of strings, or a single string,
+    and are stored as tuples.
+    """
+    #: Normalize column names to snake_case and deduplicate collisions.
+    column_names: bool = True
+    #: Drop rows where every cell is missing.
+    drop_empty_rows: bool = True
+    #: Drop columns where every cell is missing.
+    drop_empty_columns: bool = True
+    #: Drop columns with a single distinct value (off by default).
+    drop_constant_columns: bool = False
+    #: Trim leading/trailing whitespace in text cells.
+    strip_whitespace: bool = True
+    #: Replace sentinel strings ("N/A", "null", "-", …) with missing values.
+    normalize_sentinels: bool = True
+    #: Additional sentinel strings to treat as missing (case-insensitive).
+    #: Stored casefolded and stripped; a single string means one sentinel.
+    extra_sentinels: tuple[str, ...] = ()
+    #: Infer better dtypes for text columns (numeric, datetime, boolean).
+    fix_dtypes: bool = True
+    #: Fraction of non-missing values that must parse for a numeric conversion.
+    numeric_threshold: float = 0.95
+    #: Fraction of non-missing values that must parse for a datetime conversion.
+    datetime_threshold: float = 0.95
+    #: Drop exact duplicate rows (keeps the first occurrence).
+    drop_duplicates: bool = True
+    #: Restrict duplicate detection to these columns (post-rename names).
+    #: A single string means one column.
+    duplicate_subset: tuple[str, ...] | None = None
+    #: Missing-value imputation: None (off), "auto", "mean", "median", "mode".
+    impute: str | None = None
+    #: Outlier handling for numeric columns: None (off), "clip", "flag".
+    outliers: str | None = None
+    #: Outlier detection method: "iqr" or "zscore".
+    outlier_method: str = "iqr"
+    #: Detection factor; defaults to 1.5 for "iqr" and 3.0 for "zscore".
+    outlier_factor: float | None = None
+    #: Downcast numerics and convert low-cardinality text to category.
+    optimize_memory: bool = False
+    #: Max unique/total ratio for object→category conversion.
+    category_threshold: float = 0.5
+    #: Reset the index to 0..n-1 after cleaning (off: original labels kept).
+    reset_index: bool = False
+    #: Sample size used to cheaply pre-screen expensive type inference.
+    sample_size: int = 10_000
+    #: Seed for the (rare) sampling used during inference pre-screening.
+    random_state: int = 0
+    def __post_init__(self) -> None:
+        """Validate all options, then normalize the tuple-valued ones."""
+        if self.impute not in _IMPUTE_CHOICES:
+            raise ValueError(f"impute must be one of {_IMPUTE_CHOICES}, got {self.impute!r}")
+        if self.outliers not in _OUTLIER_CHOICES:
+            raise ValueError(f"outliers must be one of {_OUTLIER_CHOICES}, got {self.outliers!r}")
+        if self.outlier_method not in _OUTLIER_METHODS:
+            raise ValueError(
+                f"outlier_method must be one of {_OUTLIER_METHODS}, got {self.outlier_method!r}"
+            )
+        for name in ("numeric_threshold", "datetime_threshold", "category_threshold"):
+            value = getattr(self, name)
+            # Written as "not (in range)" so that NaN is rejected as well.
+            if not 0.0 < value <= 1.0:
+                raise ValueError(f"{name} must be in (0, 1], got {value!r}")
+        if self.outlier_factor is not None and not self.outlier_factor > 0:
+            raise ValueError(f"outlier_factor must be > 0, got {self.outlier_factor!r}")
+        if self.sample_size < 1:
+            raise ValueError(f"sample_size must be >= 1, got {self.sample_size!r}")
+        # A bare string is itself an iterable of one-character strings, so it
+        # would pass the type check and be split into letters. Treat it as a
+        # single sentinel. Materializing once also keeps one-shot iterators
+        # from being exhausted by the check before they are normalized.
+        raw_sentinels = self.extra_sentinels
+        if isinstance(raw_sentinels, str):
+            sentinels = (raw_sentinels,)
+        else:
+            sentinels = tuple(raw_sentinels)
+        if not all(isinstance(s, str) for s in sentinels):
+            raise TypeError("extra_sentinels must be strings")
+        # Normalize user-facing conveniences onto the frozen instance;
+        # object.__setattr__ is required because the dataclass is frozen.
+        object.__setattr__(
+            self, "extra_sentinels", tuple(s.casefold().strip() for s in sentinels)
+        )
+        if self.duplicate_subset is not None:
+            raw_subset = self.duplicate_subset
+            # Same pitfall as above: "id" must mean the column "id", not the
+            # columns "i" and "d".
+            subset = (raw_subset,) if isinstance(raw_subset, str) else tuple(raw_subset)
+            object.__setattr__(self, "duplicate_subset", subset)
+    @property
+    def resolved_outlier_factor(self) -> float:
+        """The outlier factor in effect, applying the per-method default."""
+        if self.outlier_factor is not None:
+            return self.outlier_factor
+        return _DEFAULT_FACTOR[self.outlier_method]
+#: Names of every ``CleanConfig`` field, i.e. the accepted option keywords.
+_FIELD_NAMES = frozenset({field.name for field in dataclasses.fields(CleanConfig)})
+def merge_options(base: CleanConfig | None, **options: object) -> CleanConfig:
+    """Return a config made of *base* (or the defaults) with *options* applied.
+
+    Option names that are not ``CleanConfig`` fields raise :class:`TypeError`
+    with a "did you mean" suggestion where a close match exists, so a typo
+    never silently falls back to a default. A *base* that is neither ``None``
+    nor a ``CleanConfig`` also raises :class:`TypeError`.
+    """
+    unrecognized = sorted(options.keys() - _FIELD_NAMES)
+    if unrecognized:
+        def _describe(name: str) -> str:
+            # Append the closest valid field name, when there is one.
+            close = difflib.get_close_matches(name, _FIELD_NAMES, n=1)
+            suggestion = f" (did you mean {close[0]!r}?)" if close else ""
+            return f"{name!r}{suggestion}"
+        described = ", ".join(_describe(name) for name in unrecognized)
+        valid = ", ".join(sorted(_FIELD_NAMES))
+        raise TypeError(f"unknown option(s): {described}. Valid options: {valid}")
+    if base is not None:
+        if not isinstance(base, CleanConfig):
+            raise TypeError(f"config must be a CleanConfig, got {type(base).__name__}")
+        return dataclasses.replace(base, **options)  # type: ignore[arg-type]
+    return CleanConfig(**options)  # type: ignore[arg-type]