freshdata-cleaner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
freshdata/__init__.py ADDED
@@ -0,0 +1,39 @@
1
+ """freshdata — fast, safe, automatic data cleaning for real-world tabular data.
2
+
3
+ >>> import freshdata as fd
4
+ >>> cleaned = fd.clean(df)
5
+ >>> cleaned, report = fd.clean(df, report=True)
6
+ >>> print(fd.profile(df))
7
+
8
+ Design principles
9
+ -----------------
10
+ - **No surprises.** Defaults only fix representation (whitespace, sentinel
11
+ strings, wrong dtypes, exact duplicates, empty rows/columns). Anything that
12
+ changes your data's statistics is opt-in.
13
+ - **Everything is reported.** Each transformation is recorded with the column
14
+ and the number of affected cells.
15
+ - **Never mutates input.** ``clean`` returns a new frame; profiling is
16
+ read-only.
17
+ - **Fast by construction.** Vectorized pandas operations only, with
18
+ sample-based pre-screening so type inference stays cheap on large frames.
19
+ """
20
+
21
+ from .api import clean, profile
22
+ from .cleaner import Cleaner
23
+ from .config import CleanConfig
24
+ from .profile import ColumnProfile, Profile
25
+ from .report import Action, CleanReport
26
+
27
# Version string of this package release.
__version__ = "0.1.0"

# Public names of the package, listed in ASCII sort order.
__all__ = [
    "Action", "CleanConfig", "CleanReport",
    "Cleaner", "ColumnProfile", "Profile",
    "__version__", "clean", "profile",
]
@@ -0,0 +1,50 @@
1
+ """Registry of string values that conventionally mean "missing".
2
+
3
+ All entries are stored casefolded; matching is case-insensitive and happens
4
+ after whitespace stripping, so ``" N/A "`` and ``"n/a"`` both match.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
#: Cell values that CSV / Excel / SQL exports commonly use to mean "missing".
#: The set is intentionally conservative: an entry belongs here only if it is
#: near-certain to mean "missing" when it is the *entire* cell value. Domain
#: words that only *might* mean missing (e.g. ``"unknown"``) are left out;
#: supply those through the ``extra_sentinels`` option instead.
DEFAULT_SENTINELS: frozenset[str] = frozenset(
    (
        # empty cells and placeholder punctuation
        "", "-", "--", "---", "?", "??",
        # spelled-out markers for a missing value
        "na", "n/a", "n\\a", "n.a", "n.a.",
        "nan", "null", "none", "nil", "missing",
        "(null)", "(none)", "(blank)", "(empty)", "(missing)",
        # Excel error codes, which are never legitimate data
        "#n/a", "#n/a n/a", "#na", "#null!", "#div/0!",
        "#ref!", "#value!", "#name?", "#num!",
    )
)
freshdata/_util.py ADDED
@@ -0,0 +1,59 @@
1
+ """Small shared helpers. Internal — no stability guarantees."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
#: Leading (major) component of the installed pandas version, as an int, for
#: the few places where behavior differs between major versions.
PANDAS_MAJOR: int = int(pd.__version__.partition(".")[0])
9
+
10
+
11
#: Above this many rows, object payloads are estimated from a sample instead
#: of measured cell by cell, keeping report bookkeeping ~free on tall frames.
_MEMORY_SAMPLE_THRESHOLD = 200_000
_MEMORY_SAMPLE_SIZE = 20_000


def _estimate_payload(values: pd.Series, n: int) -> int:
    """Estimate the deep-minus-shallow byte payload of *values*, scaled to *n* rows.

    A reproducible random sample (fixed seed 0) of at most
    ``_MEMORY_SAMPLE_SIZE`` cells is measured, and the average per-cell
    payload is multiplied by *n*.
    """
    sample = values.sample(min(_MEMORY_SAMPLE_SIZE, len(values)), random_state=0)
    if len(sample) == 0:
        return 0
    # index=False is essential: with the default index=True, an object-dtype
    # index would leak its own deep payload into the difference and have it
    # attributed to (and scaled with) this column.
    deep = sample.memory_usage(index=False, deep=True)
    shallow = sample.memory_usage(index=False, deep=False)
    return int((deep - shallow) / len(sample) * n)


def memory_bytes(df: pd.DataFrame) -> int:
    """Total memory footprint of *df* in bytes, including object payloads.

    Frames with at most ``_MEMORY_SAMPLE_THRESHOLD`` rows are measured exactly
    with ``memory_usage(deep=True)``. For taller frames the shallow size is
    exact, and the per-row payload of every object/string column is estimated
    from a ``_MEMORY_SAMPLE_SIZE``-row random sample. The payload of a flat
    object/string index is estimated the same way and counted exactly once.
    The payload of ``MultiIndex`` levels is not estimated.

    Parameters
    ----------
    df:
        The frame to measure. It is only read, never modified.

    Returns
    -------
    int
        Footprint in bytes (an estimate for frames above the threshold).
    """
    n = len(df)
    if n <= _MEMORY_SAMPLE_THRESHOLD:
        return int(df.memory_usage(deep=True).sum())

    # Shallow usage covers fixed-width buffers, pointer arrays and the index.
    total = int(df.memory_usage(deep=False).sum())
    # Positional access (iloc) keeps this well-defined with duplicate labels.
    for i, dtype in enumerate(df.dtypes):
        if _is_stringlike_dtype(dtype):
            total += _estimate_payload(df.iloc[:, i], n)

    index = df.index
    if not isinstance(index, pd.MultiIndex) and _is_stringlike_dtype(index.dtype):
        # Re-wrap the labels as values on a cheap RangeIndex so that only the
        # labels' own payload is measured.
        total += _estimate_payload(index.to_series(index=pd.RangeIndex(n)), n)
    return total
35
+
36
+
37
def format_bytes(n: float) -> str:
    """Format a byte count for display, e.g. ``format_bytes(2048) == '2.0 KB'``.

    Magnitudes below 1024 are shown as a whole number of bytes (truncated via
    ``int``). Larger values are divided by 1024 per unit step and shown with
    one decimal place in KB, MB, GB, or TB. TB is the last unit, so very large
    values stay in TB.
    """
    if abs(n) < 1024.0:
        return f"{int(n)} B"
    value = n / 1024.0
    for suffix in ("KB", "MB", "GB"):
        if abs(value) < 1024.0:
            return f"{value:.1f} {suffix}"
        value /= 1024.0
    return f"{value:.1f} TB"
44
+
45
+
46
def sample_series(s: pd.Series, size: int, random_state: int) -> pd.Series:
    """Return *s* unchanged when it has at most *size* rows, else a seeded sample.

    The sample has exactly *size* rows and is reproducible for a given
    *random_state*. When no sampling is needed, the very same object is
    returned (no copy).
    """
    needs_sampling = len(s) > size
    return s.sample(size, random_state=random_state) if needs_sampling else s
51
+
52
+
53
def stringlike_columns(df: pd.DataFrame) -> list:
    """List the labels of columns whose dtype is object or a pandas string dtype.

    Labels are returned in column order. Labels and dtypes are paired by
    position, so repeated labels appear once per matching column.
    """
    return [
        label
        for label, dtype in zip(df.columns, df.dtypes)
        if _is_stringlike_dtype(dtype)
    ]
56
+
57
+
58
def _is_stringlike_dtype(dtype: object) -> bool:
    """Tell whether *dtype* is the object dtype or a ``pd.StringDtype`` instance."""
    if isinstance(dtype, pd.StringDtype):
        return True
    return pd.api.types.is_object_dtype(dtype)
freshdata/api.py ADDED
@@ -0,0 +1,88 @@
1
+ """Top-level convenience functions: ``fd.clean(df)`` and ``fd.profile(df)``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ from .cleaner import Cleaner
8
+ from .config import CleanConfig, merge_options
9
+ from .profile import Profile, build_profile
10
+ from .report import CleanReport
11
+
12
+
13
def clean(
    df: pd.DataFrame,
    *,
    report: bool = False,
    config: CleanConfig | None = None,
    **options: object,
) -> pd.DataFrame | tuple[pd.DataFrame, CleanReport]:
    """Return a cleaned copy of *df*; the input frame is left untouched.

    This is a one-shot convenience wrapper: it builds a :class:`Cleaner` from
    *config* plus any keyword overrides and runs it once on *df*.

    Out of the box only *representation* problems are repaired. Steps that
    would alter the statistics of the data (imputation, outlier handling,
    lossy downcasting) must be switched on explicitly.

    Steps enabled by default, in execution order:

    1. ``column_names`` — snake_case the column names and deduplicate
       collisions.
    2. ``strip_whitespace`` — trim surrounding whitespace in text cells.
    3. ``normalize_sentinels`` — map "N/A", "null", "-", "" … to missing.
    4. ``drop_empty_columns`` / ``drop_empty_rows`` — drop all-missing ones.
    5. ``fix_dtypes`` — give text that is really numeric / datetime / boolean
       its proper dtype (validated; ``numeric_threshold`` of values must
       parse).
    6. ``drop_duplicates`` — remove exact duplicate rows, keeping the first.

    Steps that are off unless requested: ``drop_constant_columns``,
    ``impute`` ("auto", "mean", "median", "mode"), ``outliers`` ("clip" or
    "flag", with method "iqr"/"zscore"), ``optimize_memory`` (downcast
    numerics, categorize low-cardinality text) and ``reset_index``.
    :class:`freshdata.CleanConfig` documents every option and its default.

    Parameters
    ----------
    df:
        The DataFrame to clean.
    report:
        When True, the return value is ``(cleaned_df, CleanReport)``; the
        report lists every action taken together with affected counts.
    config:
        An existing :class:`~freshdata.CleanConfig` to use as the baseline.
    **options:
        Keyword overrides, one per :class:`~freshdata.CleanConfig` field.
        An unknown name raises :class:`TypeError` straight away.

    Examples
    --------
    >>> import freshdata as fd
    >>> cleaned = fd.clean(df)
    >>> cleaned, rep = fd.clean(df, report=True)
    >>> print(rep.summary())

    >>> fd.clean(df, impute="median", outliers="clip", optimize_memory=True)
    """
    cleaner = Cleaner(config=config, **options)
    return cleaner.clean(df, report=report)
65
+
66
+
67
def profile(
    df: pd.DataFrame,
    *,
    config: CleanConfig | None = None,
    **options: object,
) -> Profile:
    """Describe *df* without modifying it.

    The effective configuration is *config* (or the defaults) with the
    keyword overrides applied, resolved the same way :func:`clean` resolves
    its options. The resulting :class:`~freshdata.Profile` covers shape,
    memory, missing data, duplicates, and per-column issues. It also previews
    the dtype conversions :func:`clean` would make, computed by the same
    inference code.

    Examples
    --------
    >>> import freshdata as fd
    >>> p = fd.profile(df)
    >>> print(p)              # human-readable issue table
    >>> p.to_frame()          # one row per column, sortable in a notebook
    >>> p.to_dict()           # JSON-friendly
    """
    effective = merge_options(config, **options)
    return build_profile(df, effective)
freshdata/cleaner.py ADDED
@@ -0,0 +1,127 @@
1
+ """The cleaning pipeline and the reusable :class:`Cleaner` front-end."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import dataclasses
6
+ import time
7
+
8
+ import pandas as pd
9
+
10
+ from ._util import memory_bytes
11
+ from .config import CleanConfig, merge_options
12
+ from .report import CleanReport
13
+ from .steps.columns import normalize_column_names
14
+ from .steps.dtypes import fix_dtypes
15
+ from .steps.duplicates import drop_duplicate_rows
16
+ from .steps.memory import optimize_memory
17
+ from .steps.missing import impute_missing
18
+ from .steps.outliers import handle_outliers
19
+ from .steps.prune import drop_constant_columns, drop_empty_columns, drop_empty_rows
20
+ from .steps.strings import clean_strings
21
+
22
+
23
def _validate_input(df: object, config: CleanConfig) -> pd.DataFrame:
    """Check that *df* is a DataFrame the pipeline can process and return it.

    Raises
    ------
    TypeError
        If *df* is a Series (with a hint to call ``to_frame()``) or any other
        non-DataFrame object.
    ValueError
        If column labels repeat while ``config.column_names`` is off, since
        nothing would deduplicate them.
    """
    if isinstance(df, pd.Series):
        message = (
            "freshdata works on DataFrames; got a Series. "
            "Convert it first with s.to_frame()."
        )
        raise TypeError(message)
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"expected a pandas DataFrame, got {type(df).__name__}")

    repeated = df.columns.duplicated()
    if repeated.any() and not config.column_names:
        # Report each offending label once, as text, in a stable order.
        dupes = sorted({str(label) for label in df.columns[repeated]})
        raise ValueError(
            f"DataFrame has duplicate column labels {dupes}, which makes "
            "column-wise cleaning ambiguous. Rename them, or leave "
            "column_names=True to deduplicate automatically."
        )
    return df
39
+
40
+
41
def run_pipeline(df: pd.DataFrame, config: CleanConfig) -> tuple[pd.DataFrame, CleanReport]:
    """Apply every enabled cleaning step to *df* in a fixed order.

    The caller's frame is validated and then shallow-copied
    (``copy(deep=False)``); all steps run on that copy. The steps are
    expected to rebind whole columns or build new frames rather than write
    into shared buffers, which is what keeps the input unmodified.

    Returns
    -------
    tuple
        ``(cleaned_frame, report)``. The report holds row/column counts and
        memory before and after, plus the elapsed time measured from just
        after the "before" snapshot to just after the "after" snapshot.
    """
    df = _validate_input(df, config)
    report = CleanReport(
        rows_before=len(df),
        cols_before=df.shape[1],
        memory_before=memory_bytes(df),
    )
    t0 = time.perf_counter()

    frame = df.copy(deep=False)

    # Representation repairs. String cleaning is always invoked and receives
    # the config (presumably it checks its own switches internally).
    if config.column_names:
        frame = normalize_column_names(frame, report)
    frame = clean_strings(frame, config, report)

    # Structural pruning and type repair.
    if config.drop_empty_columns:
        frame = drop_empty_columns(frame, report)
    if config.drop_empty_rows:
        frame = drop_empty_rows(frame, report)
    if config.fix_dtypes:
        frame = fix_dtypes(frame, config, report)
    if config.drop_constant_columns:
        frame = drop_constant_columns(frame, config, report)
    if config.drop_duplicates:
        frame = drop_duplicate_rows(frame, config, report)

    # These three are called unconditionally and are handed the config.
    frame = impute_missing(frame, config, report)
    frame = handle_outliers(frame, config, report)
    frame = optimize_memory(frame, config, report)

    if config.reset_index:
        frame = frame.reset_index(drop=True)

    # "After" snapshot; the memory measurement is included in the duration.
    report.rows_after = len(frame)
    report.cols_after = frame.shape[1]
    report.memory_after = memory_bytes(frame)
    report.duration_seconds = time.perf_counter() - t0
    return frame, report
81
+
82
+
83
class Cleaner:
    """A cleaning pipeline bound to one fixed configuration.

    Use it to apply identical settings to many frames (for instance every
    file in a directory), or to fetch the report after cleaning::

        cleaner = fd.Cleaner(impute="median", drop_constant_columns=True)
        for path in paths:
            cleaned = cleaner.clean(pd.read_csv(path))
            print(cleaner.report_.summary())

    Attributes
    ----------
    config:
        The immutable :class:`~freshdata.CleanConfig` in effect.
    report_:
        The :class:`~freshdata.CleanReport` produced by the latest
        :meth:`clean` call; ``None`` until :meth:`clean` has run once.
    """

    def __init__(self, config: CleanConfig | None = None, **options: object) -> None:
        # Resolve the base config and keyword overrides into one config.
        self.config: CleanConfig = merge_options(config, **options)
        self.report_: CleanReport | None = None

    def clean(
        self, df: pd.DataFrame, *, report: bool = False
    ) -> pd.DataFrame | tuple[pd.DataFrame, CleanReport]:
        """Run the pipeline on *df* and return the cleaned frame.

        The input is never mutated. Pass ``report=True`` to get
        ``(cleaned_df, CleanReport)`` back instead of just the frame. The
        report of the latest run is stored on :attr:`report_` either way.
        """
        cleaned, rep = run_pipeline(df, self.config)
        self.report_ = rep
        if report:
            return cleaned, rep
        return cleaned

    def __repr__(self) -> str:
        # Show only the options that differ from a default CleanConfig.
        baseline = CleanConfig()
        parts = []
        for field in dataclasses.fields(CleanConfig):
            current = getattr(self.config, field.name)
            if current != getattr(baseline, field.name):
                parts.append(f"{field.name}={current!r}")
        return f"Cleaner({', '.join(parts)})"
freshdata/config.py ADDED
@@ -0,0 +1,133 @@
1
+ """Configuration for the cleaning pipeline.
2
+
3
+ :class:`CleanConfig` is the single source of truth for every option accepted
4
+ by :func:`freshdata.clean`, :func:`freshdata.profile`, and
5
+ :class:`freshdata.Cleaner`. It is frozen (hashable, safely shareable) and
6
+ validates itself on construction so that bad options fail loudly and early.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import dataclasses
12
+ import difflib
13
+ from dataclasses import dataclass
14
+
15
_IMPUTE_CHOICES = (None, "auto", "mean", "median", "mode")
_OUTLIER_CHOICES = (None, "clip", "flag")
_OUTLIER_METHODS = ("iqr", "zscore")

#: Default outlier factor per method: 1.5×IQR (Tukey) or 3.0 standard deviations.
_DEFAULT_FACTOR = {"iqr": 1.5, "zscore": 3.0}


@dataclass(frozen=True)
class CleanConfig:
    """Options controlling what :func:`freshdata.clean` does.

    Defaults are conservative: steps that only repair representation
    (whitespace, sentinel strings, wrong dtypes, exact duplicate rows,
    structurally empty rows/columns) are on; steps that change the *statistics*
    of the data (imputation, outlier handling, lossy downcasting) are opt-in.

    The instance is frozen and validates itself in ``__post_init__``.
    ``extra_sentinels`` and ``duplicate_subset`` accept any iterable of
    strings and are stored as tuples. A bare string is treated as a single
    entry, not as a sequence of characters.

    Raises
    ------
    ValueError
        If a choice option (``impute``, ``outliers``, ``outlier_method``) has
        an unsupported value, a threshold lies outside ``(0, 1]``,
        ``outlier_factor`` is not positive, or ``sample_size`` is below 1.
    TypeError
        If ``extra_sentinels`` contains a non-string entry.
    """

    #: Normalize column names to snake_case and deduplicate collisions.
    column_names: bool = True
    #: Drop rows where every cell is missing.
    drop_empty_rows: bool = True
    #: Drop columns where every cell is missing.
    drop_empty_columns: bool = True
    #: Drop columns with a single distinct value (off by default).
    drop_constant_columns: bool = False
    #: Trim leading/trailing whitespace in text cells.
    strip_whitespace: bool = True
    #: Replace sentinel strings ("N/A", "null", "-", …) with missing values.
    normalize_sentinels: bool = True
    #: Additional sentinel strings to treat as missing (case-insensitive).
    extra_sentinels: tuple[str, ...] = ()
    #: Infer better dtypes for text columns (numeric, datetime, boolean).
    fix_dtypes: bool = True
    #: Fraction of non-missing values that must parse for a numeric conversion.
    numeric_threshold: float = 0.95
    #: Fraction of non-missing values that must parse for a datetime conversion.
    datetime_threshold: float = 0.95
    #: Drop exact duplicate rows (keeps the first occurrence).
    drop_duplicates: bool = True
    #: Restrict duplicate detection to these columns (post-rename names).
    duplicate_subset: tuple[str, ...] | None = None
    #: Missing-value imputation: None (off), "auto", "mean", "median", "mode".
    impute: str | None = None
    #: Outlier handling for numeric columns: None (off), "clip", "flag".
    outliers: str | None = None
    #: Outlier detection method: "iqr" or "zscore".
    outlier_method: str = "iqr"
    #: Detection factor; defaults to 1.5 for "iqr" and 3.0 for "zscore".
    outlier_factor: float | None = None
    #: Downcast numerics and convert low-cardinality text to category.
    optimize_memory: bool = False
    #: Max unique/total ratio for object→category conversion.
    category_threshold: float = 0.5
    #: Reset the index to 0..n-1 after cleaning (off: original labels kept).
    reset_index: bool = False
    #: Sample size used to cheaply pre-screen expensive type inference.
    sample_size: int = 10_000
    #: Seed for the (rare) sampling used during inference pre-screening.
    random_state: int = 0

    def __post_init__(self) -> None:
        if self.impute not in _IMPUTE_CHOICES:
            raise ValueError(f"impute must be one of {_IMPUTE_CHOICES}, got {self.impute!r}")
        if self.outliers not in _OUTLIER_CHOICES:
            raise ValueError(f"outliers must be one of {_OUTLIER_CHOICES}, got {self.outliers!r}")
        if self.outlier_method not in _OUTLIER_METHODS:
            raise ValueError(
                f"outlier_method must be one of {_OUTLIER_METHODS}, got {self.outlier_method!r}"
            )
        for name in ("numeric_threshold", "datetime_threshold", "category_threshold"):
            value = getattr(self, name)
            # Written as "not (0 < v <= 1)" so that NaN is rejected as well.
            if not 0.0 < value <= 1.0:
                raise ValueError(f"{name} must be in (0, 1], got {value!r}")
        if self.outlier_factor is not None and not self.outlier_factor > 0:
            raise ValueError(f"outlier_factor must be > 0, got {self.outlier_factor!r}")
        if self.sample_size < 1:
            raise ValueError(f"sample_size must be >= 1, got {self.sample_size!r}")

        # A bare string is one sentinel, not an iterable of characters.
        # Materialize first so that a one-shot iterator is not consumed by
        # the type check and then silently normalized to an empty tuple.
        if isinstance(self.extra_sentinels, str):
            sentinels: tuple = (self.extra_sentinels,)
        else:
            sentinels = tuple(self.extra_sentinels)
        if not all(isinstance(s, str) for s in sentinels):
            raise TypeError("extra_sentinels must be strings")
        # Normalize user-facing conveniences onto the frozen instance;
        # object.__setattr__ is required because the dataclass is frozen.
        object.__setattr__(
            self, "extra_sentinels", tuple(s.casefold().strip() for s in sentinels)
        )

        if self.duplicate_subset is not None:
            # Same rule as above: a bare string names a single column.
            if isinstance(self.duplicate_subset, str):
                subset: tuple = (self.duplicate_subset,)
            else:
                subset = tuple(self.duplicate_subset)
            object.__setattr__(self, "duplicate_subset", subset)

    @property
    def resolved_outlier_factor(self) -> float:
        """The outlier factor in effect, applying the per-method default."""
        if self.outlier_factor is not None:
            return self.outlier_factor
        return _DEFAULT_FACTOR[self.outlier_method]
108
+
109
+
110
# Every option name CleanConfig accepts, used to reject unknown keywords.
_FIELD_NAMES = frozenset(field.name for field in dataclasses.fields(CleanConfig))


def merge_options(base: CleanConfig | None, **options: object) -> CleanConfig:
    """Combine *base* (or the defaults when it is None) with keyword overrides.

    Option names are checked before anything else. Any name that is not a
    :class:`CleanConfig` field raises :class:`TypeError`, with a "did you
    mean" hint when a close match exists, so a typo can never silently fall
    back to a default.

    Raises
    ------
    TypeError
        For unknown option names, or when *base* is neither None nor a
        :class:`CleanConfig`.
    """
    unknown = sorted(name for name in options if name not in _FIELD_NAMES)
    if unknown:
        described = []
        for name in unknown:
            nearest = difflib.get_close_matches(name, _FIELD_NAMES, n=1)
            suggestion = f" (did you mean {nearest[0]!r}?)" if nearest else ""
            described.append(f"{name!r}" + suggestion)
        valid = ", ".join(sorted(_FIELD_NAMES))
        raise TypeError(
            f"unknown option(s): {', '.join(described)}. "
            f"Valid options: {valid}"
        )

    if base is None:
        return CleanConfig(**options)  # type: ignore[arg-type]
    if isinstance(base, CleanConfig):
        # replace() builds a new instance, so validation runs again.
        return dataclasses.replace(base, **options)  # type: ignore[arg-type]
    raise TypeError(f"config must be a CleanConfig, got {type(base).__name__}")