freshdata-cleaner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. freshdata_cleaner-0.1.0/.gitignore +16 -0
  2. freshdata_cleaner-0.1.0/CHANGELOG.md +28 -0
  3. freshdata_cleaner-0.1.0/LICENSE +21 -0
  4. freshdata_cleaner-0.1.0/PKG-INFO +205 -0
  5. freshdata_cleaner-0.1.0/README.md +169 -0
  6. freshdata_cleaner-0.1.0/pyproject.toml +96 -0
  7. freshdata_cleaner-0.1.0/src/freshdata/__init__.py +39 -0
  8. freshdata_cleaner-0.1.0/src/freshdata/_sentinels.py +50 -0
  9. freshdata_cleaner-0.1.0/src/freshdata/_util.py +59 -0
  10. freshdata_cleaner-0.1.0/src/freshdata/api.py +88 -0
  11. freshdata_cleaner-0.1.0/src/freshdata/cleaner.py +127 -0
  12. freshdata_cleaner-0.1.0/src/freshdata/config.py +133 -0
  13. freshdata_cleaner-0.1.0/src/freshdata/profile.py +219 -0
  14. freshdata_cleaner-0.1.0/src/freshdata/py.typed +0 -0
  15. freshdata_cleaner-0.1.0/src/freshdata/report.py +135 -0
  16. freshdata_cleaner-0.1.0/src/freshdata/steps/__init__.py +8 -0
  17. freshdata_cleaner-0.1.0/src/freshdata/steps/columns.py +68 -0
  18. freshdata_cleaner-0.1.0/src/freshdata/steps/dtypes.py +240 -0
  19. freshdata_cleaner-0.1.0/src/freshdata/steps/duplicates.py +43 -0
  20. freshdata_cleaner-0.1.0/src/freshdata/steps/memory.py +89 -0
  21. freshdata_cleaner-0.1.0/src/freshdata/steps/missing.py +78 -0
  22. freshdata_cleaner-0.1.0/src/freshdata/steps/outliers.py +72 -0
  23. freshdata_cleaner-0.1.0/src/freshdata/steps/prune.py +58 -0
  24. freshdata_cleaner-0.1.0/src/freshdata/steps/strings.py +90 -0
  25. freshdata_cleaner-0.1.0/tests/conftest.py +29 -0
  26. freshdata_cleaner-0.1.0/tests/test_api.py +92 -0
  27. freshdata_cleaner-0.1.0/tests/test_columns.py +56 -0
  28. freshdata_cleaner-0.1.0/tests/test_dtypes.py +146 -0
  29. freshdata_cleaner-0.1.0/tests/test_memory.py +58 -0
  30. freshdata_cleaner-0.1.0/tests/test_missing.py +79 -0
  31. freshdata_cleaner-0.1.0/tests/test_outliers.py +65 -0
  32. freshdata_cleaner-0.1.0/tests/test_profile.py +91 -0
  33. freshdata_cleaner-0.1.0/tests/test_properties.py +67 -0
  34. freshdata_cleaner-0.1.0/tests/test_report.py +62 -0
  35. freshdata_cleaner-0.1.0/tests/test_rows.py +73 -0
  36. freshdata_cleaner-0.1.0/tests/test_strings.py +67 -0
@@ -0,0 +1,16 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ build/
6
+ dist/
7
+ .venv/
8
+ venv/
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .ruff_cache/
12
+ .coverage
13
+ coverage.xml
14
+ htmlcov/
15
+ .DS_Store
16
+ .ipynb_checkpoints/
@@ -0,0 +1,28 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format follows
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project
5
+ adheres to [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.1.0] - 2026-06-12
8
+
9
+ Initial release.
10
+
11
+ ### Added
12
+ - `freshdata.clean()` — automatic, audited cleaning: column-name
13
+ normalization, whitespace stripping, sentinel-string normalization,
14
+ empty row/column pruning, validated dtype inference (numeric incl.
15
+ currency/thousands separators, datetime, boolean), and exact duplicate
16
+ removal.
17
+ - Opt-in steps: imputation (`auto`/`mean`/`median`/`mode`), outlier
18
+ clipping/flagging (IQR or z-score), constant-column dropping, memory
19
+ optimization (numeric downcasting + category conversion), index reset.
20
+ - `freshdata.profile()` — read-only profiling whose dtype suggestions are
21
+ produced by the same inference code `clean` uses.
22
+ - `freshdata.Cleaner` — reusable configured pipeline with `report_`.
23
+ - `freshdata.CleanConfig` — frozen, self-validating configuration;
24
+ unknown options raise with a "did you mean" suggestion.
25
+ - `freshdata.CleanReport` / `freshdata.Action` — structured audit trail
26
+ with `summary()`, `to_dict()`, `to_frame()`.
27
+ - Type hints throughout (`py.typed`), zero dependencies beyond
28
+ pandas/numpy, support for Python 3.9–3.13.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Johnny Wilson Dougherty
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,205 @@
1
+ Metadata-Version: 2.4
2
+ Name: freshdata-cleaner
3
+ Version: 0.1.0
4
+ Summary: Fast, safe, automatic data cleaning for real-world tabular data.
5
+ Project-URL: Homepage, https://github.com/JohnnyWilson-Portfolio/freshdata
6
+ Project-URL: Repository, https://github.com/JohnnyWilson-Portfolio/freshdata
7
+ Project-URL: Issues, https://github.com/JohnnyWilson-Portfolio/freshdata/issues
8
+ Project-URL: Changelog, https://github.com/JohnnyWilson-Portfolio/freshdata/blob/main/CHANGELOG.md
9
+ Author-email: Johnny Wilson Dougherty <jyothiswaroop2803@gmail.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: data-cleaning,data-quality,etl,pandas,preprocessing,tabular-data
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.9
27
+ Requires-Dist: numpy>=1.21
28
+ Requires-Dist: pandas>=1.5
29
+ Provides-Extra: dev
30
+ Requires-Dist: build>=1.0; extra == 'dev'
31
+ Requires-Dist: mypy>=1.8; extra == 'dev'
32
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
33
+ Requires-Dist: pytest>=7.0; extra == 'dev'
34
+ Requires-Dist: ruff>=0.4; extra == 'dev'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # freshdata
38
+
39
+ **Fast, safe, automatic data cleaning for real-world tabular data.**
40
+
41
+ [![CI](https://github.com/JohnnyWilson-Portfolio/freshdata/actions/workflows/ci.yml/badge.svg)](https://github.com/JohnnyWilson-Portfolio/freshdata/actions/workflows/ci.yml)
42
+ [![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://pypi.org/project/freshdata-cleaner/)
43
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
44
+
45
+ `freshdata` fixes the messy parts of CSV / Excel / SQL-export data — stray
46
+ whitespace, `"N/A"` strings, numbers stored as text, duplicate rows — in one
47
+ call, and tells you exactly what it did.
48
+
49
+ ```python
50
+ import pandas as pd
51
+ import freshdata as fd
52
+
53
+ df = pd.read_csv("export.csv")
54
+
55
+ cleaned = fd.clean(df) # one line
56
+ cleaned, report = fd.clean(df, report=True) # ... with a full audit trail
57
+ print(report.summary())
58
+ ```
59
+
60
+ ```text
61
+ freshdata clean report
62
+ rows: 5 -> 4 (-1)
63
+ columns: 6 -> 5 (-1)
64
+ memory: 1.5 KB -> 298 B
65
+ time: 0.011s
66
+ actions (12):
67
+ - [column_names] renamed 5 column(s): ' First Name '->'first_name', 'AGE'->'age', …
68
+ - [strip_whitespace] 'first_name': trimmed surrounding whitespace
69
+ - [normalize_sentinels] 'age': replaced sentinel strings ("N/A", "-", "", …) with missing
70
+ - [drop_empty_columns] dropped 1 all-missing column(s): empty
71
+ - [fix_dtypes] 'age': converted to Int64
72
+ - [fix_dtypes] 'joined_date': converted to datetime64[ns]
73
+ - [fix_dtypes] 'active': converted to bool
74
+ - [fix_dtypes] 'salary': converted to float64
75
+ - [drop_duplicates] dropped 1 duplicate row(s)
76
+ ```
77
+
78
+ ## Install
79
+
80
+ ```bash
81
+ pip install freshdata-cleaner
82
+ ```
83
+
84
+ Requires Python ≥ 3.9, pandas ≥ 1.5 and numpy ≥ 1.21. No other dependencies.
85
+
86
+ ## Why another cleaning library?
87
+
88
+ Most auto-cleaners are either trivial wrappers or opaque frameworks that
89
+ guess. `freshdata` is built on four rules:
90
+
91
+ 1. **No surprises.** Defaults only repair *representation* — whitespace,
92
+ sentinel strings, wrong dtypes, exact duplicate rows, all-empty
93
+ rows/columns. Anything that changes your data's *statistics* (imputation,
94
+ outlier handling, lossy downcasting) is opt-in.
95
+ 2. **Everything is reported.** Every transformation is recorded with the
96
+ column name and the number of affected cells. `bool(report)` is `False`
97
+ when nothing changed.
98
+ 3. **Never mutates your input.** `clean` returns a new frame (built from a
99
+ shallow copy, so unchanged columns cost no extra memory). `profile` is
100
+ read-only.
101
+ 4. **Fast by construction.** Vectorized pandas operations only — no
102
+ row-wise `apply`. Type inference pre-screens a sample, so hopeless
103
+ conversions are rejected at O(sample), not O(n), and conversions only
104
+ stick when ≥ 95 % of values parse (configurable).
105
+
106
+ ## What `clean` does by default
107
+
108
+ | order | step | what it does |
109
+ |---|---|---|
110
+ | 1 | `column_names` | snake_case names, deduplicate collisions (`"a", "a"` → `"a", "a_2"`) |
111
+ | 2 | `strip_whitespace` | trim surrounding whitespace in text cells (internal spacing kept) |
112
+ | 3 | `normalize_sentinels` | `"N/A"`, `"null"`, `"-"`, `""`, `"#REF!"`, … → missing |
113
+ | 4 | `drop_empty_columns` / `drop_empty_rows` | remove all-missing columns and rows |
114
+ | 5 | `fix_dtypes` | text → numeric (`"$1,234.56"` works) / datetime / boolean, validated |
115
+ | 6 | `drop_duplicates` | drop exact duplicate rows, keep the first |
116
+
117
+ Conversions are conservative: a column converts only when at least
118
+ `numeric_threshold` (default 0.95) of its non-missing values parse, mixed-type
119
+ columns never lose their non-string values, and every value coerced to missing
120
+ is counted in the report.
121
+
122
+ ## Opt-in steps
123
+
124
+ ```python
125
+ fd.clean(
126
+ df,
127
+ impute="auto", # median for numeric, mode otherwise ("mean"/"median"/"mode")
128
+ outliers="clip", # or "flag" to add a boolean <col>_outlier column
129
+ outlier_method="iqr", # or "zscore"; factors default to 1.5 / 3.0
130
+ drop_constant_columns=True, # single-valued columns
131
+ optimize_memory=True, # downcast numerics, categorize low-cardinality text
132
+ reset_index=True, # 0..n-1 index instead of original labels
133
+ )
134
+ ```
135
+
136
+ Every option lives on one frozen dataclass — `fd.CleanConfig` — and unknown
137
+ names fail immediately with a "did you mean" suggestion:
138
+
139
+ ```python
140
+ config = fd.CleanConfig(drop_duplicates=False, extra_sentinels=("unknown",))
141
+ fd.clean(df, config=config, impute="median") # config + overrides
142
+
143
+ cleaner = fd.Cleaner(impute="median") # reusable pipeline
144
+ for path in paths:
145
+ out = cleaner.clean(pd.read_csv(path))
146
+ log.info(cleaner.report_.summary())
147
+ ```
148
+
149
+ ## Profiling
150
+
151
+ `fd.profile(df)` inspects without changing anything — and because it runs the
152
+ *same* inference code as `clean`, its suggestions are a faithful preview:
153
+
154
+ ```python
155
+ print(fd.profile(df))
156
+ ```
157
+
158
+ ```text
159
+ freshdata profile — 5 rows x 6 columns, 1.5 KB
160
+ missing cells: 6 (20.0%) duplicate rows: 1
161
+ column dtype missing issues
162
+ First Name object 20% 20.0% missing; 1 value(s) with surrounding whitespace; …
163
+ AGE object - 1 sentinel value(s) meaning missing; would convert to Int64
164
+ Joined Date object - would convert to datetime64[ns]
165
+ Active object - would convert to bool
166
+ Salary($) object - would convert to float64
167
+ empty object 100% 100.0% missing; constant column
168
+ ```
169
+
170
+ `profile.to_frame()` gives the same as a DataFrame; `profile.to_dict()` is
171
+ JSON-friendly for logging and data-quality dashboards.
172
+
173
+ ## What freshdata will not do
174
+
175
+ - Guess at fuzzy entity resolution ("Jon" vs "John").
176
+ - Impute, drop outliers, or change distributions unless you ask.
177
+ - Parse ambiguous European decimal commas (`"1.234,56"`) — too risky to guess.
178
+ - Mutate your DataFrame, ever.
179
+
180
+ ## API
181
+
182
+ | name | purpose |
183
+ |---|---|
184
+ | `fd.clean(df, *, report=False, config=None, **options)` | clean, optionally returning a `CleanReport` |
185
+ | `fd.profile(df, *, config=None, **options)` | read-only inspection with actionable issues |
186
+ | `fd.Cleaner(config=None, **options)` | reusable configured pipeline (`.clean()`, `.report_`) |
187
+ | `fd.CleanConfig` | frozen dataclass holding every option |
188
+ | `fd.CleanReport` / `fd.Action` | audit trail (`summary()`, `to_dict()`, `to_frame()`) |
189
+ | `fd.Profile` / `fd.ColumnProfile` | profiling results |
190
+
191
+ ## Development
192
+
193
+ ```bash
194
+ git clone https://github.com/JohnnyWilson-Portfolio/freshdata
195
+ cd freshdata
196
+ pip install -e ".[dev]"
197
+ pytest
198
+ ruff check src tests
199
+ ```
200
+
201
+ Benchmarks live in `benchmarks/bench.py` in the Git repository (the directory is not included in the source distribution); run them with `python benchmarks/bench.py`.
202
+
203
+ ## License
204
+
205
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,169 @@
1
+ # freshdata
2
+
3
+ **Fast, safe, automatic data cleaning for real-world tabular data.**
4
+
5
+ [![CI](https://github.com/JohnnyWilson-Portfolio/freshdata/actions/workflows/ci.yml/badge.svg)](https://github.com/JohnnyWilson-Portfolio/freshdata/actions/workflows/ci.yml)
6
+ [![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://pypi.org/project/freshdata-cleaner/)
7
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
8
+
9
+ `freshdata` fixes the messy parts of CSV / Excel / SQL-export data — stray
10
+ whitespace, `"N/A"` strings, numbers stored as text, duplicate rows — in one
11
+ call, and tells you exactly what it did.
12
+
13
+ ```python
14
+ import pandas as pd
15
+ import freshdata as fd
16
+
17
+ df = pd.read_csv("export.csv")
18
+
19
+ cleaned = fd.clean(df) # one line
20
+ cleaned, report = fd.clean(df, report=True) # ... with a full audit trail
21
+ print(report.summary())
22
+ ```
23
+
24
+ ```text
25
+ freshdata clean report
26
+ rows: 5 -> 4 (-1)
27
+ columns: 6 -> 5 (-1)
28
+ memory: 1.5 KB -> 298 B
29
+ time: 0.011s
30
+ actions (12):
31
+ - [column_names] renamed 5 column(s): ' First Name '->'first_name', 'AGE'->'age', …
32
+ - [strip_whitespace] 'first_name': trimmed surrounding whitespace
33
+ - [normalize_sentinels] 'age': replaced sentinel strings ("N/A", "-", "", …) with missing
34
+ - [drop_empty_columns] dropped 1 all-missing column(s): empty
35
+ - [fix_dtypes] 'age': converted to Int64
36
+ - [fix_dtypes] 'joined_date': converted to datetime64[ns]
37
+ - [fix_dtypes] 'active': converted to bool
38
+ - [fix_dtypes] 'salary': converted to float64
39
+ - [drop_duplicates] dropped 1 duplicate row(s)
40
+ ```
41
+
42
+ ## Install
43
+
44
+ ```bash
45
+ pip install freshdata-cleaner
46
+ ```
47
+
48
+ Requires Python ≥ 3.9, pandas ≥ 1.5 and numpy ≥ 1.21. No other dependencies.
49
+
50
+ ## Why another cleaning library?
51
+
52
+ Most auto-cleaners are either trivial wrappers or opaque frameworks that
53
+ guess. `freshdata` is built on four rules:
54
+
55
+ 1. **No surprises.** Defaults only repair *representation* — whitespace,
56
+ sentinel strings, wrong dtypes, exact duplicate rows, all-empty
57
+ rows/columns. Anything that changes your data's *statistics* (imputation,
58
+ outlier handling, lossy downcasting) is opt-in.
59
+ 2. **Everything is reported.** Every transformation is recorded with the
60
+ column name and the number of affected cells. `bool(report)` is `False`
61
+ when nothing changed.
62
+ 3. **Never mutates your input.** `clean` returns a new frame (built from a
63
+ shallow copy, so unchanged columns cost no extra memory). `profile` is
64
+ read-only.
65
+ 4. **Fast by construction.** Vectorized pandas operations only — no
66
+ row-wise `apply`. Type inference pre-screens a sample, so hopeless
67
+ conversions are rejected at O(sample), not O(n), and conversions only
68
+ stick when ≥ 95 % of values parse (configurable).
69
+
70
+ ## What `clean` does by default
71
+
72
+ | order | step | what it does |
73
+ |---|---|---|
74
+ | 1 | `column_names` | snake_case names, deduplicate collisions (`"a", "a"` → `"a", "a_2"`) |
75
+ | 2 | `strip_whitespace` | trim surrounding whitespace in text cells (internal spacing kept) |
76
+ | 3 | `normalize_sentinels` | `"N/A"`, `"null"`, `"-"`, `""`, `"#REF!"`, … → missing |
77
+ | 4 | `drop_empty_columns` / `drop_empty_rows` | remove all-missing columns and rows |
78
+ | 5 | `fix_dtypes` | text → numeric (`"$1,234.56"` works) / datetime / boolean, validated |
79
+ | 6 | `drop_duplicates` | drop exact duplicate rows, keep the first |
80
+
81
+ Conversions are conservative: a column converts only when at least
82
+ `numeric_threshold` (default 0.95) of its non-missing values parse, mixed-type
83
+ columns never lose their non-string values, and every value coerced to missing
84
+ is counted in the report.
85
+
86
+ ## Opt-in steps
87
+
88
+ ```python
89
+ fd.clean(
90
+ df,
91
+ impute="auto", # median for numeric, mode otherwise ("mean"/"median"/"mode")
92
+ outliers="clip", # or "flag" to add a boolean <col>_outlier column
93
+ outlier_method="iqr", # or "zscore"; factors default to 1.5 / 3.0
94
+ drop_constant_columns=True, # single-valued columns
95
+ optimize_memory=True, # downcast numerics, categorize low-cardinality text
96
+ reset_index=True, # 0..n-1 index instead of original labels
97
+ )
98
+ ```
99
+
100
+ Every option lives on one frozen dataclass — `fd.CleanConfig` — and unknown
101
+ names fail immediately with a "did you mean" suggestion:
102
+
103
+ ```python
104
+ config = fd.CleanConfig(drop_duplicates=False, extra_sentinels=("unknown",))
105
+ fd.clean(df, config=config, impute="median") # config + overrides
106
+
107
+ cleaner = fd.Cleaner(impute="median") # reusable pipeline
108
+ for path in paths:
109
+ out = cleaner.clean(pd.read_csv(path))
110
+ log.info(cleaner.report_.summary())
111
+ ```
112
+
113
+ ## Profiling
114
+
115
+ `fd.profile(df)` inspects without changing anything — and because it runs the
116
+ *same* inference code as `clean`, its suggestions are a faithful preview:
117
+
118
+ ```python
119
+ print(fd.profile(df))
120
+ ```
121
+
122
+ ```text
123
+ freshdata profile — 5 rows x 6 columns, 1.5 KB
124
+ missing cells: 6 (20.0%) duplicate rows: 1
125
+ column dtype missing issues
126
+ First Name object 20% 20.0% missing; 1 value(s) with surrounding whitespace; …
127
+ AGE object - 1 sentinel value(s) meaning missing; would convert to Int64
128
+ Joined Date object - would convert to datetime64[ns]
129
+ Active object - would convert to bool
130
+ Salary($) object - would convert to float64
131
+ empty object 100% 100.0% missing; constant column
132
+ ```
133
+
134
+ `profile.to_frame()` gives the same as a DataFrame; `profile.to_dict()` is
135
+ JSON-friendly for logging and data-quality dashboards.
136
+
137
+ ## What freshdata will not do
138
+
139
+ - Guess at fuzzy entity resolution ("Jon" vs "John").
140
+ - Impute, drop outliers, or change distributions unless you ask.
141
+ - Parse ambiguous European decimal commas (`"1.234,56"`) — too risky to guess.
142
+ - Mutate your DataFrame, ever.
143
+
144
+ ## API
145
+
146
+ | name | purpose |
147
+ |---|---|
148
+ | `fd.clean(df, *, report=False, config=None, **options)` | clean, optionally returning a `CleanReport` |
149
+ | `fd.profile(df, *, config=None, **options)` | read-only inspection with actionable issues |
150
+ | `fd.Cleaner(config=None, **options)` | reusable configured pipeline (`.clean()`, `.report_`) |
151
+ | `fd.CleanConfig` | frozen dataclass holding every option |
152
+ | `fd.CleanReport` / `fd.Action` | audit trail (`summary()`, `to_dict()`, `to_frame()`) |
153
+ | `fd.Profile` / `fd.ColumnProfile` | profiling results |
154
+
155
+ ## Development
156
+
157
+ ```bash
158
+ git clone https://github.com/JohnnyWilson-Portfolio/freshdata
159
+ cd freshdata
160
+ pip install -e ".[dev]"
161
+ pytest
162
+ ruff check src tests
163
+ ```
164
+
165
+ Benchmarks live in `benchmarks/bench.py` in the Git repository (the directory is not included in the source distribution); run them with `python benchmarks/bench.py`.
166
+
167
+ ## License
168
+
169
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,96 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.21"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "freshdata-cleaner"
7
+ version = "0.1.0"
8
+ description = "Fast, safe, automatic data cleaning for real-world tabular data."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Johnny Wilson Dougherty", email = "jyothiswaroop2803@gmail.com" },
14
+ ]
15
+ keywords = [
16
+ "data-cleaning",
17
+ "data-quality",
18
+ "pandas",
19
+ "preprocessing",
20
+ "etl",
21
+ "tabular-data",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 4 - Beta",
25
+ "Intended Audience :: Developers",
26
+ "Intended Audience :: Science/Research",
27
+ "License :: OSI Approved :: MIT License",
28
+ "Operating System :: OS Independent",
29
+ "Programming Language :: Python :: 3",
30
+ "Programming Language :: Python :: 3.9",
31
+ "Programming Language :: Python :: 3.10",
32
+ "Programming Language :: Python :: 3.11",
33
+ "Programming Language :: Python :: 3.12",
34
+ "Programming Language :: Python :: 3.13",
35
+ "Topic :: Scientific/Engineering :: Information Analysis",
36
+ "Typing :: Typed",
37
+ ]
38
+ dependencies = [
39
+ "pandas>=1.5",
40
+ "numpy>=1.21",
41
+ ]
42
+
43
+ [project.optional-dependencies]
44
+ dev = [
45
+ "pytest>=7.0",
46
+ "pytest-cov>=4.0",
47
+ "ruff>=0.4",
48
+ "mypy>=1.8",
49
+ "build>=1.0",
50
+ ]
51
+
52
+ [project.urls]
53
+ Homepage = "https://github.com/JohnnyWilson-Portfolio/freshdata"
54
+ Repository = "https://github.com/JohnnyWilson-Portfolio/freshdata"
55
+ Issues = "https://github.com/JohnnyWilson-Portfolio/freshdata/issues"
56
+ Changelog = "https://github.com/JohnnyWilson-Portfolio/freshdata/blob/main/CHANGELOG.md"
57
+
58
+ [tool.hatch.build.targets.wheel]
59
+ packages = ["src/freshdata"]
60
+
61
+ [tool.hatch.build.targets.sdist]
62
+ include = ["src", "tests", "README.md", "CHANGELOG.md", "LICENSE"]
63
+
64
+ [tool.pytest.ini_options]
65
+ testpaths = ["tests"]
66
+ addopts = "-q --strict-markers"
67
+ xfail_strict = true
68
+
69
+ [tool.ruff]
70
+ line-length = 99
71
+ target-version = "py39"
72
+ src = ["src", "tests"]
73
+
74
+ [tool.ruff.lint]
75
+ select = ["E", "F", "W", "I", "UP", "B", "SIM", "C4", "RET", "PL"]
76
+ ignore = [
77
+ "PLR0911", # too many returns — dtype heuristics are naturally branchy
78
+ "PLR0912", # too many branches — cleaning steps are naturally branchy
79
+ "PLR0913", # many keyword options is the point of the config surface
80
+ "PLR2004", # magic-value comparisons in heuristics are documented inline
81
+ ]
82
+
83
+ [tool.ruff.lint.per-file-ignores]
84
+ "tests/*" = ["PLR2004", "SIM117"]
85
+
86
+ [tool.mypy]
87
+ python_version = "3.9"
88
+ strict = false
89
+ warn_unused_ignores = true
90
+ warn_redundant_casts = true
91
+ no_implicit_optional = true
92
+ files = ["src/freshdata"]
93
+
94
+ [[tool.mypy.overrides]]
95
+ module = ["pandas.*", "numpy.*"]
96
+ ignore_missing_imports = true
@@ -0,0 +1,39 @@
1
+ """freshdata — fast, safe, automatic data cleaning for real-world tabular data.
2
+
3
+ >>> import freshdata as fd
4
+ >>> cleaned = fd.clean(df)
5
+ >>> cleaned, report = fd.clean(df, report=True)
6
+ >>> print(fd.profile(df))
7
+
8
+ Design principles
9
+ -----------------
10
+ - **No surprises.** Defaults only fix representation (whitespace, sentinel
11
+ strings, wrong dtypes, exact duplicates, empty rows/columns). Anything that
12
+ changes your data's statistics is opt-in.
13
+ - **Everything is reported.** Each transformation is recorded with the column
14
+ and the number of affected cells.
15
+ - **Never mutates input.** ``clean`` returns a new frame; profiling is
16
+ read-only.
17
+ - **Fast by construction.** Vectorized pandas operations only, with
18
+ sample-based pre-screening so type inference stays cheap on large frames.
19
+ """
20
+
21
+ from .api import clean, profile
22
+ from .cleaner import Cleaner
23
+ from .config import CleanConfig
24
+ from .profile import ColumnProfile, Profile
25
+ from .report import Action, CleanReport
26
+
27
+ __version__ = "0.1.0"
28
+
29
+ __all__ = [
30
+ "Action",
31
+ "CleanConfig",
32
+ "CleanReport",
33
+ "Cleaner",
34
+ "ColumnProfile",
35
+ "Profile",
36
+ "__version__",
37
+ "clean",
38
+ "profile",
39
+ ]
@@ -0,0 +1,50 @@
1
+ """Registry of string values that conventionally mean "missing".
2
+
3
+ All entries are stored casefolded; matching is case-insensitive and happens
4
+ after whitespace stripping, so ``" N/A "`` and ``"n/a"`` both match.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ #: Values commonly used in CSV / Excel / SQL exports to denote a missing cell.
10
+ #: Deliberately conservative: entries here are near-certain to mean "missing"
11
+ #: when they appear as the entire cell value. Domain words that merely *might*
12
+ #: mean missing (e.g. ``"unknown"``) are excluded; pass them via the
13
+ #: ``extra_sentinels`` option instead.
14
+ DEFAULT_SENTINELS: frozenset[str] = frozenset(
15
+ {
16
+ # empty / placeholder punctuation
17
+ "",
18
+ "-",
19
+ "--",
20
+ "---",
21
+ "?",
22
+ "??",
23
+ # spelled-out missing markers
24
+ "na",
25
+ "n/a",
26
+ "n\\a",
27
+ "n.a",
28
+ "n.a.",
29
+ "nan",
30
+ "null",
31
+ "none",
32
+ "nil",
33
+ "missing",
34
+ "(null)",
35
+ "(none)",
36
+ "(blank)",
37
+ "(empty)",
38
+ "(missing)",
39
+ # Excel error codes — never legitimate data
40
+ "#n/a",
41
+ "#n/a n/a",
42
+ "#na",
43
+ "#null!",
44
+ "#div/0!",
45
+ "#ref!",
46
+ "#value!",
47
+ "#name?",
48
+ "#num!",
49
+ }
50
+ )
@@ -0,0 +1,59 @@
1
+ """Small shared helpers. Internal — no stability guarantees."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ #: Major version of the installed pandas, for the few places behavior differs.
8
+ PANDAS_MAJOR: int = int(pd.__version__.split(".")[0])
9
+
10
+
11
+ #: Above this many rows, object payloads are estimated from a sample instead
12
+ #: of measured cell by cell, keeping report bookkeeping ~free on tall frames.
13
+ _MEMORY_SAMPLE_THRESHOLD = 200_000
14
+ _MEMORY_SAMPLE_SIZE = 20_000
15
+
16
+
17
def memory_bytes(df: pd.DataFrame) -> int:
    """Return the memory footprint of *df* in bytes, object payloads included.

    Frames with at most ``_MEMORY_SAMPLE_THRESHOLD`` rows are measured exactly
    with a deep ``memory_usage``. Taller frames start from the shallow
    measurement and, for every object/string column, add an estimate of the
    per-value payload extrapolated from a reproducible random sample of
    ``_MEMORY_SAMPLE_SIZE`` rows.
    """
    row_count = len(df)
    if row_count <= _MEMORY_SAMPLE_THRESHOLD:
        return int(df.memory_usage(deep=True).sum())

    # Shallow size is cheap to compute; only text-like columns need the
    # expensive deep inspection, and that is done on a sample.
    estimate = int(df.memory_usage(deep=False).sum())
    text_positions = [
        position
        for position, dtype in enumerate(df.dtypes)
        if _is_stringlike_dtype(dtype)
    ]
    for position in text_positions:
        drawn = df.iloc[:, position].sample(_MEMORY_SAMPLE_SIZE, random_state=0)
        # NOTE(review): both measurements include the sampled index, so for an
        # object-dtype index its deep payload ends up in this difference too —
        # confirm that is intended.
        deep_size = drawn.memory_usage(deep=True)
        shallow_size = drawn.memory_usage(deep=False)
        estimate += int((deep_size - shallow_size) / len(drawn) * row_count)
    return estimate
35
+
36
+
37
def format_bytes(n: float) -> str:
    """Render a byte count for humans: ``format_bytes(2048) == '2.0 KB'``.

    Magnitudes below 1024 are shown as whole bytes (any fraction is
    truncated); larger ones are shown with one decimal in KB, MB, GB or TB.
    TB is the largest unit, so bigger quantities are still expressed in TB.

    The unit is chosen from the value as it will be displayed: a quantity
    that would round up to ``1024.0`` of one unit is promoted to the next
    unit, so ``format_bytes(1024**2 - 1)`` is ``'1.0 MB'`` rather than
    ``'1024.0 KB'``.
    """
    if abs(n) < 1024.0:
        return f"{int(n)} B"
    for unit in ("KB", "MB", "GB"):
        n /= 1024.0
        # Compare the rounded value because that is what ``.1f`` prints.
        if abs(round(n, 1)) < 1024.0:
            return f"{n:.1f} {unit}"
    return f"{n / 1024.0:.1f} TB"
44
+
45
+
46
def sample_series(s: pd.Series, size: int, random_state: int) -> pd.Series:
    """Cap *s* at *size* values.

    A series that already has at most *size* values is handed back as the
    very same object (no copy). A longer one is replaced by a random sample
    of *size* values drawn with the given *random_state*, so repeated calls
    with the same seed pick the same rows.
    """
    already_small = len(s) <= size
    return s if already_small else s.sample(size, random_state=random_state)
51
+
52
+
53
def stringlike_columns(df: pd.DataFrame) -> list:
    """List the labels of columns that can hold free-form text.

    A column qualifies when its dtype is ``object`` or a pandas
    ``StringDtype``. Labels are returned in column order.
    """
    return [
        label
        for label, dtype in zip(df.columns, df.dtypes)
        if _is_stringlike_dtype(dtype)
    ]


def _is_stringlike_dtype(dtype: object) -> bool:
    """Tell whether *dtype* is ``object`` or a pandas ``StringDtype``."""
    if isinstance(dtype, pd.StringDtype):
        return True
    return pd.api.types.is_object_dtype(dtype)