freshdata-cleaner 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- freshdata_cleaner-0.1.0/.gitignore +16 -0
- freshdata_cleaner-0.1.0/CHANGELOG.md +28 -0
- freshdata_cleaner-0.1.0/LICENSE +21 -0
- freshdata_cleaner-0.1.0/PKG-INFO +205 -0
- freshdata_cleaner-0.1.0/README.md +169 -0
- freshdata_cleaner-0.1.0/pyproject.toml +96 -0
- freshdata_cleaner-0.1.0/src/freshdata/__init__.py +39 -0
- freshdata_cleaner-0.1.0/src/freshdata/_sentinels.py +50 -0
- freshdata_cleaner-0.1.0/src/freshdata/_util.py +59 -0
- freshdata_cleaner-0.1.0/src/freshdata/api.py +88 -0
- freshdata_cleaner-0.1.0/src/freshdata/cleaner.py +127 -0
- freshdata_cleaner-0.1.0/src/freshdata/config.py +133 -0
- freshdata_cleaner-0.1.0/src/freshdata/profile.py +219 -0
- freshdata_cleaner-0.1.0/src/freshdata/py.typed +0 -0
- freshdata_cleaner-0.1.0/src/freshdata/report.py +135 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/__init__.py +8 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/columns.py +68 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/dtypes.py +240 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/duplicates.py +43 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/memory.py +89 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/missing.py +78 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/outliers.py +72 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/prune.py +58 -0
- freshdata_cleaner-0.1.0/src/freshdata/steps/strings.py +90 -0
- freshdata_cleaner-0.1.0/tests/conftest.py +29 -0
- freshdata_cleaner-0.1.0/tests/test_api.py +92 -0
- freshdata_cleaner-0.1.0/tests/test_columns.py +56 -0
- freshdata_cleaner-0.1.0/tests/test_dtypes.py +146 -0
- freshdata_cleaner-0.1.0/tests/test_memory.py +58 -0
- freshdata_cleaner-0.1.0/tests/test_missing.py +79 -0
- freshdata_cleaner-0.1.0/tests/test_outliers.py +65 -0
- freshdata_cleaner-0.1.0/tests/test_profile.py +91 -0
- freshdata_cleaner-0.1.0/tests/test_properties.py +67 -0
- freshdata_cleaner-0.1.0/tests/test_report.py +62 -0
- freshdata_cleaner-0.1.0/tests/test_rows.py +73 -0
- freshdata_cleaner-0.1.0/tests/test_strings.py +67 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [0.1.0] - 2026-06-12
|
|
8
|
+
|
|
9
|
+
Initial release.
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- `freshdata.clean()` — automatic, audited cleaning: column-name
|
|
13
|
+
normalization, whitespace stripping, sentinel-string normalization,
|
|
14
|
+
empty row/column pruning, validated dtype inference (numeric incl.
|
|
15
|
+
currency/thousands separators, datetime, boolean), and exact duplicate
|
|
16
|
+
removal.
|
|
17
|
+
- Opt-in steps: imputation (`auto`/`mean`/`median`/`mode`), outlier
|
|
18
|
+
clipping/flagging (IQR or z-score), constant-column dropping, memory
|
|
19
|
+
optimization (numeric downcasting + category conversion), index reset.
|
|
20
|
+
- `freshdata.profile()` — read-only profiling whose dtype suggestions are
|
|
21
|
+
produced by the same inference code `clean` uses.
|
|
22
|
+
- `freshdata.Cleaner` — reusable configured pipeline with `report_`.
|
|
23
|
+
- `freshdata.CleanConfig` — frozen, self-validating configuration;
|
|
24
|
+
unknown options raise with a "did you mean" suggestion.
|
|
25
|
+
- `freshdata.CleanReport` / `freshdata.Action` — structured audit trail
|
|
26
|
+
with `summary()`, `to_dict()`, `to_frame()`.
|
|
27
|
+
- Type hints throughout (`py.typed`), zero dependencies beyond
|
|
28
|
+
pandas/numpy, support for Python 3.9–3.13.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Johnny Wilson Dougherty
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: freshdata-cleaner
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fast, safe, automatic data cleaning for real-world tabular data.
|
|
5
|
+
Project-URL: Homepage, https://github.com/JohnnyWilson-Portfolio/freshdata
|
|
6
|
+
Project-URL: Repository, https://github.com/JohnnyWilson-Portfolio/freshdata
|
|
7
|
+
Project-URL: Issues, https://github.com/JohnnyWilson-Portfolio/freshdata/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/JohnnyWilson-Portfolio/freshdata/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Johnny Wilson Dougherty <jyothiswaroop2803@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: data-cleaning,data-quality,etl,pandas,preprocessing,tabular-data
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Requires-Dist: numpy>=1.21
|
|
28
|
+
Requires-Dist: pandas>=1.5
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# freshdata
|
|
38
|
+
|
|
39
|
+
**Fast, safe, automatic data cleaning for real-world tabular data.**
|
|
40
|
+
|
|
41
|
+
[CI status](https://github.com/JohnnyWilson-Portfolio/freshdata/actions/workflows/ci.yml)
|
|
42
|
+
[PyPI package](https://pypi.org/project/freshdata-cleaner/)
|
|
43
|
+
[License: MIT](LICENSE)
|
|
44
|
+
|
|
45
|
+
`freshdata` fixes the messy parts of CSV / Excel / SQL-export data — stray
|
|
46
|
+
whitespace, `"N/A"` strings, numbers stored as text, duplicate rows — in one
|
|
47
|
+
call, and tells you exactly what it did.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import pandas as pd
|
|
51
|
+
import freshdata as fd
|
|
52
|
+
|
|
53
|
+
df = pd.read_csv("export.csv")
|
|
54
|
+
|
|
55
|
+
cleaned = fd.clean(df) # one line
|
|
56
|
+
cleaned, report = fd.clean(df, report=True) # ... with a full audit trail
|
|
57
|
+
print(report.summary())
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
```text
|
|
61
|
+
freshdata clean report
|
|
62
|
+
rows: 5 -> 4 (-1)
|
|
63
|
+
columns: 6 -> 5 (-1)
|
|
64
|
+
memory: 1.5 KB -> 298 B
|
|
65
|
+
time: 0.011s
|
|
66
|
+
actions (12):
|
|
67
|
+
- [column_names] renamed 5 column(s): ' First Name '->'first_name', 'AGE'->'age', …
|
|
68
|
+
- [strip_whitespace] 'first_name': trimmed surrounding whitespace
|
|
69
|
+
- [normalize_sentinels] 'age': replaced sentinel strings ("N/A", "-", "", …) with missing
|
|
70
|
+
- [drop_empty_columns] dropped 1 all-missing column(s): empty
|
|
71
|
+
- [fix_dtypes] 'age': converted to Int64
|
|
72
|
+
- [fix_dtypes] 'joined_date': converted to datetime64[ns]
|
|
73
|
+
- [fix_dtypes] 'active': converted to bool
|
|
74
|
+
- [fix_dtypes] 'salary': converted to float64
|
|
75
|
+
- [drop_duplicates] dropped 1 duplicate row(s)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Install
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install freshdata-cleaner
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Requires Python ≥ 3.9, pandas ≥ 1.5 and numpy ≥ 1.21. No other dependencies.
|
|
85
|
+
|
|
86
|
+
## Why another cleaning library?
|
|
87
|
+
|
|
88
|
+
Most auto-cleaners are either trivial wrappers or opaque frameworks that
|
|
89
|
+
guess. `freshdata` is built on four rules:
|
|
90
|
+
|
|
91
|
+
1. **No surprises.** Defaults only repair *representation* — whitespace,
|
|
92
|
+
sentinel strings, wrong dtypes, exact duplicate rows, all-empty
|
|
93
|
+
rows/columns. Anything that changes your data's *statistics* (imputation,
|
|
94
|
+
outlier handling, lossy downcasting) is opt-in.
|
|
95
|
+
2. **Everything is reported.** Every transformation is recorded with the
|
|
96
|
+
column name and the number of affected cells. `bool(report)` is `False`
|
|
97
|
+
when nothing changed.
|
|
98
|
+
3. **Never mutates your input.** `clean` returns a new frame (built from a
|
|
99
|
+
shallow copy, so unchanged columns cost no extra memory). `profile` is
|
|
100
|
+
read-only.
|
|
101
|
+
4. **Fast by construction.** Vectorized pandas operations only — no
|
|
102
|
+
row-wise `apply`. Type inference pre-screens a sample, so hopeless
|
|
103
|
+
conversions are rejected at O(sample), not O(n), and conversions only
|
|
104
|
+
stick when ≥ 95 % of values parse (configurable).
|
|
105
|
+
|
|
106
|
+
## What `clean` does by default
|
|
107
|
+
|
|
108
|
+
| order | step | what it does |
|
|
109
|
+
|---|---|---|
|
|
110
|
+
| 1 | `column_names` | snake_case names, deduplicate collisions (`"a", "a"` → `"a", "a_2"`) |
|
|
111
|
+
| 2 | `strip_whitespace` | trim surrounding whitespace in text cells (internal spacing kept) |
|
|
112
|
+
| 3 | `normalize_sentinels` | `"N/A"`, `"null"`, `"-"`, `""`, `"#REF!"`, … → missing |
|
|
113
|
+
| 4 | `drop_empty_columns` / `drop_empty_rows` | remove all-missing columns and rows |
|
|
114
|
+
| 5 | `fix_dtypes` | text → numeric (`"$1,234.56"` works) / datetime / boolean, validated |
|
|
115
|
+
| 6 | `drop_duplicates` | drop exact duplicate rows, keep the first |
|
|
116
|
+
|
|
117
|
+
Conversions are conservative: a column converts only when at least
|
|
118
|
+
`numeric_threshold` (default 0.95) of its non-missing values parse, mixed-type
|
|
119
|
+
columns never lose their non-string values, and every value coerced to missing
|
|
120
|
+
is counted in the report.
|
|
121
|
+
|
|
122
|
+
## Opt-in steps
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
fd.clean(
|
|
126
|
+
df,
|
|
127
|
+
impute="auto", # median for numeric, mode otherwise ("mean"/"median"/"mode")
|
|
128
|
+
outliers="clip", # or "flag" to add a boolean <col>_outlier column
|
|
129
|
+
outlier_method="iqr", # or "zscore"; factors default to 1.5 / 3.0
|
|
130
|
+
drop_constant_columns=True, # single-valued columns
|
|
131
|
+
optimize_memory=True, # downcast numerics, categorize low-cardinality text
|
|
132
|
+
reset_index=True, # 0..n-1 index instead of original labels
|
|
133
|
+
)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Every option lives on one frozen dataclass — `fd.CleanConfig` — and unknown
|
|
137
|
+
names fail immediately with a "did you mean" suggestion:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
config = fd.CleanConfig(drop_duplicates=False, extra_sentinels=("unknown",))
|
|
141
|
+
fd.clean(df, config=config, impute="median") # config + overrides
|
|
142
|
+
|
|
143
|
+
cleaner = fd.Cleaner(impute="median") # reusable pipeline
|
|
144
|
+
for path in paths:
|
|
145
|
+
out = cleaner.clean(pd.read_csv(path))
|
|
146
|
+
log.info(cleaner.report_.summary())
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Profiling
|
|
150
|
+
|
|
151
|
+
`fd.profile(df)` inspects without changing anything — and because it runs the
|
|
152
|
+
*same* inference code as `clean`, its suggestions are a faithful preview:
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
print(fd.profile(df))
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
```text
|
|
159
|
+
freshdata profile — 5 rows x 6 columns, 1.5 KB
|
|
160
|
+
missing cells: 6 (20.0%) duplicate rows: 1
|
|
161
|
+
column dtype missing issues
|
|
162
|
+
First Name object 20% 20.0% missing; 1 value(s) with surrounding whitespace; …
|
|
163
|
+
AGE object - 1 sentinel value(s) meaning missing; would convert to Int64
|
|
164
|
+
Joined Date object - would convert to datetime64[ns]
|
|
165
|
+
Active object - would convert to bool
|
|
166
|
+
Salary($) object - would convert to float64
|
|
167
|
+
empty object 100% 100.0% missing; constant column
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
`profile.to_frame()` gives the same as a DataFrame; `profile.to_dict()` is
|
|
171
|
+
JSON-friendly for logging and data-quality dashboards.
|
|
172
|
+
|
|
173
|
+
## What freshdata will not do
|
|
174
|
+
|
|
175
|
+
- Guess at fuzzy entity resolution ("Jon" vs "John").
|
|
176
|
+
- Impute, drop outliers, or change distributions unless you ask.
|
|
177
|
+
- Parse ambiguous European decimal commas (`"1.234,56"`) — too risky to guess.
|
|
178
|
+
- Mutate your DataFrame, ever.
|
|
179
|
+
|
|
180
|
+
## API
|
|
181
|
+
|
|
182
|
+
| name | purpose |
|
|
183
|
+
|---|---|
|
|
184
|
+
| `fd.clean(df, *, report=False, config=None, **options)` | clean, optionally returning a `CleanReport` |
|
|
185
|
+
| `fd.profile(df, *, config=None, **options)` | read-only inspection with actionable issues |
|
|
186
|
+
| `fd.Cleaner(config=None, **options)` | reusable configured pipeline (`.clean()`, `.report_`) |
|
|
187
|
+
| `fd.CleanConfig` | frozen dataclass holding every option |
|
|
188
|
+
| `fd.CleanReport` / `fd.Action` | audit trail (`summary()`, `to_dict()`, `to_frame()`) |
|
|
189
|
+
| `fd.Profile` / `fd.ColumnProfile` | profiling results |
|
|
190
|
+
|
|
191
|
+
## Development
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
git clone https://github.com/JohnnyWilson-Portfolio/freshdata
|
|
195
|
+
cd freshdata
|
|
196
|
+
pip install -e ".[dev]"
|
|
197
|
+
pytest
|
|
198
|
+
ruff check src tests
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Benchmarks live in `benchmarks/bench.py` (`python benchmarks/bench.py`).
|
|
202
|
+
|
|
203
|
+
## License
|
|
204
|
+
|
|
205
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# freshdata
|
|
2
|
+
|
|
3
|
+
**Fast, safe, automatic data cleaning for real-world tabular data.**
|
|
4
|
+
|
|
5
|
+
[CI status](https://github.com/JohnnyWilson-Portfolio/freshdata/actions/workflows/ci.yml)
|
|
6
|
+
[PyPI package](https://pypi.org/project/freshdata-cleaner/)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
`freshdata` fixes the messy parts of CSV / Excel / SQL-export data — stray
|
|
10
|
+
whitespace, `"N/A"` strings, numbers stored as text, duplicate rows — in one
|
|
11
|
+
call, and tells you exactly what it did.
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import freshdata as fd
|
|
16
|
+
|
|
17
|
+
df = pd.read_csv("export.csv")
|
|
18
|
+
|
|
19
|
+
cleaned = fd.clean(df) # one line
|
|
20
|
+
cleaned, report = fd.clean(df, report=True) # ... with a full audit trail
|
|
21
|
+
print(report.summary())
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
```text
|
|
25
|
+
freshdata clean report
|
|
26
|
+
rows: 5 -> 4 (-1)
|
|
27
|
+
columns: 6 -> 5 (-1)
|
|
28
|
+
memory: 1.5 KB -> 298 B
|
|
29
|
+
time: 0.011s
|
|
30
|
+
actions (12):
|
|
31
|
+
- [column_names] renamed 5 column(s): ' First Name '->'first_name', 'AGE'->'age', …
|
|
32
|
+
- [strip_whitespace] 'first_name': trimmed surrounding whitespace
|
|
33
|
+
- [normalize_sentinels] 'age': replaced sentinel strings ("N/A", "-", "", …) with missing
|
|
34
|
+
- [drop_empty_columns] dropped 1 all-missing column(s): empty
|
|
35
|
+
- [fix_dtypes] 'age': converted to Int64
|
|
36
|
+
- [fix_dtypes] 'joined_date': converted to datetime64[ns]
|
|
37
|
+
- [fix_dtypes] 'active': converted to bool
|
|
38
|
+
- [fix_dtypes] 'salary': converted to float64
|
|
39
|
+
- [drop_duplicates] dropped 1 duplicate row(s)
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install freshdata-cleaner
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Requires Python ≥ 3.9, pandas ≥ 1.5 and numpy ≥ 1.21. No other dependencies.
|
|
49
|
+
|
|
50
|
+
## Why another cleaning library?
|
|
51
|
+
|
|
52
|
+
Most auto-cleaners are either trivial wrappers or opaque frameworks that
|
|
53
|
+
guess. `freshdata` is built on four rules:
|
|
54
|
+
|
|
55
|
+
1. **No surprises.** Defaults only repair *representation* — whitespace,
|
|
56
|
+
sentinel strings, wrong dtypes, exact duplicate rows, all-empty
|
|
57
|
+
rows/columns. Anything that changes your data's *statistics* (imputation,
|
|
58
|
+
outlier handling, lossy downcasting) is opt-in.
|
|
59
|
+
2. **Everything is reported.** Every transformation is recorded with the
|
|
60
|
+
column name and the number of affected cells. `bool(report)` is `False`
|
|
61
|
+
when nothing changed.
|
|
62
|
+
3. **Never mutates your input.** `clean` returns a new frame (built from a
|
|
63
|
+
shallow copy, so unchanged columns cost no extra memory). `profile` is
|
|
64
|
+
read-only.
|
|
65
|
+
4. **Fast by construction.** Vectorized pandas operations only — no
|
|
66
|
+
row-wise `apply`. Type inference pre-screens a sample, so hopeless
|
|
67
|
+
conversions are rejected at O(sample), not O(n), and conversions only
|
|
68
|
+
stick when ≥ 95 % of values parse (configurable).
|
|
69
|
+
|
|
70
|
+
## What `clean` does by default
|
|
71
|
+
|
|
72
|
+
| order | step | what it does |
|
|
73
|
+
|---|---|---|
|
|
74
|
+
| 1 | `column_names` | snake_case names, deduplicate collisions (`"a", "a"` → `"a", "a_2"`) |
|
|
75
|
+
| 2 | `strip_whitespace` | trim surrounding whitespace in text cells (internal spacing kept) |
|
|
76
|
+
| 3 | `normalize_sentinels` | `"N/A"`, `"null"`, `"-"`, `""`, `"#REF!"`, … → missing |
|
|
77
|
+
| 4 | `drop_empty_columns` / `drop_empty_rows` | remove all-missing columns and rows |
|
|
78
|
+
| 5 | `fix_dtypes` | text → numeric (`"$1,234.56"` works) / datetime / boolean, validated |
|
|
79
|
+
| 6 | `drop_duplicates` | drop exact duplicate rows, keep the first |
|
|
80
|
+
|
|
81
|
+
Conversions are conservative: a column converts only when at least
|
|
82
|
+
`numeric_threshold` (default 0.95) of its non-missing values parse, mixed-type
|
|
83
|
+
columns never lose their non-string values, and every value coerced to missing
|
|
84
|
+
is counted in the report.
|
|
85
|
+
|
|
86
|
+
## Opt-in steps
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
fd.clean(
|
|
90
|
+
df,
|
|
91
|
+
impute="auto", # median for numeric, mode otherwise ("mean"/"median"/"mode")
|
|
92
|
+
outliers="clip", # or "flag" to add a boolean <col>_outlier column
|
|
93
|
+
outlier_method="iqr", # or "zscore"; factors default to 1.5 / 3.0
|
|
94
|
+
drop_constant_columns=True, # single-valued columns
|
|
95
|
+
optimize_memory=True, # downcast numerics, categorize low-cardinality text
|
|
96
|
+
reset_index=True, # 0..n-1 index instead of original labels
|
|
97
|
+
)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Every option lives on one frozen dataclass — `fd.CleanConfig` — and unknown
|
|
101
|
+
names fail immediately with a "did you mean" suggestion:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
config = fd.CleanConfig(drop_duplicates=False, extra_sentinels=("unknown",))
|
|
105
|
+
fd.clean(df, config=config, impute="median") # config + overrides
|
|
106
|
+
|
|
107
|
+
cleaner = fd.Cleaner(impute="median") # reusable pipeline
|
|
108
|
+
for path in paths:
|
|
109
|
+
out = cleaner.clean(pd.read_csv(path))
|
|
110
|
+
log.info(cleaner.report_.summary())
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Profiling
|
|
114
|
+
|
|
115
|
+
`fd.profile(df)` inspects without changing anything — and because it runs the
|
|
116
|
+
*same* inference code as `clean`, its suggestions are a faithful preview:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
print(fd.profile(df))
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
```text
|
|
123
|
+
freshdata profile — 5 rows x 6 columns, 1.5 KB
|
|
124
|
+
missing cells: 6 (20.0%) duplicate rows: 1
|
|
125
|
+
column dtype missing issues
|
|
126
|
+
First Name object 20% 20.0% missing; 1 value(s) with surrounding whitespace; …
|
|
127
|
+
AGE object - 1 sentinel value(s) meaning missing; would convert to Int64
|
|
128
|
+
Joined Date object - would convert to datetime64[ns]
|
|
129
|
+
Active object - would convert to bool
|
|
130
|
+
Salary($) object - would convert to float64
|
|
131
|
+
empty object 100% 100.0% missing; constant column
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
`profile.to_frame()` gives the same as a DataFrame; `profile.to_dict()` is
|
|
135
|
+
JSON-friendly for logging and data-quality dashboards.
|
|
136
|
+
|
|
137
|
+
## What freshdata will not do
|
|
138
|
+
|
|
139
|
+
- Guess at fuzzy entity resolution ("Jon" vs "John").
|
|
140
|
+
- Impute, drop outliers, or change distributions unless you ask.
|
|
141
|
+
- Parse ambiguous European decimal commas (`"1.234,56"`) — too risky to guess.
|
|
142
|
+
- Mutate your DataFrame, ever.
|
|
143
|
+
|
|
144
|
+
## API
|
|
145
|
+
|
|
146
|
+
| name | purpose |
|
|
147
|
+
|---|---|
|
|
148
|
+
| `fd.clean(df, *, report=False, config=None, **options)` | clean, optionally returning a `CleanReport` |
|
|
149
|
+
| `fd.profile(df, *, config=None, **options)` | read-only inspection with actionable issues |
|
|
150
|
+
| `fd.Cleaner(config=None, **options)` | reusable configured pipeline (`.clean()`, `.report_`) |
|
|
151
|
+
| `fd.CleanConfig` | frozen dataclass holding every option |
|
|
152
|
+
| `fd.CleanReport` / `fd.Action` | audit trail (`summary()`, `to_dict()`, `to_frame()`) |
|
|
153
|
+
| `fd.Profile` / `fd.ColumnProfile` | profiling results |
|
|
154
|
+
|
|
155
|
+
## Development
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
git clone https://github.com/JohnnyWilson-Portfolio/freshdata
|
|
159
|
+
cd freshdata
|
|
160
|
+
pip install -e ".[dev]"
|
|
161
|
+
pytest
|
|
162
|
+
ruff check src tests
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Benchmarks live in `benchmarks/bench.py` (`python benchmarks/bench.py`).
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.21"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "freshdata-cleaner"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Fast, safe, automatic data cleaning for real-world tabular data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Johnny Wilson Dougherty", email = "jyothiswaroop2803@gmail.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"data-cleaning",
|
|
17
|
+
"data-quality",
|
|
18
|
+
"pandas",
|
|
19
|
+
"preprocessing",
|
|
20
|
+
"etl",
|
|
21
|
+
"tabular-data",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 4 - Beta",
|
|
25
|
+
"Intended Audience :: Developers",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"License :: OSI Approved :: MIT License",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
"Programming Language :: Python :: 3",
|
|
30
|
+
"Programming Language :: Python :: 3.9",
|
|
31
|
+
"Programming Language :: Python :: 3.10",
|
|
32
|
+
"Programming Language :: Python :: 3.11",
|
|
33
|
+
"Programming Language :: Python :: 3.12",
|
|
34
|
+
"Programming Language :: Python :: 3.13",
|
|
35
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
36
|
+
"Typing :: Typed",
|
|
37
|
+
]
|
|
38
|
+
dependencies = [
|
|
39
|
+
"pandas>=1.5",
|
|
40
|
+
"numpy>=1.21",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.optional-dependencies]
|
|
44
|
+
dev = [
|
|
45
|
+
"pytest>=7.0",
|
|
46
|
+
"pytest-cov>=4.0",
|
|
47
|
+
"ruff>=0.4",
|
|
48
|
+
"mypy>=1.8",
|
|
49
|
+
"build>=1.0",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[project.urls]
|
|
53
|
+
Homepage = "https://github.com/JohnnyWilson-Portfolio/freshdata"
|
|
54
|
+
Repository = "https://github.com/JohnnyWilson-Portfolio/freshdata"
|
|
55
|
+
Issues = "https://github.com/JohnnyWilson-Portfolio/freshdata/issues"
|
|
56
|
+
Changelog = "https://github.com/JohnnyWilson-Portfolio/freshdata/blob/main/CHANGELOG.md"
|
|
57
|
+
|
|
58
|
+
[tool.hatch.build.targets.wheel]
|
|
59
|
+
packages = ["src/freshdata"]
|
|
60
|
+
|
|
61
|
+
[tool.hatch.build.targets.sdist]
|
|
62
|
+
include = ["src", "tests", "README.md", "CHANGELOG.md", "LICENSE"]
|
|
63
|
+
|
|
64
|
+
[tool.pytest.ini_options]
|
|
65
|
+
testpaths = ["tests"]
|
|
66
|
+
addopts = "-q --strict-markers"
|
|
67
|
+
xfail_strict = true
|
|
68
|
+
|
|
69
|
+
[tool.ruff]
|
|
70
|
+
line-length = 99
|
|
71
|
+
target-version = "py39"
|
|
72
|
+
src = ["src", "tests"]
|
|
73
|
+
|
|
74
|
+
[tool.ruff.lint]
|
|
75
|
+
select = ["E", "F", "W", "I", "UP", "B", "SIM", "C4", "RET", "PL"]
|
|
76
|
+
ignore = [
|
|
77
|
+
"PLR0911", # too many returns — dtype heuristics are naturally branchy
|
|
78
|
+
"PLR0912", # too many branches — cleaning steps are naturally branchy
|
|
79
|
+
"PLR0913", # many keyword options is the point of the config surface
|
|
80
|
+
"PLR2004", # magic-value comparisons in heuristics are documented inline
|
|
81
|
+
]
|
|
82
|
+
|
|
83
|
+
[tool.ruff.lint.per-file-ignores]
|
|
84
|
+
"tests/*" = ["PLR2004", "SIM117"]
|
|
85
|
+
|
|
86
|
+
[tool.mypy]
|
|
87
|
+
python_version = "3.9"
|
|
88
|
+
strict = false
|
|
89
|
+
warn_unused_ignores = true
|
|
90
|
+
warn_redundant_casts = true
|
|
91
|
+
no_implicit_optional = true
|
|
92
|
+
files = ["src/freshdata"]
|
|
93
|
+
|
|
94
|
+
[[tool.mypy.overrides]]
|
|
95
|
+
module = ["pandas.*", "numpy.*"]
|
|
96
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""freshdata — fast, safe, automatic data cleaning for real-world tabular data.
|
|
2
|
+
|
|
3
|
+
>>> import freshdata as fd
|
|
4
|
+
>>> cleaned = fd.clean(df)
|
|
5
|
+
>>> cleaned, report = fd.clean(df, report=True)
|
|
6
|
+
>>> print(fd.profile(df))
|
|
7
|
+
|
|
8
|
+
Design principles
|
|
9
|
+
-----------------
|
|
10
|
+
- **No surprises.** Defaults only fix representation (whitespace, sentinel
|
|
11
|
+
strings, wrong dtypes, exact duplicates, empty rows/columns). Anything that
|
|
12
|
+
changes your data's statistics is opt-in.
|
|
13
|
+
- **Everything is reported.** Each transformation is recorded with the column
|
|
14
|
+
and the number of affected cells.
|
|
15
|
+
- **Never mutates input.** ``clean`` returns a new frame; profiling is
|
|
16
|
+
read-only.
|
|
17
|
+
- **Fast by construction.** Vectorized pandas operations only, with
|
|
18
|
+
sample-based pre-screening so type inference stays cheap on large frames.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from .api import clean, profile
|
|
22
|
+
from .cleaner import Cleaner
|
|
23
|
+
from .config import CleanConfig
|
|
24
|
+
from .profile import ColumnProfile, Profile
|
|
25
|
+
from .report import Action, CleanReport
|
|
26
|
+
|
|
27
|
+
__version__ = "0.1.0"
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"Action",
|
|
31
|
+
"CleanConfig",
|
|
32
|
+
"CleanReport",
|
|
33
|
+
"Cleaner",
|
|
34
|
+
"ColumnProfile",
|
|
35
|
+
"Profile",
|
|
36
|
+
"__version__",
|
|
37
|
+
"clean",
|
|
38
|
+
"profile",
|
|
39
|
+
]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Registry of string values that conventionally mean "missing".
|
|
2
|
+
|
|
3
|
+
All entries are stored casefolded; matching is case-insensitive and happens
|
|
4
|
+
after whitespace stripping, so ``" N/A "`` and ``"n/a"`` both match.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
#: Values commonly used in CSV / Excel / SQL exports to denote a missing cell.
|
|
10
|
+
#: Deliberately conservative: entries here are near-certain to mean "missing"
|
|
11
|
+
#: when they appear as the entire cell value. Domain words that merely *might*
|
|
12
|
+
#: mean missing (e.g. ``"unknown"``) are excluded; pass them via the
|
|
13
|
+
#: ``extra_sentinels`` option instead.
|
|
14
|
+
DEFAULT_SENTINELS: frozenset[str] = frozenset(
|
|
15
|
+
{
|
|
16
|
+
# empty / placeholder punctuation
|
|
17
|
+
"",
|
|
18
|
+
"-",
|
|
19
|
+
"--",
|
|
20
|
+
"---",
|
|
21
|
+
"?",
|
|
22
|
+
"??",
|
|
23
|
+
# spelled-out missing markers
|
|
24
|
+
"na",
|
|
25
|
+
"n/a",
|
|
26
|
+
"n\\a",
|
|
27
|
+
"n.a",
|
|
28
|
+
"n.a.",
|
|
29
|
+
"nan",
|
|
30
|
+
"null",
|
|
31
|
+
"none",
|
|
32
|
+
"nil",
|
|
33
|
+
"missing",
|
|
34
|
+
"(null)",
|
|
35
|
+
"(none)",
|
|
36
|
+
"(blank)",
|
|
37
|
+
"(empty)",
|
|
38
|
+
"(missing)",
|
|
39
|
+
# Excel error codes — never legitimate data
|
|
40
|
+
"#n/a",
|
|
41
|
+
"#n/a n/a",
|
|
42
|
+
"#na",
|
|
43
|
+
"#null!",
|
|
44
|
+
"#div/0!",
|
|
45
|
+
"#ref!",
|
|
46
|
+
"#value!",
|
|
47
|
+
"#name?",
|
|
48
|
+
"#num!",
|
|
49
|
+
}
|
|
50
|
+
)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Small shared helpers. Internal — no stability guarantees."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
#: Major version of the installed pandas, for the few places behavior differs.
|
|
8
|
+
PANDAS_MAJOR: int = int(pd.__version__.split(".")[0])
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
#: Above this many rows, object payloads are estimated from a sample instead
|
|
12
|
+
#: of measured cell by cell, keeping report bookkeeping ~free on tall frames.
|
|
13
|
+
_MEMORY_SAMPLE_THRESHOLD = 200_000
|
|
14
|
+
_MEMORY_SAMPLE_SIZE = 20_000
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def memory_bytes(df: pd.DataFrame) -> int:
    """Total memory footprint of *df* in bytes, including object payloads.

    Exact for frames up to ``_MEMORY_SAMPLE_THRESHOLD`` rows. For taller
    frames the per-row payload of object/string columns is estimated from a
    reproducible random sample of ``_MEMORY_SAMPLE_SIZE`` rows; the shallow
    (buffer) size of every column and of the index is always taken exactly.

    Args:
        df: Frame to measure. It is only read, never modified.

    Returns:
        The measured (small frames) or estimated (tall frames) size in bytes.
    """
    n = len(df)
    if n <= _MEMORY_SAMPLE_THRESHOLD:
        return int(df.memory_usage(deep=True).sum())

    # Shallow sizes cover every column buffer plus the index buffer.
    total = int(df.memory_usage(deep=False).sum())

    def _estimated_payload(values: pd.Series) -> int:
        """Scale the sampled deep-minus-shallow payload of *values* up to n rows."""
        sample = values.sample(_MEMORY_SAMPLE_SIZE, random_state=0)
        # index=False: measure the values only. With the default index=True an
        # object-dtype index would leak its own payload into every column's
        # estimate, counting it once per text column.
        payload = sample.memory_usage(index=False, deep=True) - sample.memory_usage(
            index=False, deep=False
        )
        return int(payload / len(sample) * n)

    for i, dtype in enumerate(df.dtypes):
        if _is_stringlike_dtype(dtype):
            total += _estimated_payload(df.iloc[:, i])

    # Count a text index's payload exactly once, mirroring what the exact
    # branch above includes. A MultiIndex is skipped: its storage is split
    # into levels and codes, so sampling its tuples would not reflect it.
    index = df.index
    if not isinstance(index, pd.MultiIndex) and _is_stringlike_dtype(index.dtype):
        total += _estimated_payload(index.to_series(index=pd.RangeIndex(n)))

    return total
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def format_bytes(n: float) -> str:
    """Render a byte count for humans: ``format_bytes(2048) == '2.0 KB'``.

    Values below 1024 are shown as whole bytes (``'298 B'``); larger values
    use one decimal place in KB, MB, GB or TB. Negative counts keep their
    sign.

    Args:
        n: Number of bytes.

    Returns:
        The formatted size, e.g. ``'1.5 KB'``.
    """
    if abs(n) < 1024.0:
        return f"{int(n)} B"
    for unit in ("KB", "MB", "GB"):
        n /= 1024.0
        # Compare the value as it will be printed: 1023.96 KB rounds to
        # "1024.0 KB", which should roll over to "1.0 MB" instead.
        if round(abs(n), 1) < 1024.0:
            return f"{n:.1f} {unit}"
    # Anything at or beyond 1024 GB is reported in TB, however large.
    return f"{n / 1024.0:.1f} TB"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def sample_series(s: pd.Series, size: int, random_state: int) -> pd.Series:
    """Cap *s* at *size* elements using a reproducible random sample.

    A series that already has at most *size* elements is returned as the
    same object, not a copy; a longer one is sampled with *random_state* as
    the seed.
    """
    needs_sampling = len(s) > size
    return s.sample(size, random_state=random_state) if needs_sampling else s
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def stringlike_columns(df: pd.DataFrame) -> list:
    """List the labels of columns whose dtype can carry free-form text.

    A column qualifies when ``_is_stringlike_dtype`` accepts its dtype
    (object or pandas string dtype). Labels keep their original column order.
    """
    text_mask = [_is_stringlike_dtype(column_dtype) for column_dtype in df.dtypes]
    selected = df.columns[text_mask]
    return list(selected)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _is_stringlike_dtype(dtype: object) -> bool:
    """Return True when *dtype* is object dtype or a pandas ``StringDtype``."""
    if isinstance(dtype, pd.StringDtype):
        return True
    return pd.api.types.is_object_dtype(dtype)
|