messy-table 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- messy_table-0.1.0/.gitignore +33 -0
- messy_table-0.1.0/CHANGELOG.md +43 -0
- messy_table-0.1.0/LICENSE +21 -0
- messy_table-0.1.0/PKG-INFO +176 -0
- messy_table-0.1.0/README.md +140 -0
- messy_table-0.1.0/docs/heuristics.md +123 -0
- messy_table-0.1.0/pyproject.toml +136 -0
- messy_table-0.1.0/src/messy_table/__init__.py +52 -0
- messy_table-0.1.0/src/messy_table/api.py +96 -0
- messy_table-0.1.0/src/messy_table/config.py +75 -0
- messy_table-0.1.0/src/messy_table/context.py +63 -0
- messy_table-0.1.0/src/messy_table/detectors/__init__.py +10 -0
- messy_table-0.1.0/src/messy_table/detectors/header.py +115 -0
- messy_table-0.1.0/src/messy_table/detectors/table_end.py +104 -0
- messy_table-0.1.0/src/messy_table/detectors/table_start.py +81 -0
- messy_table-0.1.0/src/messy_table/exceptions.py +40 -0
- messy_table-0.1.0/src/messy_table/grid.py +133 -0
- messy_table-0.1.0/src/messy_table/py.typed +0 -0
- messy_table-0.1.0/src/messy_table/readers/__init__.py +84 -0
- messy_table-0.1.0/src/messy_table/readers/csv.py +109 -0
- messy_table-0.1.0/src/messy_table/readers/xlsx.py +243 -0
- messy_table-0.1.0/src/messy_table/report.py +212 -0
- messy_table-0.1.0/src/messy_table/result.py +68 -0
- messy_table-0.1.0/src/messy_table/transformers/__init__.py +23 -0
- messy_table-0.1.0/src/messy_table/transformers/dates.py +114 -0
- messy_table-0.1.0/src/messy_table/transformers/header_names.py +74 -0
- messy_table-0.1.0/src/messy_table/transformers/merged_cells.py +41 -0
- messy_table-0.1.0/src/messy_table/transformers/nulls.py +66 -0
- messy_table-0.1.0/src/messy_table/transformers/numbers.py +160 -0
- messy_table-0.1.0/src/messy_table/transformers/types.py +163 -0
- messy_table-0.1.0/src/messy_table/util.py +143 -0
- messy_table-0.1.0/tests/_fixtures.py +335 -0
- messy_table-0.1.0/tests/conftest.py +15 -0
- messy_table-0.1.0/tests/test_config.py +32 -0
- messy_table-0.1.0/tests/test_detectors.py +71 -0
- messy_table-0.1.0/tests/test_end_to_end.py +26 -0
- messy_table-0.1.0/tests/test_perf.py +83 -0
- messy_table-0.1.0/tests/test_readers.py +129 -0
- messy_table-0.1.0/tests/test_report.py +48 -0
- messy_table-0.1.0/tests/test_result.py +52 -0
- messy_table-0.1.0/tests/test_security.py +98 -0
- messy_table-0.1.0/tests/test_transformers.py +106 -0
- messy_table-0.1.0/tests/test_units.py +87 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
*.so
|
|
9
|
+
|
|
10
|
+
# Virtualenvs
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
env/
|
|
14
|
+
|
|
15
|
+
# Tooling caches
|
|
16
|
+
.pytest_cache/
|
|
17
|
+
.mypy_cache/
|
|
18
|
+
.ruff_cache/
|
|
19
|
+
.coverage
|
|
20
|
+
.coverage.*
|
|
21
|
+
htmlcov/
|
|
22
|
+
coverage.xml
|
|
23
|
+
|
|
24
|
+
# OS / editor
|
|
25
|
+
.DS_Store
|
|
26
|
+
.idea/
|
|
27
|
+
.vscode/
|
|
28
|
+
*.swp
|
|
29
|
+
|
|
30
|
+
# Local env (never commit secrets — none expected in a library, but be explicit)
|
|
31
|
+
.env
|
|
32
|
+
.env.*
|
|
33
|
+
!.env.example
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres
|
|
5
|
+
to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [0.1.0] — 2026-06-06
|
|
8
|
+
|
|
9
|
+
The first release. A complete, deterministic cleaning pipeline with a full audit
|
|
10
|
+
trail.
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- `clean(source, *, config=None)` — the single public entry point. Accepts a
|
|
15
|
+
path, raw `bytes`, or a binary/text file-like object.
|
|
16
|
+
- **Readers**: `.xlsx` (openpyxl), `.csv` and `.tsv` with delimiter and encoding
|
|
17
|
+
(UTF-8 / cp1252 / Latin-1) sniffing.
|
|
18
|
+
- **F1** table-start detection (skips titles, banners, metadata, blank rows).
|
|
19
|
+
- **F2** header detection + normalisation: multi-row merged headers, duplicate
|
|
20
|
+
(`valor`, `valor_2`), empty (`column_3`) and leading-digit (`col_2024`) names,
|
|
21
|
+
slugified to `snake_case`.
|
|
22
|
+
- **F3** merged cells: `fill` (propagate) or `first-only`.
|
|
23
|
+
- **F4** Excel serial dates with both 1900 and 1904 epochs.
|
|
24
|
+
- **F5** localised numbers (`1.234,56` vs `1,234.56`) inferred per column.
|
|
25
|
+
- **F6** per-column type inference: `int`, `float`, `date`, `datetime`, `bool`,
|
|
26
|
+
`str`, with mixed-column handling.
|
|
27
|
+
- **F7** disguised nulls (`-`, `N/A`, `#REF!`, `#DIV/0!`, …) → `None`, extensible.
|
|
28
|
+
- **F8** trailing-junk trimming (totals, signatures, footnotes).
|
|
29
|
+
- **F9** `CleanReport` — JSON-serialisable, every fix recorded with location and
|
|
30
|
+
confidence; per-cell fixes aggregated per column with counts and samples.
|
|
31
|
+
- `CleanResult.to_pandas()` via the optional `messy-table[pandas]` extra.
|
|
32
|
+
- `Config` for locale, header, sheet, merge mode, extra null tokens, strict mode,
|
|
33
|
+
and the file-safety limits.
|
|
34
|
+
- `strict=True` raises `AmbiguityError` (always with a `Config` suggestion) where
|
|
35
|
+
permissive mode would warn.
|
|
36
|
+
|
|
37
|
+
### Security
|
|
38
|
+
|
|
39
|
+
- Decompression-bomb defence for `.xlsx`: absolute uncompressed-size and
|
|
40
|
+
compression-ratio limits checked before openpyxl opens the archive.
|
|
41
|
+
- Hard cell ceiling (`Config.max_cells`) and text-size limit bound memory.
|
|
42
|
+
|
|
43
|
+
[0.1.0]: https://github.com/messy-table/messy-table/releases/tag/v0.1.0
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 messy-table contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: messy-table
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Turn messy real-world spreadsheets into clean, typed data — with an auditable report of every fix.
|
|
5
|
+
Project-URL: Homepage, https://github.com/messy-table/messy-table
|
|
6
|
+
Project-URL: Repository, https://github.com/messy-table/messy-table
|
|
7
|
+
Project-URL: Changelog, https://github.com/messy-table/messy-table/blob/main/CHANGELOG.md
|
|
8
|
+
Project-URL: Issues, https://github.com/messy-table/messy-table/issues
|
|
9
|
+
Author: messy-table contributors
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: csv,data-cleaning,data-ingestion,etl,excel,pandas,spreadsheet,xlsx
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: openpyxl>=3.1.3
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: mypy>=1.11; extra == 'dev'
|
|
28
|
+
Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pandas>=2.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
33
|
+
Provides-Extra: pandas
|
|
34
|
+
Requires-Dist: pandas>=2.0; extra == 'pandas'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# messy-table
|
|
38
|
+
|
|
39
|
+
> `pandas.read_excel` assumes your spreadsheet is well-behaved. **messy-table assumes it is not.**
|
|
40
|
+
|
|
41
|
+
Turn messy real-world spreadsheets — Excel/CSV exported from ERPs, legacy systems,
|
|
42
|
+
hand-made reports — into clean, typed data, and get back an **auditable report of
|
|
43
|
+
every fix that was applied**.
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from messy_table import clean
|
|
47
|
+
|
|
48
|
+
result = clean("relatorio_vendas.xlsx")
|
|
49
|
+
|
|
50
|
+
result.data # list[dict] — clean, typed rows
|
|
51
|
+
result.columns # per-column name / dtype / null summary
|
|
52
|
+
result.report # every transformation that was applied
|
|
53
|
+
result.warnings # low-confidence decisions
|
|
54
|
+
result.to_pandas() # DataFrame (optional extra)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Why
|
|
58
|
+
|
|
59
|
+
AI agents and data pipelines receive arbitrary spreadsheets from users and today
|
|
60
|
+
re-implement, by hand and in fragile ways, the same heuristics: find where the table
|
|
61
|
+
starts, fix the headers, convert Excel serial dates, interpret `1.234,56`.
|
|
62
|
+
messy-table is the canonical answer to that step — deterministic, dependency-light,
|
|
63
|
+
and fully auditable.
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install messy-table # core — depends only on openpyxl
|
|
69
|
+
pip install 'messy-table[pandas]' # adds result.to_pandas()
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Python ≥ 3.10. Ships `py.typed` — fully type-checked.
|
|
73
|
+
|
|
74
|
+
## Before & after
|
|
75
|
+
|
|
76
|
+
A typical ERP export — a title banner, a blank row, pt-BR numbers, a disguised null (`-`), and a
|
|
77
|
+
trailing totals row:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Relatório de Vendas 2024
|
|
81
|
+
(gerado em 01/02/2024)
|
|
82
|
+
|
|
83
|
+
Produto | Valor (R$) | Qtd | Ativo
|
|
84
|
+
Café | 1.234,56 | 10 | sim
|
|
85
|
+
Chá | 2.000,00 | 5 | nao
|
|
86
|
+
Açúcar | - | 3 | sim
|
|
87
|
+
TOTAL | 3.234,56 | 18 |
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
>>> result = clean("vendas.xlsx")
|
|
92
|
+
>>> result.data
|
|
93
|
+
[{'produto': 'Café', 'valor_r': 1234.56, 'qtd': 10, 'ativo': True},
|
|
94
|
+
{'produto': 'Chá', 'valor_r': 2000.0, 'qtd': 5, 'ativo': False},
|
|
95
|
+
{'produto': 'Açúcar', 'valor_r': None, 'qtd': 3, 'ativo': True}]
|
|
96
|
+
>>> [(c.name, c.dtype) for c in result.columns]
|
|
97
|
+
[('produto', 'str'), ('valor_r', 'float'), ('qtd', 'int'), ('ativo', 'bool')]
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
The title, blank row and `TOTAL` line are gone; numbers are parsed in the column's
|
|
101
|
+
inferred locale; `-` is a null; `sim/nao` became booleans — and the report says so.
|
|
102
|
+
|
|
103
|
+
## Features (v0.1)
|
|
104
|
+
|
|
105
|
+
| Feature | What it does |
|
|
106
|
+
|---|---|
|
|
107
|
+
| **Table-start detection** | Skips titles, logos, stray cells and metadata before the real header. |
|
|
108
|
+
| **Header detection & normalisation** | Finds the header (or its absence); resolves duplicate/empty/multi-row headers; slugifies to `snake_case`. |
|
|
109
|
+
| **Merged cells** | Propagates a merged value across its range (`fill`) or keeps it top-left only (`first-only`). |
|
|
110
|
+
| **Excel serial dates** | Converts `45123` → a real date when the column has a date profile (both 1900 and 1904 epochs). |
|
|
111
|
+
| **Localised numbers** | `1.234,56` (pt-BR/EU) vs `1,234.56` (en-US), inferred per **column**, never per cell. |
|
|
112
|
+
| **Column type inference** | `int`/`float`/`date`/`datetime`/`bool`/`str`, with mixed-column handling. |
|
|
113
|
+
| **Disguised nulls** | `-`, `N/A`, `n/d`, `#REF!`, `#DIV/0!`, blanks → `None` (extensible). |
|
|
114
|
+
| **Trailing junk** | Removes totals, signatures and footnotes after the data ends. |
|
|
115
|
+
| **Cleaning report** | Structured, JSON-serialisable record of every correction with location and confidence. |
|
|
116
|
+
| **Input formats** | `.xlsx`, `.csv` (delimiter + encoding sniffing), `.tsv`. |
|
|
117
|
+
|
|
118
|
+
## The report
|
|
119
|
+
|
|
120
|
+
Nothing changes without a record. Per-cell fixes are aggregated per column with a
|
|
121
|
+
count and sample locations, so the report stays small even on huge files:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
>>> print(result.report.to_json())
|
|
125
|
+
{
|
|
126
|
+
"summary": {"table_start_detected": 1, "table_end_trimmed": 1,
|
|
127
|
+
"header_renamed": 1, "null_normalized": 1,
|
|
128
|
+
"number_parsed": 4, "type_coerced": 2},
|
|
129
|
+
"actions": [
|
|
130
|
+
{"kind": "table_start_detected", "rule": "density", "confidence": 0.8,
|
|
131
|
+
"detail": "skipped 3 leading row(s) (title/metadata/blank); table starts at row 3"},
|
|
132
|
+
{"kind": "number_parsed", "rule": "locale:pt_BR", "column": "valor_r",
|
|
133
|
+
"count": 4, "confidence": 1.0,
|
|
134
|
+
"examples": [{"row": 0, "original": "1.234,56", "final": 1234.56}]},
|
|
135
|
+
...
|
|
136
|
+
]
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Configuration
|
|
141
|
+
|
|
142
|
+
The 80% case needs no config. For the rest:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from messy_table import clean, Config
|
|
146
|
+
|
|
147
|
+
result = clean(
|
|
148
|
+
"dados.csv",
|
|
149
|
+
config=Config(
|
|
150
|
+
locale="pt_BR", # force number/date interpretation; default "auto"
|
|
151
|
+
header="auto", # "auto" | int (row index) | None (no header)
|
|
152
|
+
sheet=0, # index or name of the worksheet
|
|
153
|
+
merged_cells="fill", # "fill" | "first-only"
|
|
154
|
+
null_values_extra=["s/i"], # add to the built-in null list
|
|
155
|
+
strict=False, # True: raise AmbiguityError instead of warning
|
|
156
|
+
),
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
In `strict=True`, a low-confidence decision raises `AmbiguityError` — always with a
|
|
161
|
+
copy-pasteable `Config` suggestion to resolve it.
|
|
162
|
+
|
|
163
|
+
## Security
|
|
164
|
+
|
|
165
|
+
messy-table parses untrusted files, so it defends against hostile input: `.xlsx` archives are
|
|
166
|
+
checked for decompression-bomb shape (absolute size and ratio) **before** they are
|
|
167
|
+
opened, and a hard cell ceiling bounds memory. See [ARCHITECTURE.md](ARCHITECTURE.md).
|
|
168
|
+
|
|
169
|
+
## Docs
|
|
170
|
+
|
|
171
|
+
- [ARCHITECTURE.md](ARCHITECTURE.md) — pipeline, stack rationale, security decisions.
|
|
172
|
+
- [docs/heuristics.md](docs/heuristics.md) — every heuristic and its thresholds.
|
|
173
|
+
|
|
174
|
+
## License
|
|
175
|
+
|
|
176
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# messy-table
|
|
2
|
+
|
|
3
|
+
> `pandas.read_excel` assumes your spreadsheet is well-behaved. **messy-table assumes it is not.**
|
|
4
|
+
|
|
5
|
+
Turn messy real-world spreadsheets — Excel/CSV exported from ERPs, legacy systems,
|
|
6
|
+
hand-made reports — into clean, typed data, and get back an **auditable report of
|
|
7
|
+
every fix that was applied**.
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from messy_table import clean
|
|
11
|
+
|
|
12
|
+
result = clean("relatorio_vendas.xlsx")
|
|
13
|
+
|
|
14
|
+
result.data # list[dict] — clean, typed rows
|
|
15
|
+
result.columns # per-column name / dtype / null summary
|
|
16
|
+
result.report # every transformation that was applied
|
|
17
|
+
result.warnings # low-confidence decisions
|
|
18
|
+
result.to_pandas() # DataFrame (optional extra)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Why
|
|
22
|
+
|
|
23
|
+
AI agents and data pipelines receive arbitrary spreadsheets from users and today
|
|
24
|
+
re-implement, by hand and in fragile ways, the same heuristics: find where the table
|
|
25
|
+
starts, fix the headers, convert Excel serial dates, interpret `1.234,56`.
|
|
26
|
+
messy-table is the canonical answer to that step — deterministic, dependency-light,
|
|
27
|
+
and fully auditable.
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install messy-table # core — depends only on openpyxl
|
|
33
|
+
pip install 'messy-table[pandas]' # adds result.to_pandas()
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Python ≥ 3.10. Ships `py.typed` — fully type-checked.
|
|
37
|
+
|
|
38
|
+
## Before & after
|
|
39
|
+
|
|
40
|
+
A typical ERP export — a title banner, a blank row, pt-BR numbers, a disguised null (`-`), and a
|
|
41
|
+
trailing totals row:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
Relatório de Vendas 2024
|
|
45
|
+
(gerado em 01/02/2024)
|
|
46
|
+
|
|
47
|
+
Produto | Valor (R$) | Qtd | Ativo
|
|
48
|
+
Café | 1.234,56 | 10 | sim
|
|
49
|
+
Chá | 2.000,00 | 5 | nao
|
|
50
|
+
Açúcar | - | 3 | sim
|
|
51
|
+
TOTAL | 3.234,56 | 18 |
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
>>> result = clean("vendas.xlsx")
|
|
56
|
+
>>> result.data
|
|
57
|
+
[{'produto': 'Café', 'valor_r': 1234.56, 'qtd': 10, 'ativo': True},
|
|
58
|
+
{'produto': 'Chá', 'valor_r': 2000.0, 'qtd': 5, 'ativo': False},
|
|
59
|
+
{'produto': 'Açúcar', 'valor_r': None, 'qtd': 3, 'ativo': True}]
|
|
60
|
+
>>> [(c.name, c.dtype) for c in result.columns]
|
|
61
|
+
[('produto', 'str'), ('valor_r', 'float'), ('qtd', 'int'), ('ativo', 'bool')]
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
The title, blank row and `TOTAL` line are gone; numbers are parsed in the column's
|
|
65
|
+
inferred locale; `-` is a null; `sim/nao` became booleans — and the report says so.
|
|
66
|
+
|
|
67
|
+
## Features (v0.1)
|
|
68
|
+
|
|
69
|
+
| Feature | What it does |
|
|
70
|
+
|---|---|
|
|
71
|
+
| **Table-start detection** | Skips titles, logos, stray cells and metadata before the real header. |
|
|
72
|
+
| **Header detection & normalisation** | Finds the header (or its absence); resolves duplicate/empty/multi-row headers; slugifies to `snake_case`. |
|
|
73
|
+
| **Merged cells** | Propagates a merged value across its range (`fill`) or keeps it top-left only (`first-only`). |
|
|
74
|
+
| **Excel serial dates** | Converts `45123` → a real date when the column has a date profile (both 1900 and 1904 epochs). |
|
|
75
|
+
| **Localised numbers** | `1.234,56` (pt-BR/EU) vs `1,234.56` (en-US), inferred per **column**, never per cell. |
|
|
76
|
+
| **Column type inference** | `int`/`float`/`date`/`datetime`/`bool`/`str`, with mixed-column handling. |
|
|
77
|
+
| **Disguised nulls** | `-`, `N/A`, `n/d`, `#REF!`, `#DIV/0!`, blanks → `None` (extensible). |
|
|
78
|
+
| **Trailing junk** | Removes totals, signatures and footnotes after the data ends. |
|
|
79
|
+
| **Cleaning report** | Structured, JSON-serialisable record of every correction with location and confidence. |
|
|
80
|
+
| **Input formats** | `.xlsx`, `.csv` (delimiter + encoding sniffing), `.tsv`. |
|
|
81
|
+
|
|
82
|
+
## The report
|
|
83
|
+
|
|
84
|
+
Nothing changes without a record. Per-cell fixes are aggregated per column with a
|
|
85
|
+
count and sample locations, so the report stays small even on huge files:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
>>> print(result.report.to_json())
|
|
89
|
+
{
|
|
90
|
+
"summary": {"table_start_detected": 1, "table_end_trimmed": 1,
|
|
91
|
+
"header_renamed": 1, "null_normalized": 1,
|
|
92
|
+
"number_parsed": 4, "type_coerced": 2},
|
|
93
|
+
"actions": [
|
|
94
|
+
{"kind": "table_start_detected", "rule": "density", "confidence": 0.8,
|
|
95
|
+
"detail": "skipped 3 leading row(s) (title/metadata/blank); table starts at row 3"},
|
|
96
|
+
{"kind": "number_parsed", "rule": "locale:pt_BR", "column": "valor_r",
|
|
97
|
+
"count": 4, "confidence": 1.0,
|
|
98
|
+
"examples": [{"row": 0, "original": "1.234,56", "final": 1234.56}]},
|
|
99
|
+
...
|
|
100
|
+
]
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Configuration
|
|
105
|
+
|
|
106
|
+
The 80% case needs no config. For the rest:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from messy_table import clean, Config
|
|
110
|
+
|
|
111
|
+
result = clean(
|
|
112
|
+
"dados.csv",
|
|
113
|
+
config=Config(
|
|
114
|
+
locale="pt_BR", # force number/date interpretation; default "auto"
|
|
115
|
+
header="auto", # "auto" | int (row index) | None (no header)
|
|
116
|
+
sheet=0, # index or name of the worksheet
|
|
117
|
+
merged_cells="fill", # "fill" | "first-only"
|
|
118
|
+
null_values_extra=["s/i"], # add to the built-in null list
|
|
119
|
+
strict=False, # True: raise AmbiguityError instead of warning
|
|
120
|
+
),
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
In `strict=True`, a low-confidence decision raises `AmbiguityError` — always with a
|
|
125
|
+
copy-pasteable `Config` suggestion to resolve it.
|
|
126
|
+
|
|
127
|
+
## Security
|
|
128
|
+
|
|
129
|
+
messy-table parses untrusted files, so it defends against hostile input: `.xlsx` archives are
|
|
130
|
+
checked for decompression-bomb shape (absolute size and ratio) **before** they are
|
|
131
|
+
opened, and a hard cell ceiling bounds memory. See [ARCHITECTURE.md](ARCHITECTURE.md).
|
|
132
|
+
|
|
133
|
+
## Docs
|
|
134
|
+
|
|
135
|
+
- [ARCHITECTURE.md](ARCHITECTURE.md) — pipeline, stack rationale, security decisions.
|
|
136
|
+
- [docs/heuristics.md](docs/heuristics.md) — every heuristic and its thresholds.
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Heuristics
|
|
2
|
+
|
|
3
|
+
Every heuristic in messy-table, what signal it uses, and the thresholds it trips
|
|
4
|
+
on. Thresholds live as named constants in code; this file is the prose companion.
|
|
5
|
+
When you change a threshold, update both.
|
|
6
|
+
|
|
7
|
+
A shared idea runs through all of them: produce a **confidence** in `[0, 1]`.
|
|
8
|
+
Below `Config.confidence_threshold` (default **0.6**) the decision is *ambiguous*
|
|
9
|
+
— a warning in permissive mode, an `AmbiguityError` (with a `Config` suggestion)
|
|
10
|
+
in strict mode.
|
|
11
|
+
|
|
12
|
+
## Density, merge-aware (shared primitive)
|
|
13
|
+
|
|
14
|
+
`util.merge_covered_cells` + `util.density_threshold`.
|
|
15
|
+
|
|
16
|
+
A row's *filled count* = non-blank cells **plus** cells covered by a **row-spanning**
|
|
17
|
+
merge (vertical/block). Horizontal one-row merges (banners) are excluded so titles
|
|
18
|
+
stay sparse.
|
|
19
|
+
|
|
20
|
+
`density_threshold(width, ratio=0.5)`:
|
|
21
|
+
- width 1 → **1** (single-column tables are real).
|
|
22
|
+
- width ≥ 2 → `max(2, ceil(0.5 · width))` (a lone stray cell never counts as data).
|
|
23
|
+
|
|
24
|
+
## F1 — table start (`detectors/table_start.py`)
|
|
25
|
+
|
|
26
|
+
- Compute the merge-aware filled count per row; `width` = the max across rows.
|
|
27
|
+
- A row is *substantial* if its filled count ≥ `density_threshold(width)`.
|
|
28
|
+
- The table starts at the first row of the **longest contiguous run** of
|
|
29
|
+
substantial rows. (A blank row between a metadata block and the table breaks the
|
|
30
|
+
run, so the longer table run wins.)
|
|
31
|
+
- Confidence = `clamp(body_density − above_density + 0.5)`. Skipping rows with a
|
|
32
|
+
sparse preamble is high-confidence; skipping into a still-dense region is not.
|
|
33
|
+
- `Config(header=<int>)` pins the start and bypasses detection.
|
|
34
|
+
|
|
35
|
+
**Known limit:** a metadata block *immediately* above the table with no blank
|
|
36
|
+
separator and the same column count can be absorbed. A blank separator (the common
|
|
37
|
+
real case) resolves it; otherwise pin `header`.
|
|
38
|
+
|
|
39
|
+
## F2 — header (`detectors/header.py` + `transformers/header_names.py`)
|
|
40
|
+
|
|
41
|
+
Detection (grid already body-sliced and unmerged, so the header is at row 0):
|
|
42
|
+
|
|
43
|
+
- **Headerless?** If row 0 is not text-heavy (`text_ratio < 0.5`), has no
|
|
44
|
+
horizontal merge, and shares the exact per-column category signature of row 1,
|
|
45
|
+
there is no header → synthesise `column_1…` and start data at row 0 (warned).
|
|
46
|
+
- **How many header rows?** Start at 1. While the current top header row intersects
|
|
47
|
+
a **horizontal merge** (a spanned group cell), consume the next row too — up to
|
|
48
|
+
`MAX_HEADER_ROWS` (3). This ties multi-row detection to real structure rather
|
|
49
|
+
than a fragile text test.
|
|
50
|
+
- Confidence: 0.9 if row 0 is text-heavy, else 0.55.
|
|
51
|
+
|
|
52
|
+
Naming (`header_names.py`): NFKD accent-strip → lowercase → non-word → `_` →
|
|
53
|
+
collapse/trim. Empty → `column_{i}`; leading digit → `col_…`; duplicates get
|
|
54
|
+
`_2`, `_3`. Every rename is recorded.
|
|
55
|
+
|
|
56
|
+
## F3 — merged cells (`transformers/merged_cells.py`)
|
|
57
|
+
|
|
58
|
+
`fill` (default): copy each merge's anchor value to every other cell in its range.
|
|
59
|
+
`first-only`: leave the non-anchor cells `None` (openpyxl's default) — a no-op. In `fill` mode each filled cell is
|
|
60
|
+
recorded. Skipped on very large (streaming) sheets, with a warning.
|
|
61
|
+
|
|
62
|
+
## F4 — Excel serial dates (`transformers/dates.py`)
|
|
63
|
+
|
|
64
|
+
Date-formatted xlsx cells already arrive as `datetime` (openpyxl applies the
|
|
65
|
+
epoch). F4 targets the *messy* case: a column of **bare numbers** that are really
|
|
66
|
+
dates. Converted only when **both** hold:
|
|
67
|
+
|
|
68
|
+
1. every value sits in the serial range **20000–60000** (≈ 1954–2064), **and**
|
|
69
|
+
2. there is corroboration — the cell's number format looks like a date
|
|
70
|
+
(`fmt_is_date`, confidence **0.9**) **or** the column name hints at a date
|
|
71
|
+
(`data`, `vencimento`, `date`, … → confidence **0.7**).
|
|
72
|
+
|
|
73
|
+
Bare numbers with no hint are left numeric (so an `ano`/`year` column survives).
|
|
74
|
+
Conversion uses the workbook epoch (1899-12-30 base absorbs the 1900 leap bug;
|
|
75
|
+
1904-01-01 for Mac). Integer serial → `date`; fractional → `datetime`.
|
|
76
|
+
|
|
77
|
+
## F5 — localised numbers (`transformers/numbers.py`)
|
|
78
|
+
|
|
79
|
+
Only columns where ≥ **70%** (`NUMERIC_COLUMN_RATIO`) of non-blank string cells look
|
|
80
|
+
numeric are treated as numeric. Per-value *decimal vote*:
|
|
81
|
+
|
|
82
|
+
- both `.` and `,` present → the **last** one is the decimal point;
|
|
83
|
+
- one separator present → it is *thousands grouping* only if it splits into clean
|
|
84
|
+
3-digit runs (`1.234`, `12.345.678`), otherwise it is the decimal point.
|
|
85
|
+
|
|
86
|
+
The column's majority vote picks the convention; confidence = winner / total votes
|
|
87
|
+
(so a 50/50 split → 0.5, below threshold → ambiguous). `Config(locale=…)` overrides
|
|
88
|
+
the vote (confidence 1.0). Currency symbols, spaces, NBSP, apostrophes and a
|
|
89
|
+
trailing `%` (divide by 100) are handled.
|
|
90
|
+
|
|
91
|
+
## F6 — column types (`transformers/types.py`)
|
|
92
|
+
|
|
93
|
+
Per column, over surviving non-null values:
|
|
94
|
+
|
|
95
|
+
- all `date`/`datetime` → `date`, or `datetime` if any carry a time (bare dates are
|
|
96
|
+
promoted to datetime for uniformity);
|
|
97
|
+
- all `int` → `int`; any fractional → `float`;
|
|
98
|
+
- all boolean (native or text tokens `true/false/sim/não/yes/no/…`) → `bool`;
|
|
99
|
+
- otherwise → `str` (lossless fallback, with a "mixes types" warning when the column
|
|
100
|
+
genuinely mixed numbers and text).
|
|
101
|
+
|
|
102
|
+
## F7 — disguised nulls (`transformers/nulls.py`)
|
|
103
|
+
|
|
104
|
+
Case-insensitive, trimmed match against the built-in token set (`-`, `--`, `n/a`,
|
|
105
|
+
`n/d`, `null`, `none`, `nil`, `nan`, and the Excel error literals `#REF!`,
|
|
106
|
+
`#DIV/0!`, `#VALUE!`, …) plus `Config.null_values_extra`. Matches become `None`.
|
|
107
|
+
|
|
108
|
+
## F8 — trailing junk (`detectors/table_end.py`)
|
|
109
|
+
|
|
110
|
+
Walk up from the bottom; trim a row while it is **sparse** (below the body density
|
|
111
|
+
threshold), **empty**, or starts with a **summary keyword** (`total`, `subtotal`,
|
|
112
|
+
`soma`, `fonte`, `gerado em`, `assinatura`, …). Stop at the first real data row.
|
|
113
|
+
Confidence 0.85 when a keyword matched, else 0.7.
|
|
114
|
+
|
|
115
|
+
## CSV delimiter & encoding (`readers/csv.py`)
|
|
116
|
+
|
|
117
|
+
- **Encoding:** decode cascade `utf-8-sig` → `cp1252` → `latin-1` (the last never
|
|
118
|
+
fails). The first that decodes wins.
|
|
119
|
+
- **Delimiter:** structural, not `csv.Sniffer` (which preamble rows fool). For each
|
|
120
|
+
candidate (`, ; \t |`) score by how many lines share a modal field count ≥ 2,
|
|
121
|
+
requiring at least half the lines to split. Best agreement wins; ties broken by
|
|
122
|
+
more fields. If nothing qualifies → **single column** (a lone decimal comma is not
|
|
123
|
+
a delimiter). `.tsv` forces tab.
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.25"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "messy-table"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Turn messy real-world spreadsheets into clean, typed data — with an auditable report of every fix."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "messy-table contributors" }]
|
|
13
|
+
keywords = ["excel", "xlsx", "csv", "data-cleaning", "spreadsheet", "etl", "pandas", "data-ingestion"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Office/Business :: Financial :: Spreadsheet",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
"Typing :: Typed",
|
|
26
|
+
]
|
|
27
|
+
# openpyxl is the ONLY mandatory dependency. >=3.1.3 pulls in the read-only
|
|
28
|
+
# date handling fixes and the et-xmlfile pin that closes the XML entity issues.
|
|
29
|
+
dependencies = ["openpyxl>=3.1.3"]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/messy-table/messy-table"
|
|
33
|
+
Repository = "https://github.com/messy-table/messy-table"
|
|
34
|
+
Changelog = "https://github.com/messy-table/messy-table/blob/main/CHANGELOG.md"
|
|
35
|
+
Issues = "https://github.com/messy-table/messy-table/issues"
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
# `pip install messy-table[pandas]` enables CleanResult.to_pandas().
|
|
39
|
+
pandas = ["pandas>=2.0"]
|
|
40
|
+
dev = [
|
|
41
|
+
"pytest>=8.0",
|
|
42
|
+
"pytest-cov>=5.0",
|
|
43
|
+
"ruff>=0.6",
|
|
44
|
+
"mypy>=1.11",
|
|
45
|
+
"pandas>=2.0",
|
|
46
|
+
"pandas-stubs>=2.0",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = ["src/messy_table"]
|
|
51
|
+
|
|
52
|
+
[tool.hatch.build.targets.sdist]
|
|
53
|
+
include = ["src/messy_table", "tests", "docs", "README.md", "CHANGELOG.md", "LICENSE"]
|
|
54
|
+
|
|
55
|
+
# ----------------------------------------------------------------------------
|
|
56
|
+
# Ruff — lint + format. Opinionated, strict, zero tolerance for the basics.
|
|
57
|
+
# ----------------------------------------------------------------------------
|
|
58
|
+
[tool.ruff]
|
|
59
|
+
line-length = 100
|
|
60
|
+
target-version = "py310"
|
|
61
|
+
src = ["src", "tests"]
|
|
62
|
+
|
|
63
|
+
[tool.ruff.lint]
|
|
64
|
+
select = [
|
|
65
|
+
"E", "W", # pycodestyle
|
|
66
|
+
"F", # pyflakes
|
|
67
|
+
"I", # isort
|
|
68
|
+
"N", # pep8-naming
|
|
69
|
+
"UP", # pyupgrade
|
|
70
|
+
"B", # flake8-bugbear
|
|
71
|
+
"C4", # comprehensions
|
|
72
|
+
"SIM", # simplify
|
|
73
|
+
"RUF", # ruff-specific
|
|
74
|
+
"PTH", # use pathlib
|
|
75
|
+
"TID", # tidy imports
|
|
76
|
+
"BLE", # blind-except — no bare/broad except in the core
|
|
77
|
+
"S", # bandit security checks
|
|
78
|
+
]
|
|
79
|
+
ignore = [
|
|
80
|
+
"S101", # asserts are fine in tests
|
|
81
|
+
]
|
|
82
|
+
|
|
83
|
+
[tool.ruff.lint.per-file-ignores]
|
|
84
|
+
"tests/**" = ["S", "N802", "N806"]
|
|
85
|
+
"tests/_fixtures_gen.py" = ["S", "N802", "N806"]
|
|
86
|
+
|
|
87
|
+
[tool.ruff.lint.isort]
|
|
88
|
+
known-first-party = ["messy_table"]
|
|
89
|
+
|
|
90
|
+
# ----------------------------------------------------------------------------
|
|
91
|
+
# mypy — strict. The library ships py.typed; the public surface is fully typed.
|
|
92
|
+
# ----------------------------------------------------------------------------
|
|
93
|
+
[tool.mypy]
|
|
94
|
+
python_version = "3.10"
|
|
95
|
+
strict = true
|
|
96
|
+
warn_unreachable = true
|
|
97
|
+
warn_redundant_casts = true
|
|
98
|
+
warn_unused_ignores = true
|
|
99
|
+
disallow_untyped_defs = true
|
|
100
|
+
disallow_any_generics = true
|
|
101
|
+
no_implicit_reexport = true
|
|
102
|
+
show_error_codes = true
|
|
103
|
+
files = ["src/messy_table"]
|
|
104
|
+
|
|
105
|
+
[[tool.mypy.overrides]]
|
|
106
|
+
# openpyxl ships no type stubs upstream; we isolate the untyped boundary in
|
|
107
|
+
# the readers and treat it as the only place `Any` may legitimately enter.
|
|
108
|
+
module = ["openpyxl.*"]
|
|
109
|
+
ignore_missing_imports = true
|
|
110
|
+
|
|
111
|
+
# ----------------------------------------------------------------------------
|
|
112
|
+
# pytest + coverage
|
|
113
|
+
# ----------------------------------------------------------------------------
|
|
114
|
+
[tool.pytest.ini_options]
|
|
115
|
+
minversion = "8.0"
|
|
116
|
+
testpaths = ["tests"]
|
|
117
|
+
addopts = "-q --strict-markers --strict-config"
|
|
118
|
+
markers = [
|
|
119
|
+
"perf: performance-gate tests (may be slow); run with -m perf",
|
|
120
|
+
]
|
|
121
|
+
|
|
122
|
+
[tool.coverage.run]
|
|
123
|
+
branch = true
|
|
124
|
+
source = ["messy_table"]
|
|
125
|
+
omit = ["*/tests/*"]
|
|
126
|
+
|
|
127
|
+
[tool.coverage.report]
|
|
128
|
+
# Acceptance criterion: 90% minimum on the core. Hardened, not loosened.
|
|
129
|
+
fail_under = 90
|
|
130
|
+
show_missing = true
|
|
131
|
+
exclude_lines = [
|
|
132
|
+
"pragma: no cover",
|
|
133
|
+
"if TYPE_CHECKING:",
|
|
134
|
+
"raise NotImplementedError",
|
|
135
|
+
"\\.\\.\\.",
|
|
136
|
+
]
|