messy-table 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. messy_table-0.1.0/.gitignore +33 -0
  2. messy_table-0.1.0/CHANGELOG.md +43 -0
  3. messy_table-0.1.0/LICENSE +21 -0
  4. messy_table-0.1.0/PKG-INFO +176 -0
  5. messy_table-0.1.0/README.md +140 -0
  6. messy_table-0.1.0/docs/heuristics.md +123 -0
  7. messy_table-0.1.0/pyproject.toml +136 -0
  8. messy_table-0.1.0/src/messy_table/__init__.py +52 -0
  9. messy_table-0.1.0/src/messy_table/api.py +96 -0
  10. messy_table-0.1.0/src/messy_table/config.py +75 -0
  11. messy_table-0.1.0/src/messy_table/context.py +63 -0
  12. messy_table-0.1.0/src/messy_table/detectors/__init__.py +10 -0
  13. messy_table-0.1.0/src/messy_table/detectors/header.py +115 -0
  14. messy_table-0.1.0/src/messy_table/detectors/table_end.py +104 -0
  15. messy_table-0.1.0/src/messy_table/detectors/table_start.py +81 -0
  16. messy_table-0.1.0/src/messy_table/exceptions.py +40 -0
  17. messy_table-0.1.0/src/messy_table/grid.py +133 -0
  18. messy_table-0.1.0/src/messy_table/py.typed +0 -0
  19. messy_table-0.1.0/src/messy_table/readers/__init__.py +84 -0
  20. messy_table-0.1.0/src/messy_table/readers/csv.py +109 -0
  21. messy_table-0.1.0/src/messy_table/readers/xlsx.py +243 -0
  22. messy_table-0.1.0/src/messy_table/report.py +212 -0
  23. messy_table-0.1.0/src/messy_table/result.py +68 -0
  24. messy_table-0.1.0/src/messy_table/transformers/__init__.py +23 -0
  25. messy_table-0.1.0/src/messy_table/transformers/dates.py +114 -0
  26. messy_table-0.1.0/src/messy_table/transformers/header_names.py +74 -0
  27. messy_table-0.1.0/src/messy_table/transformers/merged_cells.py +41 -0
  28. messy_table-0.1.0/src/messy_table/transformers/nulls.py +66 -0
  29. messy_table-0.1.0/src/messy_table/transformers/numbers.py +160 -0
  30. messy_table-0.1.0/src/messy_table/transformers/types.py +163 -0
  31. messy_table-0.1.0/src/messy_table/util.py +143 -0
  32. messy_table-0.1.0/tests/_fixtures.py +335 -0
  33. messy_table-0.1.0/tests/conftest.py +15 -0
  34. messy_table-0.1.0/tests/test_config.py +32 -0
  35. messy_table-0.1.0/tests/test_detectors.py +71 -0
  36. messy_table-0.1.0/tests/test_end_to_end.py +26 -0
  37. messy_table-0.1.0/tests/test_perf.py +83 -0
  38. messy_table-0.1.0/tests/test_readers.py +129 -0
  39. messy_table-0.1.0/tests/test_report.py +48 -0
  40. messy_table-0.1.0/tests/test_result.py +52 -0
  41. messy_table-0.1.0/tests/test_security.py +98 -0
  42. messy_table-0.1.0/tests/test_transformers.py +106 -0
  43. messy_table-0.1.0/tests/test_units.py +87 -0
@@ -0,0 +1,33 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ *.so
9
+
10
+ # Virtualenvs
11
+ .venv/
12
+ venv/
13
+ env/
14
+
15
+ # Tooling caches
16
+ .pytest_cache/
17
+ .mypy_cache/
18
+ .ruff_cache/
19
+ .coverage
20
+ .coverage.*
21
+ htmlcov/
22
+ coverage.xml
23
+
24
+ # OS / editor
25
+ .DS_Store
26
+ .idea/
27
+ .vscode/
28
+ *.swp
29
+
30
+ # Local env (never commit secrets — none expected in a library, but be explicit)
31
+ .env
32
+ .env.*
33
+ !.env.example
@@ -0,0 +1,43 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format follows
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres
5
+ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [0.1.0] — 2026-06-06
8
+
9
+ The first release. A complete, deterministic cleaning pipeline with a full audit
10
+ trail.
11
+
12
+ ### Added
13
+
14
+ - `clean(source, *, config=None)` — the single public entry point. Accepts a
15
+ path, raw `bytes`, or a binary/text file-like object.
16
+ - **Readers**: `.xlsx` (openpyxl), `.csv` and `.tsv` with delimiter and encoding
17
+ (UTF-8 / cp1252 / Latin-1) sniffing.
18
+ - **F1** table-start detection (skips titles, banners, metadata, blank rows).
19
+ - **F2** header detection + normalisation: multi-row merged headers, duplicate
20
+ (`valor`, `valor_2`), empty (`column_3`) and leading-digit (`col_2024`) names,
21
+ slugified to `snake_case`.
22
+ - **F3** merged cells: `fill` (propagate) or `first-only`.
23
+ - **F4** Excel serial dates with both 1900 and 1904 epochs.
24
+ - **F5** localised numbers (`1.234,56` vs `1,234.56`) inferred per column.
25
+ - **F6** per-column type inference: `int`, `float`, `date`, `datetime`, `bool`,
26
+ `str`, with mixed-column handling.
27
+ - **F7** disguised nulls (`-`, `N/A`, `#REF!`, `#DIV/0!`, …) → `None`, extensible.
28
+ - **F8** trailing-junk trimming (totals, signatures, footnotes).
29
+ - **F9** `CleanReport` — JSON-serialisable, every fix recorded with location and
30
+ confidence; per-cell fixes aggregated per column with counts and samples.
31
+ - `CleanResult.to_pandas()` via the optional `messy-table[pandas]` extra.
32
+ - `Config` for locale, header, sheet, merge mode, extra null tokens, strict mode,
33
+ and the file-safety limits.
34
+ - `strict=True` raises `AmbiguityError` (always with a `Config` suggestion) where
35
+ permissive mode would warn.
36
+
37
+ ### Security
38
+
39
+ - Decompression-bomb defence for `.xlsx`: absolute uncompressed-size and
40
+ compression-ratio limits checked before openpyxl opens the archive.
41
+ - Hard cell ceiling (`Config.max_cells`) and text-size limit bound memory.
42
+
43
+ [0.1.0]: https://github.com/messy-table/messy-table/releases/tag/v0.1.0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 messy-table contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,176 @@
1
+ Metadata-Version: 2.4
2
+ Name: messy-table
3
+ Version: 0.1.0
4
+ Summary: Turn messy real-world spreadsheets into clean, typed data — with an auditable report of every fix.
5
+ Project-URL: Homepage, https://github.com/messy-table/messy-table
6
+ Project-URL: Repository, https://github.com/messy-table/messy-table
7
+ Project-URL: Changelog, https://github.com/messy-table/messy-table/blob/main/CHANGELOG.md
8
+ Project-URL: Issues, https://github.com/messy-table/messy-table/issues
9
+ Author: messy-table contributors
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: csv,data-cleaning,data-ingestion,etl,excel,pandas,spreadsheet,xlsx
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: openpyxl>=3.1.3
26
+ Provides-Extra: dev
27
+ Requires-Dist: mypy>=1.11; extra == 'dev'
28
+ Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
29
+ Requires-Dist: pandas>=2.0; extra == 'dev'
30
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
31
+ Requires-Dist: pytest>=8.0; extra == 'dev'
32
+ Requires-Dist: ruff>=0.6; extra == 'dev'
33
+ Provides-Extra: pandas
34
+ Requires-Dist: pandas>=2.0; extra == 'pandas'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # messy-table
38
+
39
+ > `pandas.read_excel` assumes your spreadsheet is well-behaved. **messy-table assumes it is not.**
40
+
41
+ Turn messy real-world spreadsheets — Excel/CSV exported from ERPs, legacy systems,
42
+ hand-made reports — into clean, typed data, and get back an **auditable report of
43
+ every fix that was applied**.
44
+
45
+ ```python
46
+ from messy_table import clean
47
+
48
+ result = clean("relatorio_vendas.xlsx")
49
+
50
+ result.data # list[dict] — clean, typed rows
51
+ result.columns # per-column name / dtype / null summary
52
+ result.report # every transformation that was applied
53
+ result.warnings # low-confidence decisions
54
+ result.to_pandas() # DataFrame (optional extra)
55
+ ```
56
+
57
+ ## Why
58
+
59
+ AI agents and data pipelines receive arbitrary spreadsheets from users and today
60
+ re-implement, by hand and fragilely, the same heuristics: find where the table
61
+ starts, fix the headers, convert Excel serial dates, interpret `1.234,56`.
62
+ messy-table is the canonical answer to that step — deterministic, dependency-light,
63
+ and fully auditable.
64
+
65
+ ## Install
66
+
67
+ ```bash
68
+ pip install messy-table # core — depends only on openpyxl
69
+ pip install 'messy-table[pandas]' # adds result.to_pandas()
70
+ ```
71
+
72
+ Python ≥ 3.10. Ships `py.typed` — fully type-checked.
73
+
74
+ ## Before & after
75
+
76
+ A typical ERP export — a title banner, a blank row, pt-BR numbers, a disguised null (`-`), and a
77
+ trailing totals row:
78
+
79
+ ```
80
+ Relatório de Vendas 2024
81
+ (gerado em 01/02/2024)
82
+
83
+ Produto | Valor (R$) | Qtd | Ativo
84
+ Café | 1.234,56 | 10 | sim
85
+ Chá | 2.000,00 | 5 | nao
86
+ Açúcar | - | 3 | sim
87
+ TOTAL | 3.234,56 | 18 |
88
+ ```
89
+
90
+ ```python
91
+ >>> result = clean("vendas.xlsx")
92
+ >>> result.data
93
+ [{'produto': 'Café', 'valor_r': 1234.56, 'qtd': 10, 'ativo': True},
94
+ {'produto': 'Chá', 'valor_r': 2000.0, 'qtd': 5, 'ativo': False},
95
+ {'produto': 'Açúcar', 'valor_r': None, 'qtd': 3, 'ativo': True}]
96
+ >>> [(c.name, c.dtype) for c in result.columns]
97
+ [('produto', 'str'), ('valor_r', 'float'), ('qtd', 'int'), ('ativo', 'bool')]
98
+ ```
99
+
100
+ The title, blank row and `TOTAL` line are gone; numbers are parsed in the column's
101
+ inferred locale; `-` is a null; `sim/nao` became booleans — and the report says so.
102
+
103
+ ## Features (v0.1)
104
+
105
+ | Feature | What it does |
106
+ |---|---|
107
+ | **Table-start detection** | Skips titles, logos, stray cells and metadata before the real header. |
108
+ | **Header detection & normalisation** | Finds the header (or its absence); resolves duplicate/empty/multi-row headers; slugifies to `snake_case`. |
109
+ | **Merged cells** | Propagates a merged value across its range (`fill`) or keeps it top-left only (`first-only`). |
110
+ | **Excel serial dates** | Converts `45123` → a real date when the column has a date profile (both 1900 and 1904 epochs). |
111
+ | **Localised numbers** | `1.234,56` (pt-BR/EU) vs `1,234.56` (en-US), inferred per **column**, never per cell. |
112
+ | **Column type inference** | `int`/`float`/`date`/`datetime`/`bool`/`str`, with mixed-column handling. |
113
+ | **Disguised nulls** | `-`, `N/A`, `n/d`, `#REF!`, `#DIV/0!`, blanks → `None` (extensible). |
114
+ | **Trailing junk** | Removes totals, signatures and footnotes after the data ends. |
115
+ | **Cleaning report** | Structured, JSON-serialisable record of every correction with location and confidence. |
116
+ | **Input formats** | `.xlsx`, `.csv` (delimiter + encoding sniffing), `.tsv`. |
117
+
118
+ ## The report
119
+
120
+ Nothing changes without a record. Per-cell fixes are aggregated per column with a
121
+ count and sample locations, so the report stays small even on huge files:
122
+
123
+ ```python
124
+ >>> print(result.report.to_json())
125
+ {
126
+ "summary": {"table_start_detected": 1, "table_end_trimmed": 1,
127
+ "header_renamed": 1, "null_normalized": 1,
128
+ "number_parsed": 4, "type_coerced": 2},
129
+ "actions": [
130
+ {"kind": "table_start_detected", "rule": "density", "confidence": 0.8,
131
+ "detail": "skipped 3 leading row(s) (title/metadata/blank); table starts at row 3"},
132
+ {"kind": "number_parsed", "rule": "locale:pt_BR", "column": "valor_r",
133
+ "count": 4, "confidence": 1.0,
134
+ "examples": [{"row": 0, "original": "1.234,56", "final": 1234.56}]},
135
+ ...
136
+ ]
137
+ }
138
+ ```
139
+
140
+ ## Configuration
141
+
142
+ The 80% case needs no config. For the rest:
143
+
144
+ ```python
145
+ from messy_table import clean, Config
146
+
147
+ result = clean(
148
+ "dados.csv",
149
+ config=Config(
150
+ locale="pt_BR", # force number/date interpretation; default "auto"
151
+ header="auto", # "auto" | int (row index) | None (no header)
152
+ sheet=0, # index or name of the worksheet
153
+ merged_cells="fill", # "fill" | "first-only"
154
+ null_values_extra=["s/i"], # add to the built-in null list
155
+ strict=False, # True: raise AmbiguityError instead of warning
156
+ ),
157
+ )
158
+ ```
159
+
160
+ In `strict=True`, a low-confidence decision raises `AmbiguityError` — always with a
161
+ copy-pasteable `Config` suggestion to resolve it.
162
+
163
+ ## Security
164
+
165
+ messy-table parses untrusted files, so it defends against hostile input: `.xlsx` archives are
166
+ checked for decompression-bomb shape (absolute size and ratio) **before** they are
167
+ opened, and a hard cell ceiling bounds memory. See [ARCHITECTURE.md](ARCHITECTURE.md).
168
+
169
+ ## Docs
170
+
171
+ - [ARCHITECTURE.md](ARCHITECTURE.md) — pipeline, stack rationale, security decisions.
172
+ - [docs/heuristics.md](docs/heuristics.md) — every heuristic and its thresholds.
173
+
174
+ ## License
175
+
176
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,140 @@
1
+ # messy-table
2
+
3
+ > `pandas.read_excel` assumes your spreadsheet is well-behaved. **messy-table assumes it is not.**
4
+
5
+ Turn messy real-world spreadsheets — Excel/CSV exported from ERPs, legacy systems,
6
+ hand-made reports — into clean, typed data, and get back an **auditable report of
7
+ every fix that was applied**.
8
+
9
+ ```python
10
+ from messy_table import clean
11
+
12
+ result = clean("relatorio_vendas.xlsx")
13
+
14
+ result.data # list[dict] — clean, typed rows
15
+ result.columns # per-column name / dtype / null summary
16
+ result.report # every transformation that was applied
17
+ result.warnings # low-confidence decisions
18
+ result.to_pandas() # DataFrame (optional extra)
19
+ ```
20
+
21
+ ## Why
22
+
23
+ AI agents and data pipelines receive arbitrary spreadsheets from users and today
24
+ re-implement, by hand and fragilely, the same heuristics: find where the table
25
+ starts, fix the headers, convert Excel serial dates, interpret `1.234,56`.
26
+ messy-table is the canonical answer to that step — deterministic, dependency-light,
27
+ and fully auditable.
28
+
29
+ ## Install
30
+
31
+ ```bash
32
+ pip install messy-table # core — depends only on openpyxl
33
+ pip install 'messy-table[pandas]' # adds result.to_pandas()
34
+ ```
35
+
36
+ Python ≥ 3.10. Ships `py.typed` — fully type-checked.
37
+
38
+ ## Before & after
39
+
40
+ A typical ERP export — a title banner, a blank row, pt-BR numbers, a disguised null (`-`), and a
41
+ trailing totals row:
42
+
43
+ ```
44
+ Relatório de Vendas 2024
45
+ (gerado em 01/02/2024)
46
+
47
+ Produto | Valor (R$) | Qtd | Ativo
48
+ Café | 1.234,56 | 10 | sim
49
+ Chá | 2.000,00 | 5 | nao
50
+ Açúcar | - | 3 | sim
51
+ TOTAL | 3.234,56 | 18 |
52
+ ```
53
+
54
+ ```python
55
+ >>> result = clean("vendas.xlsx")
56
+ >>> result.data
57
+ [{'produto': 'Café', 'valor_r': 1234.56, 'qtd': 10, 'ativo': True},
58
+ {'produto': 'Chá', 'valor_r': 2000.0, 'qtd': 5, 'ativo': False},
59
+ {'produto': 'Açúcar', 'valor_r': None, 'qtd': 3, 'ativo': True}]
60
+ >>> [(c.name, c.dtype) for c in result.columns]
61
+ [('produto', 'str'), ('valor_r', 'float'), ('qtd', 'int'), ('ativo', 'bool')]
62
+ ```
63
+
64
+ The title, blank row and `TOTAL` line are gone; numbers are parsed in the column's
65
+ inferred locale; `-` is a null; `sim/nao` became booleans — and the report says so.
66
+
67
+ ## Features (v0.1)
68
+
69
+ | Feature | What it does |
70
+ |---|---|
71
+ | **Table-start detection** | Skips titles, logos, stray cells and metadata before the real header. |
72
+ | **Header detection & normalisation** | Finds the header (or its absence); resolves duplicate/empty/multi-row headers; slugifies to `snake_case`. |
73
+ | **Merged cells** | Propagates a merged value across its range (`fill`) or keeps it top-left only (`first-only`). |
74
+ | **Excel serial dates** | Converts `45123` → a real date when the column has a date profile (both 1900 and 1904 epochs). |
75
+ | **Localised numbers** | `1.234,56` (pt-BR/EU) vs `1,234.56` (en-US), inferred per **column**, never per cell. |
76
+ | **Column type inference** | `int`/`float`/`date`/`datetime`/`bool`/`str`, with mixed-column handling. |
77
+ | **Disguised nulls** | `-`, `N/A`, `n/d`, `#REF!`, `#DIV/0!`, blanks → `None` (extensible). |
78
+ | **Trailing junk** | Removes totals, signatures and footnotes after the data ends. |
79
+ | **Cleaning report** | Structured, JSON-serialisable record of every correction with location and confidence. |
80
+ | **Input formats** | `.xlsx`, `.csv` (delimiter + encoding sniffing), `.tsv`. |
81
+
82
+ ## The report
83
+
84
+ Nothing changes without a record. Per-cell fixes are aggregated per column with a
85
+ count and sample locations, so the report stays small even on huge files:
86
+
87
+ ```python
88
+ >>> print(result.report.to_json())
89
+ {
90
+ "summary": {"table_start_detected": 1, "table_end_trimmed": 1,
91
+ "header_renamed": 1, "null_normalized": 1,
92
+ "number_parsed": 4, "type_coerced": 2},
93
+ "actions": [
94
+ {"kind": "table_start_detected", "rule": "density", "confidence": 0.8,
95
+ "detail": "skipped 3 leading row(s) (title/metadata/blank); table starts at row 3"},
96
+ {"kind": "number_parsed", "rule": "locale:pt_BR", "column": "valor_r",
97
+ "count": 4, "confidence": 1.0,
98
+ "examples": [{"row": 0, "original": "1.234,56", "final": 1234.56}]},
99
+ ...
100
+ ]
101
+ }
102
+ ```
103
+
104
+ ## Configuration
105
+
106
+ The 80% case needs no config. For the rest:
107
+
108
+ ```python
109
+ from messy_table import clean, Config
110
+
111
+ result = clean(
112
+ "dados.csv",
113
+ config=Config(
114
+ locale="pt_BR", # force number/date interpretation; default "auto"
115
+ header="auto", # "auto" | int (row index) | None (no header)
116
+ sheet=0, # index or name of the worksheet
117
+ merged_cells="fill", # "fill" | "first-only"
118
+ null_values_extra=["s/i"], # add to the built-in null list
119
+ strict=False, # True: raise AmbiguityError instead of warning
120
+ ),
121
+ )
122
+ ```
123
+
124
+ In `strict=True`, a low-confidence decision raises `AmbiguityError` — always with a
125
+ copy-pasteable `Config` suggestion to resolve it.
126
+
127
+ ## Security
128
+
129
+ messy-table parses untrusted files, so it defends against hostile input: `.xlsx` archives are
130
+ checked for decompression-bomb shape (absolute size and ratio) **before** they are
131
+ opened, and a hard cell ceiling bounds memory. See [ARCHITECTURE.md](ARCHITECTURE.md).
132
+
133
+ ## Docs
134
+
135
+ - [ARCHITECTURE.md](ARCHITECTURE.md) — pipeline, stack rationale, security decisions.
136
+ - [docs/heuristics.md](docs/heuristics.md) — every heuristic and its thresholds.
137
+
138
+ ## License
139
+
140
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,123 @@
1
+ # Heuristics
2
+
3
+ Every heuristic in messy-table, what signal it uses, and the thresholds it trips
4
+ on. Thresholds live as named constants in code; this file is the prose companion.
5
+ When you change a threshold, update both.
6
+
7
+ A shared idea runs through all of them: produce a **confidence** in `[0, 1]`.
8
+ Below `Config.confidence_threshold` (default **0.6**) the decision is *ambiguous*
9
+ — a warning in permissive mode, an `AmbiguityError` (with a `Config` suggestion)
10
+ in strict mode.
11
+
12
+ ## Density, merge-aware (shared primitive)
13
+
14
+ `util.merge_covered_cells` + `util.density_threshold`.
15
+
16
+ A row's *filled count* = non-blank cells **plus** cells covered by a **row-spanning**
17
+ merge (vertical/block). Horizontal one-row merges (banners) are excluded so titles
18
+ stay sparse.
19
+
20
+ `density_threshold(width, ratio=0.5)`:
21
+ - width 1 → **1** (single-column tables are real).
22
+ - width ≥ 2 → `max(2, ceil(0.5 · width))` (a lone stray cell never counts as data).
23
+
24
+ ## F1 — table start (`detectors/table_start.py`)
25
+
26
+ - Compute the merge-aware filled count per row; `width` = the max across rows.
27
+ - A row is *substantial* if its filled count ≥ `density_threshold(width)`.
28
+ - The table starts at the first row of the **longest contiguous run** of
29
+ substantial rows. (A blank row between a metadata block and the table breaks the
30
+ run, so the longer table run wins.)
31
+ - Confidence = `clamp(body_density − above_density + 0.5)`. Skipping rows with a
32
+ sparse preamble is high-confidence; skipping into a still-dense region is not.
33
+ - `Config(header=<int>)` pins the start and bypasses detection.
34
+
35
+ **Known limit:** a metadata block *immediately* above the table with no blank
36
+ separator and the same column count can be absorbed. A blank separator (the common
37
+ real case) resolves it; otherwise pin `header`.
38
+
39
+ ## F2 — header (`detectors/header.py` + `transformers/header_names.py`)
40
+
41
+ Detection (grid already body-sliced and unmerged, so the header is at row 0):
42
+
43
+ - **Headerless?** If row 0 is not text-heavy (`text_ratio < 0.5`), has no
44
+ horizontal merge, and shares the exact per-column category signature of row 1,
45
+ there is no header → synthesise `column_1…` and start data at row 0 (warned).
46
+ - **How many header rows?** Start at 1. While the current top header row intersects
47
+ a **horizontal merge** (a spanned group cell), consume the next row too — up to
48
+ `MAX_HEADER_ROWS` (3). This ties multi-row detection to real structure rather
49
+ than a fragile text test.
50
+ - Confidence: 0.9 if row 0 is text-heavy, else 0.55.
51
+
52
+ Naming (`header_names.py`): NFKD accent-strip → lowercase → non-word → `_` →
53
+ collapse/trim. Empty → `column_{i}`; leading digit → `col_…`; duplicates get
54
+ `_2`, `_3`. Every rename is recorded.
55
+
56
+ ## F3 — merged cells (`transformers/merged_cells.py`)
57
+
58
+ `fill` (default): copy each merge's anchor value to every other cell in its range.
59
+ `first-only`: leave the non-anchor cells `None` (openpyxl's default) — a no-op. Each filled cell is
60
+ recorded. Skipped on very large (streaming) sheets, with a warning.
61
+
62
+ ## F4 — Excel serial dates (`transformers/dates.py`)
63
+
64
+ Date-formatted xlsx cells already arrive as `datetime` (openpyxl applies the
65
+ epoch). F4 targets the *messy* case: a column of **bare numbers** that are really
66
+ dates. Converted only when **both** hold:
67
+
68
+ 1. every value sits in the serial range **20000–60000** (≈ 1954–2064), **and**
69
+ 2. there is corroboration — the cell's number format looks like a date
70
+ (`fmt_is_date`, confidence **0.9**) **or** the column name hints at a date
71
+ (`data`, `vencimento`, `date`, … → confidence **0.7**).
72
+
73
+ Bare numbers with no hint are left numeric (so an `ano`/`year` column survives).
74
+ Conversion uses the workbook epoch (1899-12-30 base absorbs the 1900 leap bug;
75
+ 1904-01-01 for Mac). Integer serial → `date`; fractional → `datetime`.
76
+
77
+ ## F5 — localised numbers (`transformers/numbers.py`)
78
+
79
+ Only columns where ≥ **70%** (`NUMERIC_COLUMN_RATIO`) of non-blank string cells look
80
+ numeric are treated as numeric. Per-value *decimal vote*:
81
+
82
+ - both `.` and `,` present → the **last** one is the decimal point;
83
+ - one separator present → it is *thousands grouping* only if it splits into clean
84
+ 3-digit runs (`1.234`, `12.345.678`), otherwise it is the decimal point.
85
+
86
+ The column's majority vote picks the convention; confidence = winner / total votes
87
+ (so a 50/50 split → 0.5, below threshold → ambiguous). `Config(locale=…)` overrides
88
+ the vote (confidence 1.0). Currency symbols, spaces, NBSP, apostrophes and a
89
+ trailing `%` (divide by 100) are handled.
90
+
91
+ ## F6 — column types (`transformers/types.py`)
92
+
93
+ Per column, over surviving non-null values:
94
+
95
+ - all `date`/`datetime` → `date`, or `datetime` if any carry a time (bare dates are
96
+ promoted to datetime for uniformity);
97
+ - all `int` → `int`; any fractional → `float`;
98
+ - all boolean (native or text tokens `true/false/sim/não/yes/no/…`) → `bool`;
99
+ - otherwise → `str` (lossless fallback, with a "mixes types" warning when the column
100
+ genuinely mixed numbers and text).
101
+
102
+ ## F7 — disguised nulls (`transformers/nulls.py`)
103
+
104
+ Case-insensitive, trimmed match against the built-in token set (`-`, `--`, `n/a`,
105
+ `n/d`, `null`, `none`, `nil`, `nan`, and the Excel error literals `#REF!`,
106
+ `#DIV/0!`, `#VALUE!`, …) plus `Config.null_values_extra`. Matches become `None`.
107
+
108
+ ## F8 — trailing junk (`detectors/table_end.py`)
109
+
110
+ Walk up from the bottom; trim a row while it is **sparse** (below the body density
111
+ threshold), **empty**, or starts with a **summary keyword** (`total`, `subtotal`,
112
+ `soma`, `fonte`, `gerado em`, `assinatura`, …). Stop at the first real data row.
113
+ Confidence 0.85 when a keyword matched, else 0.7.
114
+
115
+ ## CSV delimiter & encoding (`readers/csv.py`)
116
+
117
+ - **Encoding:** decode cascade `utf-8-sig` → `cp1252` → `latin-1` (the last never
118
+ fails). The first that decodes wins.
119
+ - **Delimiter:** structural, not `csv.Sniffer` (which preamble rows fool). For each
120
+ candidate (`, ; \t |`) score by how many lines share a modal field count ≥ 2,
121
+ requiring at least half the lines to split. Best agreement wins; ties broken by
122
+ more fields. If nothing qualifies → **single column** (a lone decimal comma is not
123
+ a delimiter). `.tsv` forces tab.
@@ -0,0 +1,136 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.25"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "messy-table"
7
+ version = "0.1.0"
8
+ description = "Turn messy real-world spreadsheets into clean, typed data — with an auditable report of every fix."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "messy-table contributors" }]
13
+ keywords = ["excel", "xlsx", "csv", "data-cleaning", "spreadsheet", "etl", "pandas", "data-ingestion"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Office/Business :: Financial :: Spreadsheet",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ "Typing :: Typed",
26
+ ]
27
+ # openpyxl is the ONLY mandatory dependency. >=3.1.3 pulls in the read-only
28
+ # date handling fixes and the et-xmlfile pin that closes the XML entity issues.
29
+ dependencies = ["openpyxl>=3.1.3"]
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/messy-table/messy-table"
33
+ Repository = "https://github.com/messy-table/messy-table"
34
+ Changelog = "https://github.com/messy-table/messy-table/blob/main/CHANGELOG.md"
35
+ Issues = "https://github.com/messy-table/messy-table/issues"
36
+
37
+ [project.optional-dependencies]
38
+ # `pip install messy-table[pandas]` enables CleanResult.to_pandas().
39
+ pandas = ["pandas>=2.0"]
40
+ dev = [
41
+ "pytest>=8.0",
42
+ "pytest-cov>=5.0",
43
+ "ruff>=0.6",
44
+ "mypy>=1.11",
45
+ "pandas>=2.0",
46
+ "pandas-stubs>=2.0",
47
+ ]
48
+
49
+ [tool.hatch.build.targets.wheel]
50
+ packages = ["src/messy_table"]
51
+
52
+ [tool.hatch.build.targets.sdist]
53
+ include = ["src/messy_table", "tests", "docs", "README.md", "CHANGELOG.md", "LICENSE"]
54
+
55
+ # ----------------------------------------------------------------------------
56
+ # Ruff — lint + format. Opinionated, strict, zero tolerance for the basics.
57
+ # ----------------------------------------------------------------------------
58
+ [tool.ruff]
59
+ line-length = 100
60
+ target-version = "py310"
61
+ src = ["src", "tests"]
62
+
63
+ [tool.ruff.lint]
64
+ select = [
65
+ "E", "W", # pycodestyle
66
+ "F", # pyflakes
67
+ "I", # isort
68
+ "N", # pep8-naming
69
+ "UP", # pyupgrade
70
+ "B", # flake8-bugbear
71
+ "C4", # comprehensions
72
+ "SIM", # simplify
73
+ "RUF", # ruff-specific
74
+ "PTH", # use pathlib
75
+ "TID", # tidy imports
76
+ "BLE", # blind-except — no bare/broad except in the core
77
+ "S", # bandit security checks
78
+ ]
79
+ ignore = [
80
+ "S101", # assert is allowed project-wide (tests already ignore every "S" rule via per-file-ignores)
81
+ ]
82
+
83
+ [tool.ruff.lint.per-file-ignores]
84
+ "tests/**" = ["S", "N802", "N806"]
85
+ "tests/_fixtures_gen.py" = ["S", "N802", "N806"]
86
+
87
+ [tool.ruff.lint.isort]
88
+ known-first-party = ["messy_table"]
89
+
90
+ # ----------------------------------------------------------------------------
91
+ # mypy — strict. The library ships py.typed; the public surface is fully typed.
92
+ # ----------------------------------------------------------------------------
93
+ [tool.mypy]
94
+ python_version = "3.10"
95
+ strict = true
96
+ warn_unreachable = true
97
+ warn_redundant_casts = true
98
+ warn_unused_ignores = true
99
+ disallow_untyped_defs = true
100
+ disallow_any_generics = true
101
+ no_implicit_reexport = true
102
+ show_error_codes = true
103
+ files = ["src/messy_table"]
104
+
105
+ [[tool.mypy.overrides]]
106
+ # openpyxl ships no type stubs upstream; we isolate the untyped boundary in
107
+ # the readers and treat it as the only place `Any` may legitimately enter.
108
+ module = ["openpyxl.*"]
109
+ ignore_missing_imports = true
110
+
111
+ # ----------------------------------------------------------------------------
112
+ # pytest + coverage
113
+ # ----------------------------------------------------------------------------
114
+ [tool.pytest.ini_options]
115
+ minversion = "8.0"
116
+ testpaths = ["tests"]
117
+ addopts = "-q --strict-markers --strict-config"
118
+ markers = [
119
+ "perf: performance-gate tests (may be slow); run with -m perf",
120
+ ]
121
+
122
+ [tool.coverage.run]
123
+ branch = true
124
+ source = ["messy_table"]
125
+ omit = ["*/tests/*"]
126
+
127
+ [tool.coverage.report]
128
+ # Acceptance criterion: 90% minimum on the core. Hardened, not loosened.
129
+ fail_under = 90
130
+ show_missing = true
131
+ exclude_lines = [
132
+ "pragma: no cover",
133
+ "if TYPE_CHECKING:",
134
+ "raise NotImplementedError",
135
+ "\\.\\.\\.",
136
+ ]