messy-table 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ """messy-table — turn messy real-world spreadsheets into clean, typed data.
2
+
3
+ ``pandas.read_excel`` assumes your spreadsheet is well-behaved. messy-table
4
+ assumes it is not.
5
+
6
+ >>> from messy_table import clean
7
+ >>> result = clean("relatorio_vendas.xlsx")
8
+ >>> result.data # list[dict] — clean, typed rows
9
+ >>> result.columns # per-column name/dtype/null summary
10
+ >>> result.report # every fix that was applied
11
+ >>> result.warnings # low-confidence decisions
12
+
13
+ The public surface is intentionally small. Everything below ``clean`` and
14
+ ``Config`` is for inspecting results and handling errors.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from messy_table.api import clean
20
+ from messy_table.config import Config
21
+ from messy_table.exceptions import (
22
+ AmbiguityError,
23
+ MessyTableError,
24
+ UnsupportedFormatError,
25
+ )
26
+ from messy_table.report import (
27
+ Action,
28
+ ActionKind,
29
+ CleanReport,
30
+ ColumnInfo,
31
+ Issue,
32
+ Severity,
33
+ )
34
+ from messy_table.result import CleanResult
35
+
36
+ __version__ = "0.1.0"
37
+
38
+ __all__ = [
39
+ "Action",
40
+ "ActionKind",
41
+ "AmbiguityError",
42
+ "CleanReport",
43
+ "CleanResult",
44
+ "ColumnInfo",
45
+ "Config",
46
+ "Issue",
47
+ "MessyTableError",
48
+ "Severity",
49
+ "UnsupportedFormatError",
50
+ "__version__",
51
+ "clean",
52
+ ]
messy_table/api.py ADDED
@@ -0,0 +1,96 @@
1
+ """The single public entry point: :func:`clean`.
2
+
3
+ This module owns the pipeline order, which is the one place the whole design
4
+ comes together:
5
+
6
+ read → [detect start] → [detect end] → slice body
7
+ → unmerge → [detect header] → name header → slice off header
8
+ → nulls → numbers → dates → finalize types → emit
9
+
10
+ Detectors run on the still-merged grid (so banner/title rows stay sparse and get
11
+ skipped); unmerge then runs before header detection; value transformers run on
12
+ the header-stripped data grid in dependency order.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from typing import Any
18
+
19
+ from messy_table.config import Config
20
+ from messy_table.context import Context
21
+ from messy_table.detectors import detect_header, detect_table_end, detect_table_start
22
+ from messy_table.grid import Grid, SourceInfo
23
+ from messy_table.readers import Source, read
24
+ from messy_table.result import CleanResult
25
+ from messy_table.transformers import (
26
+ apply_merged_cells,
27
+ convert_dates,
28
+ finalize_types,
29
+ normalize_headers,
30
+ normalize_nulls,
31
+ parse_numbers,
32
+ )
33
+
34
+
35
def clean(source: Source, *, config: Config | None = None) -> CleanResult:
    """Run the whole cleaning pipeline on one spreadsheet.

    The stages run in a fixed order: read the input, locate the table's first
    and last rows on the still-merged grid, slice out the body, unmerge it,
    detect and normalise the header, then run the value transformers (nulls,
    numbers, dates, final types) on the header-stripped rows.

    Parameters
    ----------
    source:
        A path (``str``/``Path``), raw ``bytes``, or a binary/text file-like
        object holding an ``.xlsx``, ``.csv`` or ``.tsv``.
    config:
        Optional :class:`~messy_table.config.Config`; a default ``Config()`` is
        used when omitted.

    Returns
    -------
    CleanResult
        Carries ``.data``, ``.columns``, ``.report`` and ``.warnings``.
    """
    settings = config or Config()
    ctx = Context(config=settings, source=SourceInfo(origin="<input>", kind="unknown"))

    sheet = read(source, settings, ctx)
    if 0 in (sheet.nrows, sheet.ncols):
        # Nothing to clean: report it and hand back an empty result.
        return _empty_result(ctx)

    # Boundary detection happens before unmerging.
    first_row = detect_table_start(sheet, ctx)
    stop_row = detect_table_end(sheet, ctx, first_row)
    body = sheet.slice_rows(first_row, stop_row)

    apply_merged_cells(body, ctx)

    header_height, raw_names = detect_header(body, ctx)
    name_pairs = normalize_headers(raw_names, ctx)
    originals = [source_name for source_name, _ in name_pairs]

    # Value transformers run on the rows below the header, in dependency order.
    records = body.slice_rows(header_height, body.nrows)
    for transform in (normalize_nulls, parse_numbers, convert_dates):
        transform(records, ctx)
    columns = finalize_types(records, ctx, originals)

    return CleanResult(
        data=_emit_rows(records, ctx.column_names),
        columns=columns,
        report=ctx.report.build(),
        warnings=ctx.warnings,
        source=ctx.source,
    )
82
+
83
+
84
def _emit_rows(grid: Grid, names: list[str]) -> list[dict[str, Any]]:
    """Turn the data grid into one ``{column name: cell value}`` dict per row.

    Column ``i`` of every row is stored under ``names[i]``; only the first
    ``len(names)`` columns are read.
    """
    records: list[dict[str, Any]] = []
    for row_idx in range(grid.nrows):
        record: dict[str, Any] = {}
        for col_idx, name in enumerate(names):
            record[name] = grid.cell(row_idx, col_idx)
        records.append(record)
    return records
86
+
87
+
88
def _empty_result(ctx: Context) -> CleanResult:
    """Build the result for an input with no rows or no columns.

    A warning is recorded on ``ctx`` first, so the returned ``warnings`` list
    includes it.
    """
    ctx.warn(
        "input contained no data",
        suggestion="check the file and sheet selection",
    )
    built_report = ctx.report.build()
    return CleanResult(
        data=[],
        columns=[],
        report=built_report,
        warnings=ctx.warnings,
        source=ctx.source,
    )
messy_table/config.py ADDED
@@ -0,0 +1,75 @@
1
+ """User-facing configuration.
2
+
3
+ The 80% case needs none of this — ``clean(path)`` works. ``Config`` exists for
4
+ the cases where a heuristic needs a hand: a forced locale, a known header row,
5
+ a specific sheet.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import Literal
12
+
13
+ from messy_table.exceptions import MessyTableError
14
+
15
+ MergedCellsMode = Literal["fill", "first-only"]
16
+ HeaderSpec = Literal["auto"] | int | None
17
+ LocaleSpec = Literal["auto", "pt_BR", "en_US", "de_DE", "fr_FR"]
18
+
19
+ # A library that parses untrusted files needs hard ceilings. These defend
20
+ # against decompression bombs and pathological inputs without getting in the
21
+ # way of any realistic spreadsheet (50k x 30 = 1.5M cells sits well under).
22
+ _DEFAULT_MAX_CELLS = 5_000_000
23
+ _DEFAULT_MAX_UNCOMPRESSED_BYTES = 512 * 1024 * 1024 # 512 MiB expanded
24
+ _DEFAULT_MAX_COMPRESSION_RATIO = 200 # expanded/packed beyond this is suspicious
25
+
26
+
27
+ @dataclass(frozen=True, slots=True)
28
+ class Config:
29
+ """Tuning knobs for :func:`messy_table.clean`.
30
+
31
+ All fields have safe defaults; constructing ``Config()`` matches the implicit
32
+ behaviour of calling ``clean`` with no config.
33
+ """
34
+
35
+ locale: LocaleSpec = "auto"
36
+ """Forces number/date interpretation. ``"auto"`` infers per column."""
37
+
38
+ header: HeaderSpec = "auto"
39
+ """``"auto"`` detects the header; an ``int`` pins the 0-based row; ``None``
40
+ means there is no header (columns become ``column_1``, ``column_2``, ...)."""
41
+
42
+ sheet: int | str = 0
43
+ """Worksheet to read, by 0-based index or by name. Ignored for CSV/TSV."""
44
+
45
+ merged_cells: MergedCellsMode = "fill"
46
+ """``"fill"`` propagates a merged value across its whole range;
47
+ ``"first-only"`` keeps it in the top-left cell and nulls the rest."""
48
+
49
+ null_values_extra: tuple[str, ...] = ()
50
+ """Extra tokens to treat as null, *added* to the built-in set."""
51
+
52
+ strict: bool = False
53
+ """When ``True``, a low-confidence decision raises
54
+ :class:`~messy_table.exceptions.AmbiguityError` instead of warning."""
55
+
56
+ # --- Safety limits (rarely touched; present so they are auditable) -------
57
+ max_cells: int = _DEFAULT_MAX_CELLS
58
+ max_uncompressed_bytes: int = _DEFAULT_MAX_UNCOMPRESSED_BYTES
59
+ max_compression_ratio: int = _DEFAULT_MAX_COMPRESSION_RATIO
60
+
61
+ # Confidence thresholds for the heuristics. Documented in docs/heuristics.md.
62
+ confidence_threshold: float = field(default=0.6)
63
+ """Below this, a decision is a warning (or an error in strict mode)."""
64
+
65
+ def __post_init__(self) -> None:
66
+ if self.merged_cells not in ("fill", "first-only"):
67
+ raise MessyTableError(
68
+ f"merged_cells must be 'fill' or 'first-only', got {self.merged_cells!r}"
69
+ )
70
+ if not (0.0 <= self.confidence_threshold <= 1.0):
71
+ raise MessyTableError("confidence_threshold must be in [0.0, 1.0]")
72
+ if isinstance(self.header, int) and self.header < 0:
73
+ raise MessyTableError("header row index must be >= 0")
74
+ if self.max_cells <= 0 or self.max_uncompressed_bytes <= 0:
75
+ raise MessyTableError("safety limits must be positive")
messy_table/context.py ADDED
@@ -0,0 +1,63 @@
1
+ """Shared, mutable context threaded through every pipeline stage.
2
+
3
+ Centralising the strict-vs-permissive decision here means each detector and
4
+ transformer just calls ``ctx.ambiguous(...)`` when its confidence is low and
5
+ never has to know which mode it is running in.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+
12
+ from messy_table.config import Config
13
+ from messy_table.exceptions import AmbiguityError
14
+ from messy_table.grid import SourceInfo
15
+ from messy_table.report import Issue, ReportBuilder, Severity
16
+
17
+
18
@dataclass
class Context:
    """Mutable state handed to every pipeline stage.

    It bundles the active configuration, the source description, the report
    builder, the accumulated warnings and the current column names. The
    strict-versus-permissive decision lives in :meth:`ambiguous`, so callers
    only report low confidence and never branch on the mode themselves.
    """

    config: Config
    source: SourceInfo
    report: ReportBuilder = field(default_factory=ReportBuilder)
    warnings: list[Issue] = field(default_factory=list)
    # The current header names, mutated as the header transformer runs.
    column_names: list[str] = field(default_factory=list)

    def warn(
        self,
        message: str,
        *,
        column: str | None = None,
        row: int | None = None,
        confidence: float | None = None,
        suggestion: str | None = None,
    ) -> None:
        """Append a WARNING-severity :class:`Issue` to ``self.warnings``."""
        issue = Issue(
            message=message,
            severity=Severity.WARNING,
            column=column,
            row=row,
            confidence=confidence,
            suggestion=suggestion,
        )
        self.warnings.append(issue)

    def ambiguous(
        self,
        message: str,
        *,
        suggestion: str,
        column: str | None = None,
        row: int | None = None,
        confidence: float | None = None,
    ) -> None:
        """Report a below-threshold decision.

        With ``config.strict`` set this raises :class:`AmbiguityError`; otherwise
        the same information is recorded as a warning. ``suggestion`` is
        required and should be a copy-pasteable ``Config`` hint so the outcome
        is always actionable.
        """
        if not self.config.strict:
            self.warn(
                message,
                column=column,
                row=row,
                confidence=confidence,
                suggestion=suggestion,
            )
            return
        raise AmbiguityError(message, suggestion=suggestion)
@@ -0,0 +1,10 @@
1
+ """Detectors locate structure in the raw grid; each returns a decision + records
2
+ its confidence. They never mutate values — that is the transformers' job."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from messy_table.detectors.header import detect_header
7
+ from messy_table.detectors.table_end import detect_table_end
8
+ from messy_table.detectors.table_start import detect_table_start
9
+
10
+ __all__ = ["detect_header", "detect_table_end", "detect_table_start"]
@@ -0,0 +1,115 @@
1
+ """F2 (detection half) — find the header row(s).
2
+
3
+ By the time this runs the grid is already sliced to the table body and unmerged,
4
+ so the header is at local row 0. Two questions:
5
+
6
+ * **Is there a header at all?** If the first row looks like data (numeric-heavy)
7
+ and has the same per-column type signature as the row below it, there is no
8
+ header — we synthesise ``column_1 ...`` and start data at row 0.
9
+ * **How many rows does it span?** Multi-row headers ("Vendas" over "2024") come
10
+ from a horizontally merged group cell. We keep the original merged ranges as
11
+ metadata, so a top row that intersects a horizontal merge tells us the next
12
+ row holds the leaf labels. That ties multi-row detection to real structure
13
+ rather than a fragile text heuristic.
14
+
15
+ The raw, still-dirty column names are returned; slugifying/de-duping/filling is
16
+ the header-names transformer's job.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from messy_table.context import Context
22
+ from messy_table.grid import Grid
23
+ from messy_table.report import ActionKind
24
+ from messy_table.util import raw_category
25
+
26
# Upper bound on the number of leading rows a header may occupy.
MAX_HEADER_ROWS = 3
# Share of text among a row's non-blank cells at or above which the row reads
# as a header rather than data.
TEXT_RATIO_THRESHOLD = 0.5
28
+
29
+
30
def detect_header(grid: Grid, ctx: Context) -> tuple[int, list[str]]:
    """Decide how many leading rows form the header and read their raw names.

    Returns ``(header_row_count, raw_column_names)``; data starts at local row
    ``header_row_count``. The names are returned uncleaned.

    * ``Config.header is None``: no header, synthetic ``column_N`` names.
    * ``Config.header`` is an ``int``: exactly one header row (row 0 of this
      already-sliced grid).
    * otherwise: auto-detect; a headerless-looking first row yields synthetic
      names and a warning, and a low-confidence header is routed through
      ``ctx.ambiguous``.
    """
    pinned = ctx.config.header
    row_count, col_count = grid.nrows, grid.ncols

    if pinned is None:
        return 0, _synthetic_names(col_count)
    if row_count == 0:
        return 0, []
    if isinstance(pinned, int):
        ctx.report.note(ActionKind.HEADER, "config", detail="header row pinned via Config.header")
        return 1, _merge_header_rows(grid, 1, col_count)

    text_share = _text_ratio(grid.row(0))
    if _looks_headerless(grid, text_share):
        ctx.warn(
            "no header row detected; generated column names",
            suggestion="Config(header=0) to force the first row as the header",
        )
        return 0, _synthetic_names(col_count)

    # Extend the header one row at a time while the row just consumed sits in a
    # horizontal merge. Where possible, at least one row is left over for data.
    span_limit = max(1, min(MAX_HEADER_ROWS, row_count - 1))
    span = 1
    while span < span_limit and _has_horizontal_merge(grid, span - 1):
        span += 1

    confidence = 0.9 if text_share >= TEXT_RATIO_THRESHOLD else 0.55
    suffix = " (multi-row header merged column-wise)" if span > 1 else ""
    ctx.report.note(
        ActionKind.HEADER,
        "auto",
        detail=f"header occupies {span} row(s)" + suffix,
        confidence=confidence,
    )
    if confidence < ctx.config.confidence_threshold:
        ctx.ambiguous(
            f"low confidence ({confidence}) that row 0 is a header",
            suggestion="Config(header=<row index>) or Config(header=None)",
        )
    return span, _merge_header_rows(grid, span, col_count)
72
+
73
+
74
def _looks_headerless(grid: Grid, row0_text: float) -> bool:
    """Return True when row 0 looks like data rather than a header.

    That requires at least two rows, a first row below the text-ratio
    threshold, no horizontal merge touching row 0, and identical per-cell
    category signatures for rows 0 and 1.
    """
    return (
        grid.nrows >= 2
        and not (row0_text >= TEXT_RATIO_THRESHOLD)
        and not _has_horizontal_merge(grid, 0)
        and _category_signature(grid.row(0)) == _category_signature(grid.row(1))
    )
80
+
81
+
82
def _text_ratio(row: list[object]) -> float:
    """Return the fraction of a row's non-blank cells categorised as text.

    Cells are categorised by ``raw_category``; a row with no non-blank cells
    yields ``0.0``.
    """
    text_cells = 0
    nonblank_cells = 0
    for value in row:
        category = raw_category(value)
        if category == "blank":
            continue
        nonblank_cells += 1
        if category == "text":
            text_cells += 1
    if nonblank_cells == 0:
        return 0.0
    return text_cells / nonblank_cells
88
+
89
+
90
def _category_signature(row: list[object]) -> tuple[str, ...]:
    """Return the ``raw_category`` of each cell in the row, in order, as a tuple."""
    return tuple(map(raw_category, row))
92
+
93
+
94
def _has_horizontal_merge(grid: Grid, row: int) -> bool:
    """Return True if a merged range spanning more than one column covers ``row``."""
    for merged in grid.merged_ranges:
        if not (merged.min_row <= row <= merged.max_row):
            continue
        if merged.max_col > merged.min_col:
            return True
    return False
96
+
97
+
98
def _merge_header_rows(grid: Grid, header_rows: int, ncols: int) -> list[str]:
    """Join the first ``header_rows`` cells of each column into one raw name.

    ``None`` cells and cells that are empty after stripping are skipped. A piece
    equal to the piece directly above it is dropped, since a filled merge
    repeats the same label down or across. The remaining pieces are joined with
    a single space.
    """

    def column_label(col: int) -> str:
        pieces: list[str] = []
        for row_idx in range(header_rows):
            cell = grid.cell(row_idx, col)
            if cell is None:
                continue
            text = str(cell).strip()
            if not text:
                continue
            if pieces and pieces[-1] == text:
                # Same label as the row above: contributed by merge fill.
                continue
            pieces.append(text)
        return " ".join(pieces)

    return [column_label(col) for col in range(ncols)]
112
+
113
+
114
def _synthetic_names(ncols: int) -> list[str]:
    """Return placeholder names ``column_1`` .. ``column_<ncols>`` (1-based)."""
    return [f"column_{position}" for position in range(1, ncols + 1)]
@@ -0,0 +1,104 @@
1
+ """F8 — trim trailing junk: totals, signatures, footnotes.
2
+
3
+ After the last data row, exports often append a totals line, a "Gerado em ..."
4
+ stamp, a signature, or free-text notes. We walk up from the bottom and trim rows
5
+ that are either *sparse* (the data block is dense, these are not) or begin with a
6
+ *summary keyword*. We stop at the first real data row, so only the trailing block
7
+ is removed.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from messy_table.context import Context
13
+ from messy_table.grid import Grid
14
+ from messy_table.report import ActionKind
15
+ from messy_table.util import (
16
+ density_threshold,
17
+ first_nonblank,
18
+ is_blank,
19
+ merge_covered_cells,
20
+ )
21
+
22
# Ratio passed to ``density_threshold`` when deciding whether a row is sparse.
DENSITY_RATIO = 0.5

# Lower-cased; matched as a prefix of the row's first non-blank cell.
SUMMARY_PREFIXES = (
    # totals and aggregates
    "total",
    "totais",
    "subtotal",
    "sub-total",
    "soma",
    "grand total",
    "resumo",
    "média",
    "media",
    # signatures, notes and footers
    "assinatura",
    "observ",
    "obs.",
    "nota",
    "fonte",
    "gerado em",
    "emitido",
    "página",
    "pagina",
)
45
+
46
+
47
def detect_table_end(grid: Grid, ctx: Context, start: int) -> int:
    """Return the exclusive end row index of the data block.

    Rows are examined from the bottom upwards, never going above ``start``. A
    row is trimmed when it fills fewer live columns than the density threshold
    or when its first non-blank cell starts with one of ``SUMMARY_PREFIXES``.
    The walk stops at the first row that is neither, so only the trailing block
    is removed.
    """
    total_rows = grid.nrows
    if total_rows == 0 or grid.ncols == 0:
        return total_rows

    covered = merge_covered_cells(grid.merged_ranges)
    # Measure fill against *live* columns only, so a fully empty stray column
    # cannot make every data row look sparse and get trimmed.
    live_cols = _live_columns(grid, start + 1, total_rows, covered)
    min_filled = density_threshold(len(live_cols), DENSITY_RATIO)

    end = total_rows
    saw_keyword = False
    for r in range(total_rows - 1, start, -1):
        lead = first_nonblank(grid.row(r))
        filled = 0
        for c in live_cols:
            if not is_blank(grid.cell(r, c)) or (r, c) in covered:
                filled += 1
        is_summary = isinstance(lead, str) and lead.strip().lower().startswith(SUMMARY_PREFIXES)
        if not (filled < min_filled or is_summary):
            # First real data row from the bottom: everything above is kept.
            break
        end = r
        saw_keyword = saw_keyword or is_summary

    return _finish(ctx, total_rows, end, start, saw_keyword)
77
+
78
+
79
def _live_columns(grid: Grid, start: int, stop: int, covered: set[tuple[int, int]]) -> set[int]:
    """Return the indices of columns holding any data in rows ``[start, stop)``.

    A cell counts as data when it is not blank or when its ``(row, col)`` is in
    ``covered``. Scanning stops once every column is known to be live. If no
    column is live at all, every column index is returned.
    """
    width = grid.ncols
    live: set[int] = set()
    for r in range(start, stop):
        for c in range(width):
            if c in live:
                continue
            if not is_blank(grid.cell(r, c)) or (r, c) in covered:
                live.add(c)
        if len(live) == width:
            break
    return live if live else set(range(width))
91
+
92
+
93
def _finish(ctx: Context, nrows: int, end: int, start: int, keyword_hit: bool) -> int:
    """Record the trim in the report, if any rows were trimmed, and return ``end``.

    Nothing is recorded when ``end == nrows``. ``start`` is accepted but unused.
    """
    trimmed = nrows - end
    if not trimmed:
        return end
    # A keyword-matched total carries more certainty than a merely sparse row.
    confidence = 0.85 if keyword_hit else 0.7
    ctx.report.note(
        ActionKind.TABLE_END,
        "trailing-junk",
        detail=f"trimmed {trimmed} trailing row(s) (totals/notes/blank) after row {end - 1}",
        confidence=confidence,
    )
    return end
@@ -0,0 +1,81 @@
1
+ """F1 — find where the real table begins.
2
+
3
+ Real exports bury the table under a title, a logo cell, a date stamp, blank
4
+ rows. Those leading rows are *sparse*: one or two filled cells. The table — its
5
+ header included — is *dense*: most columns filled, row after row. So we locate
6
+ the longest contiguous run of dense rows and call its first row the start.
7
+
8
+ Merged title banners are not a problem here: openpyxl reports a merged cell's
9
+ value only in its top-left anchor, so a full-width merged title still counts as
10
+ a single filled cell. We detect boundaries *before* unmerging precisely so this
11
+ holds.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from messy_table.context import Context
17
+ from messy_table.grid import Grid
18
+ from messy_table.report import ActionKind
19
+ from messy_table.util import density_threshold, merge_covered_cells, row_filled_counts
20
+
21
# Ratio passed to ``density_threshold`` (together with the widest row's filled
# count) to decide which rows count as dense.
DENSITY_RATIO = 0.5
22
+
23
+
24
def detect_table_start(grid: Grid, ctx: Context) -> int:
    """Return the 0-based index of the first row belonging to the table.

    * Empty grid: ``0``.
    * ``Config.header`` is an ``int``: that row, clamped to the last row. A
      clamp is reported as a warning, because it means the configured row does
      not exist in this sheet.
    * Otherwise: the first row of the longest contiguous run of dense rows (the
      earliest run wins a tie). If that skips leading rows, the decision is
      noted in the report and routed through ``ctx.ambiguous`` when its
      confidence falls below ``Config.confidence_threshold``.
    """
    cfg = ctx.config
    nrows, ncols = grid.nrows, grid.ncols
    if nrows == 0 or ncols == 0:
        return 0

    if isinstance(cfg.header, int):
        start = min(cfg.header, nrows - 1)
        if start != cfg.header:
            # The pinned row is past the end of the sheet. Falling back to the
            # last row without saying so would hide a configuration mistake.
            ctx.warn(
                f"Config.header={cfg.header} is beyond the last row ({nrows - 1}); "
                f"using row {start} as the header",
                row=start,
                suggestion=f"Config(header={start})",
            )
        if start > 0:
            ctx.report.note(
                ActionKind.TABLE_START,
                "config",
                detail=f"table start pinned to row {start} via Config.header",
            )
        return start

    covered = merge_covered_cells(grid.merged_ranges)
    filled = row_filled_counts(grid.values, covered)
    width = max(filled)
    if width == 0:
        return 0
    threshold = density_threshold(width, DENSITY_RATIO)
    substantial = [f >= threshold for f in filled]

    # Find the longest run of consecutive dense rows; the strict ``>`` keeps
    # the earliest run when two runs have the same length.
    best_start, best_len = 0, 0
    i = 0
    while i < nrows:
        if not substantial[i]:
            i += 1
            continue
        j = i
        while j < nrows and substantial[j]:
            j += 1
        if j - i > best_len:
            best_start, best_len = i, j - i
        i = j

    start = best_start
    if start > 0:
        # Confidence rises with the contrast between the first table row and
        # the rows skipped above it, bounded to [0.3, 1.0].
        above_density = sum(filled[:start]) / (start * ncols)
        body_density = filled[start] / ncols
        confidence = round(min(1.0, max(0.3, body_density - above_density + 0.5)), 2)
        ctx.report.note(
            ActionKind.TABLE_START,
            "density",
            detail=(
                f"skipped {start} leading row(s) (title/metadata/blank); "
                f"table starts at row {start}"
            ),
            confidence=confidence,
        )
        if confidence < cfg.confidence_threshold:
            ctx.ambiguous(
                f"low confidence ({confidence}) locating the table start at row {start}",
                suggestion=f"Config(header={start})",
            )
    return start
@@ -0,0 +1,40 @@
1
+ """Exception hierarchy for messy-table.
2
+
3
+ Every error raised by the public API descends from :class:`MessyTableError`, so
4
+ callers can catch the whole family with a single ``except``. Errors that the user
5
+ can resolve by changing configuration always carry a concrete ``suggestion``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+
11
class MessyTableError(Exception):
    """Root of the messy-table exception hierarchy.

    Catching this type catches every error the package raises itself.
    """
13
+
14
+
15
class UnsupportedFormatError(MessyTableError):
    """The input is not something messy-table can read.

    This covers unknown extensions, corrupt archives, and inputs that trip a
    safety guard, such as an ``.xlsx`` that expands far beyond its packed size
    (the classic decompression-bomb shape).
    """
22
+
23
+
24
class AmbiguityError(MessyTableError):
    """A heuristic could not decide confidently while running in ``strict`` mode.

    In permissive mode the same situation becomes a low-confidence
    :class:`~messy_table.report.Issue` rather than an exception. ``suggestion``
    is meant to be a copy-pasteable hint for the ``Config`` field that resolves
    the ambiguity; when present it is appended to the string form of the error.
    """

    def __init__(self, message: str, *, suggestion: str | None = None) -> None:
        self.suggestion = suggestion
        super().__init__(message)

    def __str__(self) -> str:
        text = super().__str__()
        if not self.suggestion:
            return text
        return f"{text}\n → resolve with: {self.suggestion}"