PyPI - messy-table - Versions diffs - 0.1.0__py3-none-any.whl - Mend

messy-table 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

messy_table/__init__.py +52 -0
messy_table/api.py +96 -0
messy_table/config.py +75 -0
messy_table/context.py +63 -0
messy_table/detectors/__init__.py +10 -0
messy_table/detectors/header.py +115 -0
messy_table/detectors/table_end.py +104 -0
messy_table/detectors/table_start.py +81 -0
messy_table/exceptions.py +40 -0
messy_table/grid.py +133 -0
messy_table/py.typed +0 -0
messy_table/readers/__init__.py +84 -0
messy_table/readers/csv.py +109 -0
messy_table/readers/xlsx.py +243 -0
messy_table/report.py +212 -0
messy_table/result.py +68 -0
messy_table/transformers/__init__.py +23 -0
messy_table/transformers/dates.py +114 -0
messy_table/transformers/header_names.py +74 -0
messy_table/transformers/merged_cells.py +41 -0
messy_table/transformers/nulls.py +66 -0
messy_table/transformers/numbers.py +160 -0
messy_table/transformers/types.py +163 -0
messy_table/util.py +143 -0
messy_table-0.1.0.dist-info/METADATA +176 -0
messy_table-0.1.0.dist-info/RECORD +28 -0
messy_table-0.1.0.dist-info/WHEEL +4 -0
messy_table-0.1.0.dist-info/licenses/LICENSE +21 -0

messy_table/__init__.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""messy-table — turn messy real-world spreadsheets into clean, typed data.
+``pandas.read_excel`` assumes your spreadsheet is well-behaved. messy-table
+assumes it is not.
+    >>> from messy_table import clean
+    >>> result = clean("relatorio_vendas.xlsx")
+    >>> result.data        # list[dict] — clean, typed rows
+    >>> result.columns     # per-column name/dtype/null summary
+    >>> result.report      # every fix that was applied
+    >>> result.warnings    # low-confidence decisions
+The public surface is intentionally small. Everything below ``clean`` and
+``Config`` is for inspecting results and handling errors.
+"""
+from __future__ import annotations
+from messy_table.api import clean
+from messy_table.config import Config
+from messy_table.exceptions import (
+    AmbiguityError,
+    MessyTableError,
+    UnsupportedFormatError,
+)
+from messy_table.report import (
+    Action,
+    ActionKind,
+    CleanReport,
+    ColumnInfo,
+    Issue,
+    Severity,
+)
+from messy_table.result import CleanResult
+__version__ = "0.1.0"
+__all__ = [
+    "Action",
+    "ActionKind",
+    "AmbiguityError",
+    "CleanReport",
+    "CleanResult",
+    "ColumnInfo",
+    "Config",
+    "Issue",
+    "MessyTableError",
+    "Severity",
+    "UnsupportedFormatError",
+    "__version__",
+    "clean",
+]

messy_table/api.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""The single public entry point: :func:`clean`.
+This module owns the pipeline order, which is the one place the whole design
+comes together:
+    read → [detect start] → [detect end] → slice body
+         → unmerge → [detect header] → name header → slice off header
+         → nulls → numbers → dates → finalize types → emit
+Detectors run on the still-merged grid (so banner/title rows stay sparse and get
+skipped); unmerge then runs before header detection; value transformers run on
+the header-stripped data grid in dependency order.
+"""
+from __future__ import annotations
+from typing import Any
+from messy_table.config import Config
+from messy_table.context import Context
+from messy_table.detectors import detect_header, detect_table_end, detect_table_start
+from messy_table.grid import Grid, SourceInfo
+from messy_table.readers import Source, read
+from messy_table.result import CleanResult
+from messy_table.transformers import (
+    apply_merged_cells,
+    convert_dates,
+    finalize_types,
+    normalize_headers,
+    normalize_nulls,
+    parse_numbers,
+)
+def clean(source: Source, *, config: Config | None = None) -> CleanResult:
+    """Clean a messy spreadsheet and return typed data plus an audit report.
+    Parameters
+    ----------
+    source:
+        A path (``str``/``Path``), raw ``bytes``, or a binary/text file-like
+        object holding an ``.xlsx``, ``.csv`` or ``.tsv``.
+    config:
+        Optional :class:`~messy_table.config.Config`. Omit it for the 80% case.
+    Returns
+    -------
+    CleanResult
+        ``.data``, ``.columns``, ``.report`` and ``.warnings``. When the grid
+        that was read has no rows or no columns, the result is empty and
+        carries a single "input contained no data" warning.
+    """
+    # Fall back to an all-defaults Config when none is supplied.
+    cfg = config or Config()
+    # The source info starts as a placeholder; NOTE(review): presumably ``read``
+    # refines it through ``ctx`` — confirm in messy_table.readers.
+    ctx = Context(config=cfg, source=SourceInfo(origin="<input>", kind="unknown"))
+    grid = read(source, cfg, ctx)
+    # Nothing to detect or transform on an empty grid.
+    if grid.nrows == 0 or grid.ncols == 0:
+        return _empty_result(ctx)
+    # Boundary detection runs on the grid as read, before merged cells are
+    # handled (see the module docstring for why).
+    start = detect_table_start(grid, ctx)
+    end = detect_table_end(grid, ctx, start)
+    body = grid.slice_rows(start, end)
+    # Merged cells are processed before header detection.
+    apply_merged_cells(body, ctx)
+    header_rows, raw_names = detect_header(body, ctx)
+    pairs = normalize_headers(raw_names, ctx)
+    # Keep only the first element of each pair (the original header text).
+    originals = [original for original, _ in pairs]
+    # Drop the header rows; what remains is the data grid.
+    data = body.slice_rows(header_rows, body.nrows)
+    # Value transformers, in dependency order: nulls, numbers, dates, types.
+    normalize_nulls(data, ctx)
+    parse_numbers(data, ctx)
+    convert_dates(data, ctx)
+    columns = finalize_types(data, ctx, originals)
+    # NOTE(review): ``ctx.column_names`` is presumably filled in by
+    # ``normalize_headers`` — confirm in messy_table.transformers.
+    rows = _emit_rows(data, ctx.column_names)
+    return CleanResult(
+        data=rows,
+        columns=columns,
+        report=ctx.report.build(),
+        warnings=ctx.warnings,
+        source=ctx.source,
+    )
+def _emit_rows(grid: Grid, names: list[str]) -> list[dict[str, Any]]:
+    return [{names[c]: grid.cell(r, c) for c in range(len(names))} for r in range(grid.nrows)]
+def _empty_result(ctx: Context) -> CleanResult:
+    ctx.warn("input contained no data", suggestion="check the file and sheet selection")
+    return CleanResult(
+        data=[],
+        columns=[],
+        report=ctx.report.build(),
+        warnings=ctx.warnings,
+        source=ctx.source,
+    )

messy_table/config.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""User-facing configuration.
+The 80% case needs none of this — ``clean(path)`` works. ``Config`` exists for
+the cases where a heuristic needs a hand: a forced locale, a known header row,
+a specific sheet.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Literal
+from messy_table.exceptions import MessyTableError
+MergedCellsMode = Literal["fill", "first-only"]
+HeaderSpec = Literal["auto"] | int | None
+LocaleSpec = Literal["auto", "pt_BR", "en_US", "de_DE", "fr_FR"]
+# A library that parses untrusted files needs hard ceilings. These defend
+# against decompression bombs and pathological inputs without getting in the
+# way of any realistic spreadsheet (50k x 30 = 1.5M cells sits well under).
+_DEFAULT_MAX_CELLS = 5_000_000
+_DEFAULT_MAX_UNCOMPRESSED_BYTES = 512 * 1024 * 1024  # 512 MiB expanded
+_DEFAULT_MAX_COMPRESSION_RATIO = 200  # expanded/packed beyond this is suspicious
+@dataclass(frozen=True, slots=True)
+class Config:
+    """Tuning knobs for :func:`messy_table.clean`.
+    All fields have safe defaults; constructing ``Config()`` matches the implicit
+    behaviour of calling ``clean`` with no config.
+    """
+    locale: LocaleSpec = "auto"
+    """Forces number/date interpretation. ``"auto"`` infers per column."""
+    header: HeaderSpec = "auto"
+    """``"auto"`` detects the header; an ``int`` pins the 0-based row; ``None``
+    means there is no header (columns become ``column_1``, ``column_2``, ...)."""
+    sheet: int | str = 0
+    """Worksheet to read, by 0-based index or by name. Ignored for CSV/TSV."""
+    merged_cells: MergedCellsMode = "fill"
+    """``"fill"`` propagates a merged value across its whole range;
+    ``"first-only"`` keeps it in the top-left cell and nulls the rest."""
+    null_values_extra: tuple[str, ...] = ()
+    """Extra tokens to treat as null, *added* to the built-in set."""
+    strict: bool = False
+    """When ``True``, a low-confidence decision raises
+    :class:`~messy_table.exceptions.AmbiguityError` instead of warning."""
+    # --- Safety limits (rarely touched; present so they are auditable) -------
+    max_cells: int = _DEFAULT_MAX_CELLS
+    max_uncompressed_bytes: int = _DEFAULT_MAX_UNCOMPRESSED_BYTES
+    max_compression_ratio: int = _DEFAULT_MAX_COMPRESSION_RATIO
+    # Confidence thresholds for the heuristics. Documented in docs/heuristics.md.
+    confidence_threshold: float = field(default=0.6)
+    """Below this, a decision is a warning (or an error in strict mode)."""
+    def __post_init__(self) -> None:
+        if self.merged_cells not in ("fill", "first-only"):
+            raise MessyTableError(
+                f"merged_cells must be 'fill' or 'first-only', got {self.merged_cells!r}"
+            )
+        if not (0.0 <= self.confidence_threshold <= 1.0):
+            raise MessyTableError("confidence_threshold must be in [0.0, 1.0]")
+        if isinstance(self.header, int) and self.header < 0:
+            raise MessyTableError("header row index must be >= 0")
+        if self.max_cells <= 0 or self.max_uncompressed_bytes <= 0:
+            raise MessyTableError("safety limits must be positive")

messy_table/context.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Shared, mutable context threaded through every pipeline stage.
+Centralising the strict-vs-permissive decision here means each detector and
+transformer just calls ``ctx.ambiguous(...)`` when its confidence is low and
+never has to know which mode it is running in.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from messy_table.config import Config
+from messy_table.exceptions import AmbiguityError
+from messy_table.grid import SourceInfo
+from messy_table.report import Issue, ReportBuilder, Severity
+@dataclass
+class Context:
+    config: Config
+    source: SourceInfo
+    report: ReportBuilder = field(default_factory=ReportBuilder)
+    warnings: list[Issue] = field(default_factory=list)
+    # The current header names, mutated as the header transformer runs.
+    column_names: list[str] = field(default_factory=list)
+    def warn(
+        self,
+        message: str,
+        *,
+        column: str | None = None,
+        row: int | None = None,
+        confidence: float | None = None,
+        suggestion: str | None = None,
+    ) -> None:
+        self.warnings.append(
+            Issue(
+                message=message,
+                severity=Severity.WARNING,
+                column=column,
+                row=row,
+                confidence=confidence,
+                suggestion=suggestion,
+            )
+        )
+    def ambiguous(
+        self,
+        message: str,
+        *,
+        suggestion: str,
+        column: str | None = None,
+        row: int | None = None,
+        confidence: float | None = None,
+    ) -> None:
+        """A below-threshold decision: raise in strict mode, else warn.
+        ``suggestion`` is mandatory and must be a copy-pasteable ``Config`` hint,
+        so the error/warning is always actionable.
+        """
+        if self.config.strict:
+            raise AmbiguityError(message, suggestion=suggestion)
+        self.warn(message, column=column, row=row, confidence=confidence, suggestion=suggestion)

messy_table/detectors/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""Detectors locate structure in the raw grid; each returns a decision + records
+its confidence. They never mutate values — that is the transformers' job."""
+from __future__ import annotations
+from messy_table.detectors.header import detect_header
+from messy_table.detectors.table_end import detect_table_end
+from messy_table.detectors.table_start import detect_table_start
+__all__ = ["detect_header", "detect_table_end", "detect_table_start"]

messy_table/detectors/header.py ADDED Viewed

@@ -0,0 +1,115 @@
+"""F2 (detection half) — find the header row(s).
+By the time this runs the grid is already sliced to the table body and unmerged,
+so the header is at local row 0. Two questions:
+* **Is there a header at all?** If the first row looks like data (numeric-heavy)
+  and has the same per-column type signature as the row below it, there is no
+  header — we synthesise ``column_1 ...`` and start data at row 0.
+* **How many rows does it span?** Multi-row headers ("Vendas" over "2024") come
+  from a horizontally merged group cell. We keep the original merged ranges as
+  metadata, so a top row that intersects a horizontal merge tells us the next
+  row holds the leaf labels. That ties multi-row detection to real structure
+  rather than a fragile text heuristic.
+The raw, still-dirty column names are returned; slugifying/de-duping/filling is
+the header-names transformer's job.
+"""
+from __future__ import annotations
+from messy_table.context import Context
+from messy_table.grid import Grid
+from messy_table.report import ActionKind
+from messy_table.util import raw_category
+# Upper bound on how many leading rows ``detect_header`` may treat as header.
+MAX_HEADER_ROWS = 3
+# Share of row 0's non-blank cells that must be text for the row to count as a
+# high-confidence header; below it the row may be judged headerless.
+TEXT_RATIO_THRESHOLD = 0.5
+def detect_header(grid: Grid, ctx: Context) -> tuple[int, list[str]]:
+    """Return ``(header_row_count, raw_column_names)``.
+    ``header_row_count`` is the number of leading rows consumed by the header;
+    data begins at that local index.
+    The decision is driven by ``ctx.config.header``: ``None`` yields synthetic
+    names and zero header rows, an ``int`` pins a single header row, and any
+    other value (``"auto"``) runs the heuristics below. In the automatic case a
+    confidence below ``confidence_threshold`` is reported via ``ctx.ambiguous``.
+    """
+    cfg = ctx.config
+    nrows, ncols = grid.nrows, grid.ncols
+    # Explicit "no header": every row is data.
+    if cfg.header is None:
+        return 0, _synthetic_names(ncols)
+    if nrows == 0:
+        return 0, []
+    # Pinned header: local row 0 is taken as the single header row.
+    if isinstance(cfg.header, int):
+        ctx.report.note(ActionKind.HEADER, "config", detail="header row pinned via Config.header")
+        return 1, _merge_header_rows(grid, 1, ncols)
+    row0_text = _text_ratio(grid.row(0))
+    if _looks_headerless(grid, row0_text):
+        ctx.warn(
+            "no header row detected; generated column names",
+            suggestion="Config(header=0) to force the first row as the header",
+        )
+        return 0, _synthetic_names(ncols)
+    # Cap the header at MAX_HEADER_ROWS and, when there is more than one row,
+    # always leave at least one row below it.
+    max_header = max(1, min(MAX_HEADER_ROWS, nrows - 1)) if nrows > 1 else 1
+    header_rows = 1
+    # Extend the header downwards while its current last row intersects a
+    # horizontal merge.
+    while header_rows < max_header and _has_horizontal_merge(grid, header_rows - 1):
+        header_rows += 1
+    # A mostly-text first row is a confident header; otherwise confidence is low.
+    confidence = 0.9 if row0_text >= TEXT_RATIO_THRESHOLD else 0.55
+    detail = f"header occupies {header_rows} row(s)"
+    if header_rows > 1:
+        detail += " (multi-row header merged column-wise)"
+    ctx.report.note(ActionKind.HEADER, "auto", detail=detail, confidence=confidence)
+    if confidence < cfg.confidence_threshold:
+        ctx.ambiguous(
+            f"low confidence ({confidence}) that row 0 is a header",
+            suggestion="Config(header=<row index>) or Config(header=None)",
+        )
+    return header_rows, _merge_header_rows(grid, header_rows, ncols)
+def _looks_headerless(grid: Grid, row0_text: float) -> bool:
+    if grid.nrows < 2 or row0_text >= TEXT_RATIO_THRESHOLD:
+        return False
+    if _has_horizontal_merge(grid, 0):
+        return False
+    return _category_signature(grid.row(0)) == _category_signature(grid.row(1))
+def _text_ratio(row: list[object]) -> float:
+    cats = [raw_category(v) for v in row]
+    nonblank = [c for c in cats if c != "blank"]
+    if not nonblank:
+        return 0.0
+    return sum(1 for c in nonblank if c == "text") / len(nonblank)
+def _category_signature(row: list[object]) -> tuple[str, ...]:
+    return tuple(raw_category(v) for v in row)
+def _has_horizontal_merge(grid: Grid, row: int) -> bool:
+    return any(m.min_row <= row <= m.max_row and m.max_col > m.min_col for m in grid.merged_ranges)
+def _merge_header_rows(grid: Grid, header_rows: int, ncols: int) -> list[str]:
+    names: list[str] = []
+    for c in range(ncols):
+        parts: list[str] = []
+        for r in range(header_rows):
+            value = grid.cell(r, c)
+            if value is None:
+                continue
+            text = str(value).strip()
+            # Skip a part already contributed by the row above (merge fill repeats it).
+            if text and (not parts or parts[-1] != text):
+                parts.append(text)
+        names.append(" ".join(parts))
+    return names
+def _synthetic_names(ncols: int) -> list[str]:
+    return [f"column_{i + 1}" for i in range(ncols)]

messy_table/detectors/table_end.py ADDED Viewed

@@ -0,0 +1,104 @@
+"""F8 — trim trailing junk: totals, signatures, footnotes.
+After the last data row, exports often append a totals line, a "Gerado em ..."
+stamp, a signature, or free-text notes. We walk up from the bottom and trim rows
+that are either *sparse* (the data block is dense, these are not) or begin with a
+*summary keyword*. We stop at the first real data row, so only the trailing block
+is removed.
+"""
+from __future__ import annotations
+from messy_table.context import Context
+from messy_table.grid import Grid
+from messy_table.report import ActionKind
+from messy_table.util import (
+    density_threshold,
+    first_nonblank,
+    is_blank,
+    merge_covered_cells,
+)
+DENSITY_RATIO = 0.5
+# Lower-cased; matched as a prefix of the row's first non-blank cell.
+SUMMARY_PREFIXES = (
+    "total",
+    "totais",
+    "subtotal",
+    "sub-total",
+    "soma",
+    "grand total",
+    "resumo",
+    "média",
+    "media",
+    "assinatura",
+    "observ",
+    "obs.",
+    "nota",
+    "fonte",
+    "gerado em",
+    "emitido",
+    "página",
+    "pagina",
+)
+def detect_table_end(grid: Grid, ctx: Context, start: int) -> int:
+    """Return the exclusive end row index of the data block.
+    Rows are examined from the bottom upwards, never reaching ``start`` itself.
+    A row is trimmed when its live-column fill is below the density threshold
+    or its first non-blank cell starts with one of ``SUMMARY_PREFIXES``; the
+    walk stops at the first row that is neither. Any trimming is recorded on
+    ``ctx.report`` by ``_finish``.
+    """
+    nrows, ncols = grid.nrows, grid.ncols
+    if nrows == 0 or ncols == 0:
+        return nrows
+    covered = merge_covered_cells(grid.merged_ranges)
+    # Measure fill against *live* columns only. A fully empty column (a stray
+    # trailing column is common in real exports) must not make every data row look
+    # sparse and get trimmed as junk — that silently destroys the whole table.
+    # Live columns are collected from the rows after ``start``.
+    live_cols = _live_columns(grid, start + 1, nrows, covered)
+    threshold = density_threshold(len(live_cols), DENSITY_RATIO)
+    end = nrows
+    keyword_hit = False
+    r = nrows - 1
+    # ``r > start`` keeps the row at ``start`` from ever being trimmed.
+    while r > start:
+        first = first_nonblank(grid.row(r))
+        # A cell counts as filled if it is non-blank or lies in ``covered``.
+        filled_live = sum(
+            1 for c in live_cols if not is_blank(grid.cell(r, c)) or (r, c) in covered
+        )
+        # ``str.startswith`` accepts the whole tuple of prefixes at once.
+        is_summary = isinstance(first, str) and first.strip().lower().startswith(SUMMARY_PREFIXES)
+        if filled_live < threshold or is_summary:
+            end = r
+            keyword_hit = keyword_hit or is_summary
+            r -= 1
+        else:
+            # First row that looks like real data: stop trimming.
+            break
+    return _finish(ctx, nrows, end, start, keyword_hit)
+def _live_columns(grid: Grid, start: int, stop: int, covered: set[tuple[int, int]]) -> set[int]:
+    """Columns holding any data in ``[start, stop)``. Short-circuits when all are
+    live (the common dense case), so this stays cheap on large sheets."""
+    ncols = grid.ncols
+    live: set[int] = set()
+    for r in range(start, stop):
+        for c in range(ncols):
+            if c not in live and (not is_blank(grid.cell(r, c)) or (r, c) in covered):
+                live.add(c)
+        if len(live) == ncols:
+            break
+    return live or set(range(ncols))
+def _finish(ctx: Context, nrows: int, end: int, start: int, keyword_hit: bool) -> int:
+    trimmed = nrows - end
+    if trimmed:
+        # A keyword-matched total carries more certainty than a merely sparse row.
+        confidence = 0.85 if keyword_hit else 0.7
+        ctx.report.note(
+            ActionKind.TABLE_END,
+            "trailing-junk",
+            detail=f"trimmed {trimmed} trailing row(s) (totals/notes/blank) after row {end - 1}",
+            confidence=confidence,
+        )
+    return end

messy_table/detectors/table_start.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""F1 — find where the real table begins.
+Real exports bury the table under a title, a logo cell, a date stamp, blank
+rows. Those leading rows are *sparse*: one or two filled cells. The table — its
+header included — is *dense*: most columns filled, row after row. So we locate
+the longest contiguous run of dense rows and call its first row the start.
+Merged title banners are not a problem here: openpyxl reports a merged cell's
+value only in its top-left anchor, so a full-width merged title still counts as
+a single filled cell. We detect boundaries *before* unmerging precisely so this
+holds.
+"""
+from __future__ import annotations
+from messy_table.context import Context
+from messy_table.grid import Grid
+from messy_table.report import ActionKind
+from messy_table.util import density_threshold, merge_covered_cells, row_filled_counts
+DENSITY_RATIO = 0.5
+def detect_table_start(grid: Grid, ctx: Context) -> int:
+    """Return the 0-based index of the first row belonging to the table.
+    With an ``int`` ``Config.header`` the start is pinned to that row (clamped
+    to the last row). Otherwise the start is the first row of the longest
+    contiguous run of rows whose filled count reaches the density threshold;
+    the earliest run wins a tie. A non-zero start is noted on ``ctx.report``,
+    and a confidence below ``confidence_threshold`` goes to ``ctx.ambiguous``.
+    """
+    cfg = ctx.config
+    nrows, ncols = grid.nrows, grid.ncols
+    if nrows == 0 or ncols == 0:
+        return 0
+    if isinstance(cfg.header, int):
+        # Clamp so a pinned row beyond the sheet still yields a valid index.
+        start = min(cfg.header, nrows - 1)
+        if start > 0:
+            ctx.report.note(
+                ActionKind.TABLE_START,
+                "config",
+                detail=f"table start pinned to row {start} via Config.header",
+            )
+        return start
+    covered = merge_covered_cells(grid.merged_ranges)
+    # NOTE(review): presumably one filled-cell count per row — confirm in util.
+    filled = row_filled_counts(grid.values, covered)
+    # The fullest row sets the reference width for the density threshold.
+    width = max(filled)
+    if width == 0:
+        return 0
+    threshold = density_threshold(width, DENSITY_RATIO)
+    substantial = [f >= threshold for f in filled]
+    # Find the longest run of consecutive substantial rows; the strict ``>``
+    # below keeps the earliest run when two runs have equal length.
+    best_start, best_len = 0, 0
+    i = 0
+    while i < nrows:
+        if not substantial[i]:
+            i += 1
+            continue
+        j = i
+        while j < nrows and substantial[j]:
+            j += 1
+        if j - i > best_len:
+            best_start, best_len = i, j - i
+        i = j
+    start = best_start
+    if start > 0:
+        # Confidence grows with the density gap between the first table row
+        # and the skipped rows above it, clamped to [0.3, 1.0].
+        above_density = sum(filled[:start]) / (start * ncols)
+        body_density = filled[start] / ncols
+        confidence = round(min(1.0, max(0.3, body_density - above_density + 0.5)), 2)
+        ctx.report.note(
+            ActionKind.TABLE_START,
+            "density",
+            detail=(
+                f"skipped {start} leading row(s) (title/metadata/blank); "
+                f"table starts at row {start}"
+            ),
+            confidence=confidence,
+        )
+        if confidence < cfg.confidence_threshold:
+            ctx.ambiguous(
+                f"low confidence ({confidence}) locating the table start at row {start}",
+                suggestion=f"Config(header={start})",
+            )
+    return start

messy_table/exceptions.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""Exception hierarchy for messy-table.
+Every error raised by the public API descends from :class:`MessyTableError`, so
+callers can catch the whole family with a single ``except``. Errors that the user
+can resolve by changing configuration always carry a concrete ``suggestion``.
+"""
+from __future__ import annotations
+class MessyTableError(Exception):
+    """Base class for every error raised by messy-table.
+    Catch this type to handle the whole family with a single ``except``.
+    """
+class UnsupportedFormatError(MessyTableError):
+    """Raised when the input is not a format messy-table can read.
+    Covers unknown extensions, corrupt archives, and inputs that trip a safety
+    guard (for example an ``.xlsx`` that decompresses far beyond its packed size,
+    which is the classic decompression-bomb shape).
+    Subclasses :class:`MessyTableError`, so it is caught by handlers for the
+    base class.
+    """
+class AmbiguityError(MessyTableError):
+    """Raised in ``strict`` mode when a heuristic cannot decide confidently.
+    In permissive mode the same situation is recorded as a low-confidence
+    :class:`~messy_table.report.Issue` instead of raising. The ``suggestion`` is
+    always a copy-pasteable hint for the ``Config`` field that resolves it.
+    """
+    def __init__(self, message: str, *, suggestion: str | None = None) -> None:
+        super().__init__(message)
+        self.suggestion = suggestion
+    def __str__(self) -> str:
+        base = super().__str__()
+        if self.suggestion:
+            return f"{base}\n  → resolve with: {self.suggestion}"
+        return base