md-spreadsheet-parser 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
"""Package namespace: re-exports the public API and declares ``__all__``."""

# Core Markdown-table parsing entry points.
from .parsing import (
    parse_table,
    parse_sheet,
    parse_workbook,
    scan_tables,
)
# File-based convenience wrappers around the parsing functions.
from .loader import (
    parse_table_from_file,
    parse_workbook_from_file,
    scan_tables_from_file,
    scan_tables_iter,
)
# Parsing configuration objects and their defaults.
from .schemas import (
    ParsingSchema,
    DEFAULT_SCHEMA,
    MultiTableParsingSchema,
    ConversionSchema,
    DEFAULT_CONVERSION_SCHEMA,
    ExcelParsingSchema,
    DEFAULT_EXCEL_SCHEMA,
)
# Data model containers produced by the parsers.
from .models import (
    Table,
    Sheet,
    Workbook,
)
from .validation import TableValidationError
# Markdown generation (the inverse of parsing).
from .generator import (
    generate_table_markdown,
    generate_sheet_markdown,
    generate_workbook_markdown,
)
# Excel/TSV/CSV ingestion helpers.
from .excel import (
    parse_excel,
    parse_excel_text,
)

__all__ = [
    "parse_table",
    "parse_sheet",
    "parse_workbook",
    "scan_tables",
    "parse_table_from_file",
    "parse_workbook_from_file",
    "scan_tables_from_file",
    "scan_tables_iter",
    "ParsingSchema",
    "MultiTableParsingSchema",
    "ConversionSchema",
    "ExcelParsingSchema",
    "Table",
    "Sheet",
    "Workbook",
    "DEFAULT_SCHEMA",
    "DEFAULT_CONVERSION_SCHEMA",
    "DEFAULT_EXCEL_SCHEMA",
    "TableValidationError",
    "generate_table_markdown",
    "generate_sheet_markdown",
    "generate_workbook_markdown",
    "parse_excel",
    "parse_excel_text",
    "converters",
]

# NOTE(review): imported after __all__ rather than with the imports above —
# presumably to avoid a circular import with the submodules; confirm before
# reordering.
from . import converters
@@ -0,0 +1,128 @@
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from .parsing import parse_workbook, scan_tables
7
+ from .schemas import MultiTableParsingSchema
8
+
9
+
10
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the md-spreadsheet-parser tool."""
    parser = argparse.ArgumentParser(
        description="Parse Markdown tables to JSON.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "file",
        nargs="?",
        type=str,
        help="Path to Markdown file. If omitted or '-', reads from stdin.",
    )
    parser.add_argument(
        "--scan",
        action="store_true",
        help="Scan for all tables ignoring workbook structure (uses scan_tables).",
    )
    parser.add_argument(
        "--root-marker",
        type=str,
        default="# Tables",
        help="Marker indicating start of data section (for workbook mode).",
    )
    parser.add_argument(
        "--sheet-header-level",
        type=int,
        default=2,
        help="Header level for sheets (for workbook mode).",
    )
    parser.add_argument(
        "--table-header-level",
        type=int,
        default=None,
        help="Header level for tables.",
    )
    parser.add_argument(
        "--capture-description",
        action="store_true",
        help="Capture text between header and table as description. Requires --table-header-level.",
    )
    parser.add_argument(
        "--column-separator",
        type=str,
        default="|",
        help="Character used to separate columns.",
    )
    parser.add_argument(
        "--header-separator-char",
        type=str,
        default="-",
        help="Character used in the separator row.",
    )
    parser.add_argument(
        "--no-outer-pipes",
        action="store_false",
        dest="require_outer_pipes",
        help="Allow tables without outer pipes.",
    )
    parser.add_argument(
        "--no-strip-whitespace",
        action="store_false",
        dest="strip_whitespace",
        help="Do not strip whitespace from cell values.",
    )
    parser.add_argument(
        "--no-br-conversion",
        action="store_false",
        dest="convert_br_to_newline",
        help="Disable automatic conversion of <br> tags to newlines.",
    )
    return parser


def _read_content(file_arg: str | None) -> str:
    """Return Markdown text from *file_arg*, or stdin when it is None or '-'.

    Prints an error to stderr and exits with status 1 if the file cannot
    be read.
    """
    if file_arg and file_arg != "-":
        try:
            return Path(file_arg).read_text(encoding="utf-8")
        except FileNotFoundError:
            print(f"Error: File '{file_arg}' not found.", file=sys.stderr)
            sys.exit(1)
        except OSError as e:
            # Previously only FileNotFoundError was handled, so permission
            # or other I/O errors surfaced as raw tracebacks.
            print(f"Error: Cannot read file '{file_arg}': {e}", file=sys.stderr)
            sys.exit(1)
    return sys.stdin.read()


def main():
    """CLI entry point: parse Markdown tables and print them as JSON."""
    args = _build_parser().parse_args()

    # --capture-description is meaningless without a table header level.
    if args.capture_description and args.table_header_level is None:
        print(
            "Error: --capture-description requires --table-header-level to be set.",
            file=sys.stderr,
        )
        sys.exit(1)

    content = _read_content(args.file)

    # Map CLI flags onto the parsing schema.
    schema = MultiTableParsingSchema(
        root_marker=args.root_marker,
        sheet_header_level=args.sheet_header_level,
        table_header_level=args.table_header_level,
        capture_description=args.capture_description,
        column_separator=args.column_separator,
        header_separator_char=args.header_separator_char,
        require_outer_pipes=args.require_outer_pipes,
        strip_whitespace=args.strip_whitespace,
        convert_br_to_newline=args.convert_br_to_newline,
    )

    try:
        if args.scan:
            # Flat scan: emit a JSON array of every table found.
            tables = scan_tables(content, schema)
            print(json.dumps([t.json for t in tables], indent=2, ensure_ascii=False))
        else:
            # Workbook mode: emit the structured workbook as one JSON object.
            workbook = parse_workbook(content, schema)
            print(json.dumps(workbook.json, indent=2, ensure_ascii=False))
    except Exception as e:
        print(f"Error parsing content: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,126 @@
1
from decimal import Decimal
from typing import Callable, Iterable
from datetime import datetime
import re

# zoneinfo has been in the standard library since Python 3.9, and this module
# already uses 3.10+ syntax (``str | None``); the previous try/except fallback
# re-imported the very same module on ImportError, so it could never help.
from zoneinfo import ZoneInfo
13
+
14
+
15
def to_decimal_clean(value: str) -> Decimal:
    """Parse a human-formatted number string into a Decimal.

    Currency symbols ('$', '¥', '€', '£') and grouping characters
    (',', ' ', '_') are stripped first, so input like "$1,234.50"
    yields Decimal("1234.50").

    Raises:
        ValueError: If nothing remains after stripping (e.g. the input
            was only a currency symbol).
    """
    cleaned = value.translate(str.maketrans("", "", " $¥€£,_"))
    if not cleaned:
        # Empty cells are normally mapped to None by schema validation
        # before a converter runs; reaching this branch means the raw
        # text contained only formatting characters (e.g. just "$").
        raise ValueError(f"Cannot convert '{value}' to Decimal")
    return Decimal(cleaned)
29
+
30
+
31
+ def make_datetime_converter(
32
+ fmt: str | None = None, tz: ZoneInfo | None = None
33
+ ) -> Callable[[str], datetime]:
34
+ """
35
+ Create a converter function for datetime.
36
+
37
+ Args:
38
+ fmt: str format for strptime. If None, uses datetime.fromisoformat().
39
+ tz: ZoneInfo to attach (if naive) or convert to (if aware).
40
+
41
+ Returns:
42
+ Function that accepts a string and returns a datetime.
43
+ """
44
+
45
+ def converter(value: str) -> datetime:
46
+ value = value.strip()
47
+ if fmt:
48
+ dt = datetime.strptime(value, fmt)
49
+ else:
50
+ dt = datetime.fromisoformat(value)
51
+
52
+ if tz:
53
+ if dt.tzinfo is None:
54
+ # Attach timezone if naive
55
+ dt = dt.replace(tzinfo=tz)
56
+ else:
57
+ # Convert timezone if aware
58
+ dt = dt.astimezone(tz)
59
+ return dt
60
+
61
+ return converter
62
+
63
+
64
def make_list_converter(
    separator: str = ",", strip_items: bool = True, distinct: bool = False
) -> Callable[[str], list[str]]:
    """Build a string-to-list converter.

    Args:
        separator: Delimiter used to split the input. Default ",".
        strip_items: Strip whitespace around each item. Default True.
        distinct: Drop duplicate items, keeping first-seen order. Default False.

    Returns:
        A callable mapping a string to a list of strings; empty input
        yields an empty list.
    """

    def convert(value: str) -> list[str]:
        if not value:
            return []
        items = value.split(separator)
        if strip_items:
            items = [item.strip() for item in items]
        if distinct:
            # dict preserves insertion order, so this dedups in order.
            items = list(dict.fromkeys(items))
        return items

    return convert
99
+
100
+
101
def make_bool_converter(
    true_values: Iterable[str] = ("true", "yes", "1", "on"),
    false_values: Iterable[str] = ("false", "no", "0", "off"),
) -> Callable[[str], bool]:
    """Build a strict string-to-bool converter.

    Matching is case-insensitive and ignores surrounding whitespace;
    anything outside both sets raises ValueError.

    Args:
        true_values: Case-insensitive strings treated as True.
        false_values: Case-insensitive strings treated as False.

    Returns:
        A callable returning bool, or raising ValueError on no match.
    """
    truthy = frozenset(v.lower() for v in true_values)
    falsy = frozenset(v.lower() for v in false_values)

    def convert(value: str) -> bool:
        token = value.strip().lower()
        if token in truthy:
            return True
        if token in falsy:
            return False
        raise ValueError(f"Invalid boolean value: '{value}'")

    return convert
@@ -0,0 +1,183 @@
1
+ """
2
+ Excel/TSV/CSV parsing module with merged cell and hierarchical header support.
3
+
4
+ This module provides functions to parse tabular data exported from Excel
5
+ (TSV/CSV format) or directly from openpyxl Worksheets, handling:
6
+ - Merged cells (forward-fill empty cells)
7
+ - Hierarchical headers (flatten to "Parent - Child" format)
8
+ """
9
+
10
+ import csv
11
+ import io
12
+ from typing import Any, TYPE_CHECKING, Union
13
+
14
+ from .models import Table
15
+ from .schemas import ExcelParsingSchema, DEFAULT_EXCEL_SCHEMA
16
+
17
+ # --- Optional openpyxl support ---
18
+ if TYPE_CHECKING:
19
+ from openpyxl.worksheet.worksheet import Worksheet
20
+
21
+ # Type alias for parse_excel source parameter
22
+ # Note: Worksheet is only available at runtime if openpyxl is installed
23
+ ExcelSource = Union[str, list[list[str]], "Worksheet"]
24
+
25
+ try:
26
+ import openpyxl
27
+
28
+ HAS_OPENPYXL = True
29
+ except ImportError:
30
+ HAS_OPENPYXL = False
31
+ openpyxl = None # type: ignore
32
+
33
+
34
+ def _parse_tsv(text: str, delimiter: str) -> list[list[str]]:
35
+ """Parse TSV/CSV text into a 2D list using Python's csv module."""
36
+ reader = csv.reader(io.StringIO(text), delimiter=delimiter)
37
+ return list(reader)
38
+
39
+
40
+ def _forward_fill(row: list[str]) -> list[str]:
41
+ """Fill empty cells with the previous non-empty value (left-to-right)."""
42
+ result = []
43
+ prev = ""
44
+ for cell in row:
45
+ if cell.strip():
46
+ prev = cell
47
+ result.append(prev)
48
+ return result
49
+
50
+
51
+ def _flatten_headers(
52
+ parent_row: list[str], child_row: list[str], separator: str
53
+ ) -> list[str]:
54
+ """
55
+ Flatten 2-row headers into single row.
56
+ Format: "Parent - Child" if Parent differs from Child, else just Child.
57
+ """
58
+ headers = []
59
+ max_len = max(len(parent_row), len(child_row))
60
+
61
+ for i in range(max_len):
62
+ parent = parent_row[i] if i < len(parent_row) else ""
63
+ child = child_row[i] if i < len(child_row) else ""
64
+
65
+ if parent and child and parent != child:
66
+ headers.append(f"{parent}{separator}{child}")
67
+ else:
68
+ headers.append(child if child else parent)
69
+
70
+ return headers
71
+
72
+
73
+ def _safe_str(value: Any) -> str:
74
+ """
75
+ Convert value to string, handling None and integer-floats cleanly.
76
+ """
77
+ if value is None:
78
+ return ""
79
+ if isinstance(value, float) and value.is_integer():
80
+ return str(int(value))
81
+ return str(value)
82
+
83
+
84
def parse_excel_text(
    rows: list[list[str]],
    schema: ExcelParsingSchema = DEFAULT_EXCEL_SCHEMA,
) -> Table:
    """
    Build a Table from a 2D string grid, applying merged-cell fill and
    hierarchical-header flattening according to *schema*.

    Args:
        rows: 2D list of strings (e.g., from csv.reader or worksheet iteration).
        schema: Configuration for header processing.

    Returns:
        Table object with processed headers and stringified data rows.

    Raises:
        ValueError: If schema.header_rows is neither 1 nor 2.
    """
    if not rows:
        return Table(headers=None, rows=[])

    if schema.header_rows == 1:
        headers = _forward_fill(rows[0]) if schema.fill_merged_headers else rows[0]
        body = rows[1:]
    elif schema.header_rows == 2:
        if len(rows) < 2:
            # A single row cannot supply a two-level header.
            return Table(headers=rows[0] if rows else None, rows=[])
        # Only the parent row is forward-filled (merged parent spans);
        # the child row is used as-is.
        top = _forward_fill(rows[0]) if schema.fill_merged_headers else rows[0]
        headers = _flatten_headers(top, rows[1], schema.header_separator)
        body = rows[2:]
    else:
        # Schema validation should prevent this; kept as a hard failure.
        raise ValueError(f"Invalid header_rows: {schema.header_rows}")

    # Normalize every data cell to str (worksheet values may be numbers/None).
    normalized = [[_safe_str(cell) for cell in row] for row in body]
    return Table(headers=headers, rows=normalized)
132
+
133
+
134
def parse_excel(
    source: ExcelSource,
    schema: ExcelParsingSchema = DEFAULT_EXCEL_SCHEMA,
) -> Table:
    """
    Parse Excel data from a worksheet, delimited text, or a 2D array.

    Args:
        source: One of:
            - openpyxl.Worksheet (if openpyxl is installed)
            - str: TSV/CSV text content
            - list[list[str]]: Pre-parsed 2D array
        schema: Configuration for parsing.

    Returns:
        Table object with processed headers and data.

    Raises:
        TypeError: If source type is not supported.
    """
    # Worksheet detection is duck-typed on iter_rows (only when openpyxl exists).
    if HAS_OPENPYXL and hasattr(source, "iter_rows"):
        worksheet: Any = source
        grid = [
            [_safe_str(cell) for cell in record]
            for record in worksheet.iter_rows(values_only=True)
        ]
        return parse_excel_text(grid, schema)

    if isinstance(source, str):
        # Raw TSV/CSV text.
        return parse_excel_text(_parse_tsv(source, schema.delimiter), schema)

    if isinstance(source, list):
        # Trusted to already be list[list[str]].
        return parse_excel_text(source, schema)

    supported = "openpyxl.Worksheet, str, or list[list[str]]"
    if not HAS_OPENPYXL:
        supported = (
            "str or list[list[str]] (install openpyxl for Worksheet support)"
        )
    raise TypeError(
        f"Unsupported source type: {type(source).__name__}. Expected {supported}."
    )
@@ -0,0 +1,170 @@
1
+ import json
2
+ from typing import TYPE_CHECKING
3
+
4
+ from .schemas import DEFAULT_SCHEMA, MultiTableParsingSchema, ParsingSchema
5
+
6
+ if TYPE_CHECKING:
7
+ from .models import Sheet, Table, Workbook
8
+
9
+
10
def generate_table_markdown(
    table: "Table", schema: ParsingSchema = DEFAULT_SCHEMA
) -> str:
    """
    Generates a Markdown string representation of the table.

    Args:
        table: The Table object.
        schema (ParsingSchema, optional): Configuration for formatting.

    Returns:
        str: The Markdown string.
    """
    lines = []
    sep = f" {schema.column_separator} "

    def _wrap(row_str: str) -> str:
        # Surround a joined row with outer pipes when the schema requires them.
        # (Previously this three-line wrap was duplicated in three places.)
        if schema.require_outer_pipes:
            return f"{schema.column_separator} {row_str} {schema.column_separator}"
        return row_str

    def _prepare_cell(cell: str) -> str:
        # Re-encode embedded newlines as <br> so a cell stays on one line.
        if schema.convert_br_to_newline and "\n" in cell:
            return cell.replace("\n", "<br>")
        return cell

    def _separator_cell(alignment: str) -> str:
        # Alignment markers: ":---" left, "---:" right, ":---:" center.
        base = schema.header_separator_char * 3
        if alignment == "left":
            return ":" + base
        if alignment == "right":
            return base + ":"
        if alignment == "center":
            return ":" + base + ":"
        return base

    # Table name / description are only emitted for multi-table schemas.
    if isinstance(schema, MultiTableParsingSchema):
        if table.name and schema.table_header_level is not None:
            lines.append(f"{'#' * schema.table_header_level} {table.name}")
            lines.append("")  # Empty line after name
        if table.description and schema.capture_description:
            lines.append(table.description)
            lines.append("")  # Empty line after description

    if table.headers:
        lines.append(_wrap(sep.join(_prepare_cell(h) for h in table.headers)))

        separator_cells = []
        for i, _ in enumerate(table.headers):
            alignment = "default"
            if table.alignments and i < len(table.alignments):
                # Guard against None entries in the alignments list.
                alignment = table.alignments[i] or "default"
            separator_cells.append(_separator_cell(alignment))
        lines.append(_wrap(sep.join(separator_cells)))

    for row in table.rows:
        lines.append(_wrap(sep.join(_prepare_cell(cell) for cell in row)))

    # Visual metadata round-trips via an HTML comment after the table.
    if table.metadata and "visual" in table.metadata:
        metadata_json = json.dumps(table.metadata["visual"])
        lines.append("")
        lines.append(f"<!-- md-spreadsheet-table-metadata: {metadata_json} -->")

    return "\n".join(lines)
100
+
101
+
102
def generate_sheet_markdown(
    sheet: "Sheet", schema: ParsingSchema = DEFAULT_SCHEMA
) -> str:
    """
    Generates a Markdown string representation of the sheet.

    Args:
        sheet: The Sheet object.
        schema (ParsingSchema, optional): Configuration for formatting.

    Returns:
        str: The Markdown string.
    """
    is_multi = isinstance(schema, MultiTableParsingSchema)
    lines = []

    if is_multi:
        # Sheet title header, e.g. "## SheetName".
        lines.append(f"{'#' * schema.sheet_header_level} {sheet.name}")
        lines.append("")

    last = len(sheet.tables) - 1
    for index, table in enumerate(sheet.tables):
        lines.append(generate_table_markdown(table, schema))
        if index != last:
            # Blank line between consecutive tables.
            lines.append("")

    # Sheet metadata is appended at the end as an HTML comment.
    if is_multi and sheet.metadata:
        lines.append("")
        payload = json.dumps(sheet.metadata)
        lines.append(f"<!-- md-spreadsheet-sheet-metadata: {payload} -->")

    return "\n".join(lines)
134
+
135
+
136
def generate_workbook_markdown(
    workbook: "Workbook", schema: MultiTableParsingSchema
) -> str:
    """
    Generates a Markdown string representation of the workbook.

    Args:
        workbook: The Workbook object.
        schema (MultiTableParsingSchema): Configuration for formatting.

    Returns:
        str: The Markdown string.
    """
    lines: list[str] = []

    if schema.root_marker:
        # Root marker (e.g. "# Tables") opens the data section.
        lines.extend([schema.root_marker, ""])

    last = len(workbook.sheets) - 1
    for index, sheet in enumerate(workbook.sheets):
        lines.append(generate_sheet_markdown(sheet, schema))
        if index != last:
            # Blank line between consecutive sheets.
            lines.append("")

    if workbook.metadata:
        # Keep one blank line between the last sheet and the metadata comment.
        if lines and lines[-1] != "":
            lines.append("")
        payload = json.dumps(workbook.metadata)
        lines.append(f"<!-- md-spreadsheet-workbook-metadata: {payload} -->")

    return "\n".join(lines)