md-spreadsheet-parser 1.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ b/md_spreadsheet_parser/pydantic_adapter.py
@@ -0,0 +1,121 @@
+ import json
+ from typing import Type, get_origin
+ from pydantic import BaseModel, ValidationError as PydanticValidationError
+
+ from .schemas import ConversionSchema
+ from .models import Table
+ from .validation import TableValidationError
+ from .utils import normalize_header
+
+
+ def validate_table_pydantic(
+     table: Table,
+     schema_cls: Type[BaseModel],
+     conversion_schema: ConversionSchema,
+ ) -> list[BaseModel]:
+     """
+     Validates a Table using Pydantic.
+     """
+     # Map headers to fields, checking aliases (Pydantic v2 stores the
+     # alias on FieldInfo).
+     model_fields = schema_cls.model_fields
+     header_map: dict[int, str] = {}  # column_index -> field_name
+
+     # Pre-compute a lookup of normalized field names and aliases:
+     # normalized string -> key to use in the row dict.
+     lookup_map = {}
+
+     for name, field_info in model_fields.items():
+         # By default Pydantic expects the alias (when one is defined)
+         # unless populate_by_name=True is set. Since the model config is
+         # not easily inspectable here, register both: the alias takes
+         # priority, and the field name is only added when it does not
+         # collide with an existing alias entry.
+         if field_info.alias:
+             lookup_map[normalize_header(field_info.alias)] = field_info.alias
+         if normalize_header(name) not in lookup_map:
+             lookup_map[normalize_header(name)] = name
+
+     normalized_headers = [normalize_header(h) for h in (table.headers or [])]
+
+     for idx, header in enumerate(normalized_headers):
+         if header in lookup_map:
+             header_map[idx] = lookup_map[header]
+
+     results = []
+     errors = []
+
+     for row_idx, row in enumerate(table.rows):
+         row_data = {}
+         for col_idx, cell_value in enumerate(row):
+             if col_idx in header_map:
+                 target_key = header_map[col_idx]
+
+                 # Check for a field-specific converter first (library-level override)
+                 if target_key in conversion_schema.field_converters:
+                     converter = conversion_schema.field_converters[target_key]
+                     try:
+                         row_data[target_key] = converter(cell_value)
+                     except Exception as e:
+                         errors.append(
+                             f"Row {row_idx + 1}: Column '{target_key}' conversion failed: {e}"
+                         )
+                 elif cell_value.strip() == "":
+                     row_data[target_key] = None
+                 else:
+                     # JSON pre-parsing: if the target field's type is dict or
+                     # list, try parsing the cell as JSON before handing it to
+                     # Pydantic (v2 stores the type on FieldInfo.annotation).
+                     val_to_set = cell_value
+
+                     # target_key may be the attribute name or the alias, but
+                     # model_fields is keyed by attribute name, so resolve the
+                     # attribute name before reading the annotation.
+                     target_field_name = None
+                     if target_key in model_fields:
+                         target_field_name = target_key
+                     else:
+                         for fname, f in model_fields.items():
+                             if f.alias == target_key:
+                                 target_field_name = fname
+                                 break
+
+                     if target_field_name:
+                         field_def = model_fields[target_field_name]
+                         ftype = field_def.annotation
+                         origin = get_origin(ftype)
+
+                         if ftype in (dict, list) or origin in (dict, list):
+                             try:
+                                 val_to_set = json.loads(cell_value)
+                             except json.JSONDecodeError:
+                                 # Fall back to the raw string and let
+                                 # Pydantic validation accept or reject it.
+                                 pass
+
+                     row_data[target_key] = val_to_set
+
+         try:
+             obj = schema_cls(**row_data)
+             results.append(obj)
+         except PydanticValidationError as e:
+             # Format Pydantic errors nicely
+             for err in e.errors():
+                 loc = ".".join(map(str, err["loc"]))
+                 msg = err["msg"]
+                 errors.append(f"Row {row_idx + 1}: Field '{loc}' - {msg}")
+
+     if errors:
+         raise TableValidationError(errors)
+
+     return results
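+
+
+ # Illustrative sketch of direct adapter use (normally reached through
+ # validate_table); Product and `table` are hypothetical:
+ #
+ #     class Product(BaseModel):
+ #         name: str = Field(alias="Product Name")   # pydantic.Field
+ #         price: float
+ #
+ #     products = validate_table_pydantic(table, Product, ConversionSchema())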
--- /dev/null
+++ b/md_spreadsheet_parser/schemas.py
@@ -0,0 +1,138 @@
+ from dataclasses import dataclass, field
+ from typing import Callable, Any
+
+
+ @dataclass(frozen=True)
+ class ParsingSchema:
+     """
+     Configuration for parsing markdown tables.
+     Designed to be immutable and passed to pure functions.
+
+     Attributes:
+         column_separator (str): Character used to separate columns. Defaults to "|".
+         header_separator_char (str): Character used in the separator row. Defaults to "-".
+         require_outer_pipes (bool): Whether tables must have outer pipes (e.g. `| col |`). Defaults to True.
+         strip_whitespace (bool): Whether to strip whitespace from cell values. Defaults to True.
+         convert_br_to_newline (bool): Whether to convert `<br>` tags in cells to newlines. Defaults to True.
+     """
+
+     column_separator: str = "|"
+     header_separator_char: str = "-"
+     require_outer_pipes: bool = True
+     strip_whitespace: bool = True
+     convert_br_to_newline: bool = True
+
+
+ # Default schema for standard Markdown tables (GFM style)
+ DEFAULT_SCHEMA = ParsingSchema()
+
+
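+ # Illustrative sketch (hypothetical configuration): accept tables written
+ # without outer pipes, e.g. `col1 | col2`:
+ #
+ #     loose_schema = ParsingSchema(require_outer_pipes=False)
+
+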
+ @dataclass(frozen=True)
+ class MultiTableParsingSchema(ParsingSchema):
+     """
+     Configuration for parsing multiple tables (workbook mode).
+     Inherits from ParsingSchema.
+
+     Attributes:
+         root_marker (str): The marker indicating the start of the data section. Defaults to "# Tables".
+         sheet_header_level (int): The markdown header level for sheets. Defaults to 2 (e.g. `## Sheet`).
+         table_header_level (int | None): The markdown header level for tables. If None, table names are not extracted. Defaults to 3.
+         capture_description (bool): Whether to capture text between the table header and the table as a description. Defaults to True.
+     """
+
+     root_marker: str = "# Tables"
+     sheet_header_level: int = 2
+     table_header_level: int | None = 3
+     capture_description: bool = True
+
+     def __post_init__(self):
+         if self.capture_description and self.table_header_level is None:
+             raise ValueError(
+                 "capture_description=True requires table_header_level to be set"
+             )
+
+
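+ # Illustrative sketch (hypothetical): skip descriptions, in which case
+ # table_header_level may also be None:
+ #
+ #     flat = MultiTableParsingSchema(table_header_level=None,
+ #                                    capture_description=False)
+
+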
+ @dataclass(frozen=True)
+ class ConversionSchema:
+     """
+     Configuration for converting string values to Python types.
+
+     Attributes:
+         boolean_pairs: Pairs of strings representing (True, False). Case-insensitive.
+             Example: `(("yes", "no"), ("on", "off"))`.
+         custom_converters: Dictionary mapping ANY Python type to a conversion function `str -> Any`.
+             You can specify:
+             - Built-in types: `int`, `float`, `bool` (to override default behavior)
+             - Standard library types: `Decimal`, `datetime`, `date`, `ZoneInfo`
+             - Custom classes: `MyClass`, `Product`
+         field_converters: Dictionary mapping field names (str) to conversion functions.
+             Takes precedence over `custom_converters`.
+     """
+
+     boolean_pairs: tuple[tuple[str, str], ...] = (
+         ("true", "false"),
+         ("yes", "no"),
+         ("1", "0"),
+         ("on", "off"),
+     )
+     custom_converters: dict[type, Callable[[str], Any]] = field(default_factory=dict)
+     field_converters: dict[str, Callable[[str], Any]] = field(default_factory=dict)
+
+
+ DEFAULT_CONVERSION_SCHEMA = ConversionSchema()
+
+
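+ # Illustrative sketch (hypothetical converters): parse prices as Decimal
+ # and a `tags` column as a list of strings:
+ #
+ #     from decimal import Decimal
+ #     schema = ConversionSchema(
+ #         custom_converters={Decimal: Decimal},
+ #         field_converters={"tags": lambda s: s.split(",")},
+ #     )
+
+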
+ @dataclass(frozen=True)
+ class ExcelParsingSchema:
+     """
+     Configuration for parsing Excel-exported data (TSV/CSV or openpyxl).
+
+     Attributes:
+         header_rows: Number of header rows (1 or 2).
+             If 2, headers are flattened to "Parent - Child" format.
+         fill_merged_headers: Whether to forward-fill empty header cells
+             (for merged cells in Excel exports).
+         delimiter: Column separator for TSV/CSV parsing. Default is tab.
+         header_separator: Separator used when flattening 2-row headers.
+     """
+
+     header_rows: int = 1
+     fill_merged_headers: bool = True
+     delimiter: str = "\t"
+     header_separator: str = " - "
+
+     def __post_init__(self):
+         if self.header_rows not in (1, 2):
+             raise ValueError("header_rows must be 1 or 2")
+
+
+ DEFAULT_EXCEL_SCHEMA = ExcelParsingSchema()
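+
+
+ # Illustrative sketch (hypothetical): a CSV export with a two-row header,
+ # flattening e.g. "Address" / "City" into "Address - City":
+ #
+ #     csv_schema = ExcelParsingSchema(header_rows=2, delimiter=",")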
--- /dev/null
+++ b/md_spreadsheet_parser/utils.py
@@ -0,0 +1,11 @@
+ def normalize_header(header: str) -> str:
+     """
+     Normalizes a header string to match field names (lowercase, snake_case).
+     Example: "User Name" -> "user_name"
+     """
+     # Strip first so surrounding whitespace does not become underscores.
+     return header.strip().lower().replace(" ", "_")
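+
+ # Illustrative behavior (assuming only the rules above):
+ #     normalize_header(" User Name ")  -> "user_name"
+ #     normalize_header("Price (USD)")  -> "price_(usd)"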
--- /dev/null
+++ b/md_spreadsheet_parser/validation.py
@@ -0,0 +1,368 @@
+ import json
+ import types
+ from dataclasses import fields, is_dataclass
+ from typing import TYPE_CHECKING, Any, Type, TypeVar, Union, get_args, get_origin, is_typeddict
+
+ if TYPE_CHECKING:
+     from .models import Table
+ from .schemas import DEFAULT_CONVERSION_SCHEMA, ConversionSchema
+ from .utils import normalize_header
+
+ T = TypeVar("T")
+
+
+ class TableValidationError(Exception):
+     """
+     Exception raised when table validation fails.
+     Contains a list of errors found during validation.
+     """
+
+     def __init__(self, errors: list[str]):
+         self.errors = errors
+         super().__init__(
+             f"Validation failed with {len(errors)} errors:\n" + "\n".join(errors)
+         )
+
+
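+ # Illustrative sketch: callers can surface every collected error at once
+ # (MySchema and table are hypothetical):
+ #
+ #     try:
+ #         rows = validate_table(table, MySchema)
+ #     except TableValidationError as exc:
+ #         print("\n".join(exc.errors))
+
+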
+ def _convert_value(
+     value: str, target_type: Type, schema: ConversionSchema = DEFAULT_CONVERSION_SCHEMA
+ ) -> Any:
+     """
+     Converts a string value to the target type.
+     Supports int, float, bool, str, dict/list (via JSON), and Optional types.
+     """
+     # Check custom converters first
+     if target_type in schema.custom_converters:
+         return schema.custom_converters[target_type](value)
+
+     origin = get_origin(target_type)
+     args = get_args(target_type)
+
+     # Handle Optional[T]: both typing.Union and the PEP 604 `X | None` form
+     if origin is Union or origin is types.UnionType:
+         if type(None) in args:
+             if not value.strip():
+                 return None
+             # Recurse into the first non-None member type
+             for arg in args:
+                 if arg is not type(None):
+                     return _convert_value(value, arg, schema)
+
+     # Handle basic types
+     if target_type is int:
+         if not value.strip():
+             raise ValueError("Empty value for int field")
+         return int(value)
+
+     if target_type is float:
+         if not value.strip():
+             raise ValueError("Empty value for float field")
+         return float(value)
+
+     if target_type is bool:
+         lower_val = value.lower().strip()
+         for true_val, false_val in schema.boolean_pairs:
+             if lower_val == true_val.lower():
+                 return True
+             if lower_val == false_val.lower():
+                 return False
+
+         raise ValueError(f"Invalid boolean value: '{value}'")
+
+     if target_type is str:
+         return value
+
+     # JSON parsing for dict/list targets; covers bare dict/list as well
+     # as parameterized forms such as dict[str, Any] or list[int].
+     if origin in (dict, list) or target_type in (dict, list):
+         if not value.strip():
+             # An empty string is not valid JSON; for user friendliness,
+             # treat it as an empty container (Optional fields already
+             # returned None above).
+             if origin:
+                 return origin()  # type: ignore
+             return target_type()  # type: ignore
+         try:
+             return json.loads(value)
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Invalid JSON for {target_type}: {e}")
+
+     # Fallback for other types (or if the type hint is missing)
+     return value
+
+
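+ # Illustrative behavior of the helper above (default ConversionSchema):
+ #
+ #     _convert_value("42", int)          -> 42
+ #     _convert_value("", int | None)     -> None
+ #     _convert_value("yes", bool)        -> True
+ #     _convert_value('{"a": 1}', dict)   -> {"a": 1}
+
+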
+ # --- Pydantic Support (Optional) ---
+
+ try:
+     from pydantic import BaseModel
+
+     HAS_PYDANTIC = True
+ except ImportError:
+     HAS_PYDANTIC = False
+     BaseModel = object  # type: ignore
+
+
+ def _validate_table_dataclass(
+     table: "Table",
+     schema_cls: Type[T],
+     conversion_schema: ConversionSchema,
+ ) -> list[T]:
+     """
+     Validates a Table using standard dataclasses.
+     """
+     # Map headers to fields
+     cls_fields = {f.name: f for f in fields(schema_cls)}  # type: ignore
+     header_map: dict[int, str] = {}  # column_index -> field_name
+
+     normalized_headers = [normalize_header(h) for h in (table.headers or [])]
+
+     for idx, header in enumerate(normalized_headers):
+         if header in cls_fields:
+             header_map[idx] = header
+
+     # Process rows
+     results: list[T] = []
+     errors: list[str] = []
+
+     for row_idx, row in enumerate(table.rows):
+         row_data = {}
+         row_errors = []
+
+         for col_idx, cell_value in enumerate(row):
+             if col_idx in header_map:
+                 field_name = header_map[col_idx]
+                 field_def = cls_fields[field_name]
+
+                 try:
+                     # Check for a field-specific converter first
+                     if field_name in conversion_schema.field_converters:
+                         converter = conversion_schema.field_converters[field_name]
+                         converted_value = converter(cell_value)
+                     else:
+                         converted_value = _convert_value(
+                             cell_value,
+                             field_def.type,  # type: ignore
+                             conversion_schema,
+                         )
+                     row_data[field_name] = converted_value
+                 except ValueError as e:
+                     row_errors.append(f"Column '{field_name}': {str(e)}")
+                 except Exception:
+                     row_errors.append(
+                         f"Column '{field_name}': Failed to convert '{cell_value}' to {field_def.type}"
+                     )
+
+         if row_errors:
+             for err in row_errors:
+                 errors.append(f"Row {row_idx + 1}: {err}")
+             continue
+
+         try:
+             obj = schema_cls(**row_data)
+             results.append(obj)
+         except TypeError as e:
+             # This catches missing required arguments
+             errors.append(f"Row {row_idx + 1}: {str(e)}")
+
+     if errors:
+         raise TableValidationError(errors)
+
+     return results
+
+
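+ # Illustrative sketch (hypothetical schema; reached via validate_table):
+ #
+ #     @dataclass
+ #     class User:
+ #         name: str
+ #         age: int | None = None
+ #
+ #     users = validate_table(table, User)
+
+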
+ def _validate_table_typeddict(
+     table: "Table",
+     schema_cls: Type[T],
+     conversion_schema: ConversionSchema,
+ ) -> list[T]:
+     """
+     Validates a Table using TypedDict.
+     """
+     # Type hints come from __annotations__; required/optional key tracking
+     # (__required_keys__ / __optional_keys__) is not enforced here.
+     annotations = schema_cls.__annotations__
+
+     header_map: dict[int, str] = {}
+     normalized_headers = [normalize_header(h) for h in (table.headers or [])]
+
+     # Map headers to TypedDict keys by normalized name; TypedDict has no
+     # native alias support, so a simple normalized match suffices.
+     for idx, header in enumerate(normalized_headers):
+         for key in annotations:
+             if normalize_header(key) == header:
+                 header_map[idx] = key
+                 break
+
+     results: list[T] = []
+     errors: list[str] = []
+
+     for row_idx, row in enumerate(table.rows):
+         row_data = {}
+         row_errors = []
+
+         for col_idx, cell_value in enumerate(row):
+             if col_idx in header_map:
+                 key = header_map[col_idx]
+                 target_type = annotations[key]
+
+                 try:
+                     if key in conversion_schema.field_converters:
+                         converter = conversion_schema.field_converters[key]
+                         converted_value = converter(cell_value)
+                     else:
+                         converted_value = _convert_value(
+                             cell_value, target_type, conversion_schema
+                         )
+                     row_data[key] = converted_value
+                 except Exception as e:
+                     row_errors.append(f"Column '{key}': {str(e)}")
+
+         if row_errors:
+             for err in row_errors:
+                 errors.append(f"Row {row_idx + 1}: {err}")
+             continue
+
+         # A TypedDict is a plain dict at runtime, so instantiation cannot
+         # fail; required-key enforcement is left to static type checkers.
+         results.append(row_data)  # type: ignore
+
+     if errors:
+         raise TableValidationError(errors)
+
+     return results
+
+
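+ # Illustrative sketch (hypothetical schema):
+ #
+ #     class UserRow(TypedDict):
+ #         name: str
+ #         age: int
+ #
+ #     rows = validate_table(table, UserRow)   # -> list of plain dicts
+
+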
+ def _validate_table_dict(
+     table: "Table",
+     conversion_schema: ConversionSchema,
+ ) -> list[dict[str, Any]]:
+     """
+     Converts a Table to a list of dicts.
+     Keys are derived from headers.
+     """
+     # Keys keep the original header strings (users asking for plain dicts
+     # usually want the headers verbatim); normalized names are used only
+     # for field_converter lookups.
+     results = []
+
+     for row in table.rows:
+         row_data = {}
+         for idx, cell_value in enumerate(row):
+             if table.headers and idx < len(table.headers):
+                 original_header = table.headers[idx]
+                 key_for_conversion = normalize_header(original_header)
+
+                 # Check converters
+                 if key_for_conversion in conversion_schema.field_converters:
+                     converter = conversion_schema.field_converters[key_for_conversion]
+                     try:
+                         val = converter(cell_value)
+                     except Exception:
+                         # For raw dicts, fall back to the unconverted
+                         # string instead of raising.
+                         val = cell_value
+                 else:
+                     val = cell_value
+
+                 row_data[original_header] = val
+         results.append(row_data)
+
+     return results
+
+
+ def validate_table(
+     table: "Table",
+     schema_cls: Type[T],
+     conversion_schema: ConversionSchema = DEFAULT_CONVERSION_SCHEMA,
+ ) -> list[T]:
+     """
+     Validates a Table object against a dataclass, Pydantic model,
+     TypedDict, or plain dict schema.
+
+     Args:
+         table: The Table object to validate.
+         schema_cls: The dataclass, Pydantic model, TypedDict, or dict type to validate against.
+         conversion_schema: Configuration for type conversion.
+
+     Returns:
+         list[T]: A list of validated instances.
+
+     Raises:
+         ValueError: If schema_cls is not a supported schema type.
+         TableValidationError: If the table has no headers or validation fails.
+     """
+     if not table.headers:
+         raise TableValidationError(["Table has no headers"])
+
+     # Check for Pydantic model
+     if HAS_PYDANTIC and issubclass(schema_cls, BaseModel):
+         # Import the adapter lazily so pydantic remains an optional dependency
+         from .pydantic_adapter import validate_table_pydantic
+
+         return validate_table_pydantic(table, schema_cls, conversion_schema)  # type: ignore
+
+     # Check for dataclass
+     if is_dataclass(schema_cls):
+         return _validate_table_dataclass(table, schema_cls, conversion_schema)
+
+     # Check for TypedDict
+     if is_typeddict(schema_cls):
+         return _validate_table_typeddict(table, schema_cls, conversion_schema)
+
+     # Check for plain dict
+     if schema_cls is dict:
+         return _validate_table_dict(table, conversion_schema)  # type: ignore
+
+     raise ValueError(
+         f"{schema_cls} must be a dataclass, Pydantic model, TypedDict, or dict"
+     )
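+
+
+ # Illustrative end-to-end sketch (hypothetical names; parse_table is
+ # assumed to be this package's markdown parsing entry point):
+ #
+ #     from dataclasses import dataclass
+ #
+ #     @dataclass
+ #     class Item:
+ #         name: str
+ #         qty: int
+ #
+ #     table = parse_table("| Name | Qty |\n| --- | --- |\n| apple | 3 |")
+ #     items = validate_table(table, Item)   # [Item(name='apple', qty=3)]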