PyPI - ultrasav - Versions diffs - 0.1.4__py3-none-any.whl - Mend

ultrasav 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

ultrasav/__init__.py +280 -0
ultrasav/_add_cases.py +227 -0
ultrasav/_data.py +513 -0
ultrasav/_make_dummy.py +137 -0
ultrasav/_merge_data.py +435 -0
ultrasav/_merge_meta.py +280 -0
ultrasav/_metadata.py +570 -0
ultrasav/_read_files.py +558 -0
ultrasav/_write_files.py +111 -0
ultrasav/metaman/__init__.py +91 -0
ultrasav/metaman/def_detect_variable_type.py +454 -0
ultrasav/metaman/def_get_meta.py +561 -0
ultrasav/metaman/def_make_datamap.py +127 -0
ultrasav/metaman/def_make_labels.py +833 -0
ultrasav/metaman/def_map_engine.py +529 -0
ultrasav/metaman/def_map_to_excel.py +294 -0
ultrasav/metaman/def_write_excel_engine.py +298 -0
ultrasav/metaman/pastel_color_schemes.py +185 -0
ultrasav-0.1.4.dist-info/METADATA +550 -0
ultrasav-0.1.4.dist-info/RECORD +21 -0
ultrasav-0.1.4.dist-info/WHEEL +4 -0

ultrasav/metaman/def_map_to_excel.py ADDED Viewed

@@ -0,0 +1,294 @@
+import polars as pl
+import xlsxwriter
+from typing import Any
+from pathlib import Path
+from .pastel_color_schemes import get_color_scheme
+from .def_write_excel_engine import write_excel_engine
+# version_2
+def map_to_excel(
+    df: pl.DataFrame,
+    file_path: str | Path,
+    merge_columns: list[str] | None = None,
+    column_widths: dict[str, int] | None = None,
+    header_format: dict[str, Any] | None = None,
+    column_formats: dict[str, dict[str, Any]] | None = None,
+    merge_format: dict[str, Any] | None = None,
+    group_border_format: dict[str, Any] | None = None,
+    alternating_row_colors: tuple[str, str] | None = None,
+    alternating_group_formats: tuple[dict[str, Any], dict[str, Any]] | None = None,
+    sheet_name: str = "Sheet1",
+    freeze_panes: tuple | None = (1, 0),
+) -> None:
+    """
+    Write survey metadata DataFrame to Excel with standardized formatting.
+    This is a convenience wrapper around write_excel_engine() with pre-configured
+    default formatting optimized for survey data maps and metadata. All parameters
+    can still be overridden if needed.
+    Parameters
+    ----------
+    df : pl.DataFrame
+        The DataFrame to write to Excel.
+    file_path : str or Path
+        Path to the output Excel file. Parent directories will be created if needed.
+    merge_columns : list of str, optional
+        Column names to merge. Defaults to ['variable', 'variable_label', 'variable_type'].
+        Pass an empty list [] to disable merging, or provide custom columns to override.
+    column_widths : dict of str to int, optional
+        Mapping of column names to widths in pixels. Defaults include optimized widths
+        for common survey metadata columns. User values override defaults.
+    header_format : dict, optional
+        xlsxwriter format properties for header row. Defaults to bold, 12pt font
+        with gray bottom border. User values override defaults.
+    column_formats : dict of str to dict, optional
+        Mapping of column names to xlsxwriter format dictionaries. Defaults include
+        centered formatting for value_code and percentage formatting for base_pct.
+        User values override defaults.
+    merge_format : dict, optional
+        xlsxwriter format properties for merged cells. Defaults to left-aligned
+        with vertical centering. User values override defaults.
+    group_border_format : dict, optional
+        Border format for merge group bottoms. Defaults to thick green border.
+        User values override defaults.
+    alternating_row_colors : tuple of (str, str), optional
+        Two colors to alternate between merge groups. Defaults to light and darker grey.
+        Pass None to disable alternating colors, or provide custom colors to override.
+        Note: If alternating_group_formats is provided, it takes precedence.
+    alternating_group_formats : tuple of (dict, dict), optional
+        Two complete format dictionaries to alternate between merge groups.
+        Provides full control over font color, borders, background, etc.
+        Takes precedence over alternating_row_colors. Pass None to use colors only.
+    sheet_name : str, default "Sheet1"
+        Name of the worksheet.
+    freeze_panes : tuple of (row, col), optional
+        Position to freeze panes. Default (1, 0) freezes the header row.
+    Examples
+    --------
+    Simple usage with all defaults:
+    >>> df = pl.DataFrame({
+    ...     "variable": ["Q1", "Q1", "Q2"],
+    ...     "variable_label": ["Age", "Age", "Gender"],
+    ...     "variable_type": ["single", "single", "single"],
+    ...     "value_code": [1, 2, 1],
+    ...     "value_label": ["18-25", "26+", "Male"]
+    ... })
+    >>> map_to_excel(df, "survey_map.xlsx")
+    Override with custom alternating formats:
+    >>> map_to_excel(
+    ...     df,
+    ...     "survey_map.xlsx",
+    ...     alternating_group_formats=(
+    ...         {"bg_color": "#F0F8FF", "font_color": "#00008B", "border": 1},
+    ...         {"bg_color": "#FFF0F5", "font_color": "#8B008B", "border": 1}
+    ...     )
+    ... )
+    Custom alternating colors only:
+    >>> map_to_excel(
+    ...     df,
+    ...     "survey_map.xlsx",
+    ...     alternating_row_colors=("#FFE6E6", "#FFCCCC")  # Light red alternating
+    ... )
+    Disable all alternating:
+    >>> map_to_excel(
+    ...     df,
+    ...     "survey_map.xlsx",
+    ...     alternating_row_colors=None,
+    ...     alternating_group_formats=None
+    ... )
+    Notes
+    -----
+    This function is specifically designed for survey metadata with standard columns
+    like variable, variable_label, value_code, etc. It provides sensible defaults
+    while maintaining full flexibility to override any formatting parameter.
+    The default formatting includes:
+    - Merging by variable, variable_label, and variable_type
+    - Optimized column widths for survey metadata
+    - Professional header styling with borders
+    - Centered value codes
+    - Percentage formatting for base_pct column (0.00%)
+    - Green group borders for visual separation
+    - Classic Grey Scale alternating formats with borders and professional fonts
+    See Also
+    --------
+    write_excel_with_merge : The underlying function with full control
+    """
+    # Validate input DataFrame type
+    if not isinstance(df, pl.DataFrame):
+        # Check specifically for pandas DataFrame
+        df_type = type(df).__module__ + "." + type(df).__name__
+        if "pandas" in df_type.lower():
+            raise TypeError(
+                f"Expected a Polars DataFrame, but received a pandas DataFrame. "
+                f"Convert with: pl.from_pandas(df)"
+            )
+        else:
+            raise TypeError(
+                f"Expected a Polars DataFrame, but received {type(df).__name__}. "
+                f"Please pass a pl.DataFrame."
+            )
+    # Define default values
+    default_merge_columns = [
+        'variable',
+        'variable_label',
+        'variable_type',
+    ]
+    default_column_widths = {
+        "variable": 100,
+        "variable_label": 400,
+        "variable_type": 100,
+        "value_code": 80,
+        "value_label": 200,
+        "value_n": 100,
+        "base_n": 100,
+        "base_pct": 100,
+        "total_n": 100,
+        "total_pct": 100,
+        "missing_value_label": 130,
+        "missing_data": 120
+    }
+    default_header_format = {
+        "bold": True,
+        "font_size": 12,
+        "bottom": 1,
+        "bottom_color": "#808080"
+    }
+    default_column_formats = {
+        "variable_label": {
+            "text_wrap": True
+        },
+        "value_code": {
+            "num_format": "0",
+            "align": "center",
+            "valign": "vcenter"
+        },
+        "base_pct": {
+            "num_format": "0.0%",
+            "align": "right",
+            "valign": "vcenter"
+        },
+        "total_pct": {
+            "num_format": "0.0%",
+            "align": "right",
+            "valign": "vcenter"
+        }
+    }
+    default_merge_format = {
+        "text_wrap": True, # This is needs to be true for triggering the wrap
+        "align": "left",
+        "valign": "vcenter",
+    }
+    default_group_border_format = {
+        # "bottom": 4,
+        # "bottom_color": "#4d6b4a"
+        # commented out if default_alternating_formats are provided below
+    }
+    # Classic Grey Scale as default alternating formats
+    default_alternating_formats = (
+        get_color_scheme("classic_grey")
+        # get_color_scheme("pastel_green")
+        # get_color_scheme("pastel_blue")
+        # get_color_scheme("pastel_purple")
+    )
+        # {
+        #     "bg_color": "#F5F5F5",      # Light grey
+        #     "font_color": "#1A1A1A",     # Near black
+        #     "border": 1,
+        #     "border_color": "#D9D9D9",
+        #     "valign": "vcenter"
+        # },
+        # {
+        #     "bg_color": "#FFFFFF",      # Pure white
+        #     "font_color": "#2C2C2C",     # Charcoal grey
+        #     "border": 1,
+        #     "border_color": "#D9D9D9",
+        #     "valign": "vcenter"
+        # }
+    # Merge user-provided values with defaults
+    # For merge_columns, use default only if None, allow empty list to disable
+    if merge_columns is None:
+        merge_columns = default_merge_columns
+    # For column_widths, merge user values with defaults
+    if column_widths is None:
+        column_widths = default_column_widths
+    else:
+        column_widths = {**default_column_widths, **column_widths}
+    # For header_format, merge user values with defaults
+    if header_format is None:
+        header_format = default_header_format
+    else:
+        header_format = {**default_header_format, **header_format}
+    # For column_formats, merge nested dictionaries
+    if column_formats is None:
+        column_formats = default_column_formats
+    else:
+        merged_formats = default_column_formats.copy()
+        for col, fmt in column_formats.items():
+            if col in merged_formats:
+                # Merge the format dicts for this column
+                merged_formats[col] = {**merged_formats[col], **fmt}
+            else:
+                merged_formats[col] = fmt
+        column_formats = merged_formats
+    # For merge_format, merge user values with defaults
+    if merge_format is None:
+        merge_format = default_merge_format
+    else:
+        merge_format = {**default_merge_format, **merge_format}
+    # For group_border_format, merge user values with defaults
+    if group_border_format is None:
+        group_border_format = default_group_border_format
+    else:
+        group_border_format = {**default_group_border_format, **group_border_format}
+    # For alternating formats, use the comprehensive defaults if neither is specified
+    if alternating_group_formats is None and alternating_row_colors is None:
+        # Neither specified, use default comprehensive formats
+        alternating_group_formats = default_alternating_formats
+    # If either is explicitly set (including to None), use as-is
+    # Call the main function with merged parameters
+    write_excel_engine(
+        df=df,
+        file_path=file_path,
+        merge_columns=merge_columns,
+        column_widths=column_widths,
+        header_format=header_format,
+        column_formats=column_formats,
+        merge_format=merge_format,
+        group_border_format=group_border_format,
+        alternating_row_colors=alternating_row_colors,
+        alternating_group_formats=alternating_group_formats,
+        sheet_name=sheet_name,
+        freeze_panes=freeze_panes
+    )
+    # Success message
+    output_path = Path(file_path)
+    print(f"✓ Saved: {output_path.name}") #({df.shape[0]:,} rows × {df.shape[1]} cols)"

ultrasav/metaman/def_write_excel_engine.py ADDED Viewed

@@ -0,0 +1,298 @@
+import polars as pl
+import xlsxwriter
+from typing import Any
+from pathlib import Path
+from .pastel_color_schemes import get_color_scheme
+# version_6
+def write_excel_engine(
+    df: pl.DataFrame,
+    file_path: str | Path,
+    merge_columns: list[str] | None = None,
+    column_widths: dict[str, int] | None = None,
+    header_format: dict[str, Any] | None = None,
+    column_formats: dict[str, dict[str, Any]] | None = None,
+    merge_format: dict[str, Any] | None = None,
+    group_border_format: dict[str, Any] | None = None,
+    alternating_row_colors: tuple[str, str] | None = None,
+    alternating_group_formats: tuple[dict[str, Any], dict[str, Any]] | None = None,
+    sheet_name: str = "Sheet1",
+    freeze_panes: tuple | None = (1, 0),
+) -> None:
+    """
+    Write a Polars DataFrame to Excel with merged cells for consecutive duplicate values.
+    This function extends Polars' write_excel functionality by adding support for
+    merging cells when consecutive rows have identical values in specified columns.
+    This is particularly useful for survey metadata, hierarchical data, or any
+    dataset where visual grouping improves readability.
+    Parameters
+    ----------
+    df : pl.DataFrame
+        The DataFrame to write to Excel.
+    file_path : str or Path
+        Path to the output Excel file. Parent directories will be created if needed.
+    merge_columns : list of str, optional
+        Column names to merge. Cells are merged when ALL specified columns have
+        consecutive duplicate values. If None, no merging is performed.
+        Example: ["variable", "question_label"]
+    column_widths : dict of str to int, optional
+        Mapping of column names to widths in pixels.
+        Example: {"variable": 200, "question_label": 500}
+    header_format : dict, optional
+        xlsxwriter format properties for header row.
+        Example: {"bold": True, "font_color": "#4472C4", "bg_color": "#F0F0F0"}
+    column_formats : dict of str to dict, optional
+        Mapping of column names to xlsxwriter format dictionaries.
+        Example: {"value_code": {"num_format": "0", "align": "center"}}
+    merge_format : dict, optional
+        xlsxwriter format properties for merged cells. Defaults to left-aligned
+        with vertical centering.
+        Example: {"align": "left", "valign": "vcenter", "text_wrap": True}
+    group_border_format : dict, optional
+        Border format to apply to the bottom row of each merge group. When set,
+        adds a bottom border to all cells (merged and non-merged) at the same row
+        where a merge group ends. Common usage: {"bottom": 1, "bottom_color": "#808080"}
+    alternating_row_colors : tuple of (str, str), optional
+        Two colors to alternate between merge groups for better visual separation.
+        Groups will alternate between these background colors.
+        Example: ("#F5F5F5", "#E0E0E0") for light and darker grey.
+        Note: If alternating_group_formats is also provided, it takes precedence.
+    alternating_group_formats : tuple of (dict, dict), optional
+        Two complete format dictionaries to alternate between merge groups.
+        This provides full control over all formatting aspects including font color,
+        borders, background, etc. Takes precedence over alternating_row_colors.
+        Example: (
+            {"bg_color": "#F5F5F5", "font_color": "#000000", "border": 1},
+            {"bg_color": "#E0E0E0", "font_color": "#333333", "border": 2}
+        )
+    sheet_name : str, default "Sheet1"
+        Name of the worksheet.
+    freeze_panes : tuple of (row, col), optional
+        Position to freeze panes. Default (1, 0) freezes the header row.
+        Set to None to disable. Example: (1, 2) freezes header and first 2 columns.
+    Raises
+    ------
+    ValueError
+        If merge_columns contains column names not in the DataFrame.
+    TypeError
+        If df is not a Polars DataFrame.
+    Examples
+    --------
+    Basic usage with merged cells:
+    >>> import polars as pl
+    >>> df = pl.DataFrame({
+    ...     "variable": ["S0", "S0", "S0", "S1", "S1"],
+    ...     "question": ["Age?", "Age?", "Age?", "Gender?", "Gender?"],
+    ...     "value_code": [1, 2, 3, 1, 2],
+    ...     "value_label": ["18-25", "26-35", "36+", "Male", "Female"]
+    ... })
+    >>> write_excel_engine(
+    ...     df=df,
+    ...     file_path="survey.xlsx",
+    ...     merge_columns=["variable", "question"],
+    ...     column_widths={"variable": 150, "question": 300}
+    ... )
+    Advanced formatting with alternating group formats:
+    >>> write_excel_engine(
+    ...     df=df,
+    ...     file_path="survey.xlsx",
+    ...     merge_columns=["variable", "question"],
+    ...     alternating_group_formats=(
+    ...         {"bg_color": "#F0F8FF", "font_color": "#00008B", "border": 1, "border_color": "#4169E1"},
+    ...         {"bg_color": "#FFF0F5", "font_color": "#8B008B", "border": 1, "border_color": "#DA70D6"}
+    ...     ),
+    ...     freeze_panes=(1, 2)
+    ... )
+    Notes
+    -----
+    - Column widths are converted from pixels to Excel character units (approx. 7 pixels per unit)
+    - Merging only occurs for consecutive rows with identical values in ALL merge_columns
+    - Non-consecutive duplicates are not merged (maintains data integrity)
+    - Alternating formats are applied to entire merge groups, not individual rows
+    - alternating_group_formats takes precedence over alternating_row_colors if both are provided
+    - The function uses xlsxwriter as the backend (same as Polars' write_excel)
+    See Also
+    --------
+    polars.DataFrame.write_excel : Standard Polars Excel writer without merging
+    """
+    # Input validation
+    if not isinstance(df, pl.DataFrame):
+        raise TypeError(f"Expected pl.DataFrame, got {type(df).__name__}")
+    if df.is_empty():
+        raise ValueError("Cannot write empty DataFrame to Excel")
+    # Validate merge_columns
+    if merge_columns:
+        invalid_cols = set(merge_columns) - set(df.columns)
+        if invalid_cols:
+            raise ValueError(
+                f"merge_columns contains invalid column names: {invalid_cols}. "
+                f"Available columns: {df.columns}"
+            )
+    # Convert file_path to Path and ensure parent directory exists
+    file_path = Path(file_path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    # Create workbook and worksheet
+    workbook = xlsxwriter.Workbook(str(file_path))
+    worksheet = workbook.add_worksheet(sheet_name)
+    # Apply freeze panes if specified
+    if freeze_panes:
+        worksheet.freeze_panes(*freeze_panes)
+    # Create formats
+    fmt_header = workbook.add_format(header_format or {})
+    # Default merge format with vertical centering
+    default_merge_format = {"align": "left", "valign": "vcenter"}
+    if merge_format:
+        default_merge_format.update(merge_format)
+    fmt_merge = workbook.add_format(default_merge_format)
+    # Create column-specific formats
+    fmt_columns = {}
+    if column_formats:
+        for col_name, fmt_dict in column_formats.items():
+            if col_name in df.columns:
+                fmt_columns[col_name] = workbook.add_format(fmt_dict)
+    # Get column names and create index mapping
+    columns = df.columns
+    col_indices = {col: idx for idx, col in enumerate(columns)}
+    # Set column widths
+    if column_widths:
+        for col_name, width_px in column_widths.items():
+            if col_name in col_indices:
+                # Convert pixels to Excel character units (approximate: 1 char ≈ 7 pixels)
+                width_chars = width_px / 7
+                col_idx = col_indices[col_name]
+                worksheet.set_column(col_idx, col_idx, width_chars)
+    # Write headers
+    for col_idx, col_name in enumerate(columns):
+        worksheet.write(0, col_idx, col_name, fmt_header)
+    # Convert dataframe to list of dictionaries for easier processing
+    data = df.to_dicts()
+    if not merge_columns or len(data) == 0:
+        # No merging needed - use standard write
+        for row_idx, row_data in enumerate(data, start=1):
+            for col_idx, col_name in enumerate(columns):
+                value = row_data[col_name]
+                cell_format = fmt_columns.get(col_name)
+                worksheet.write(row_idx, col_idx, value, cell_format)
+    else:
+        # Write data with merging logic
+        row = 1
+        i = 0
+        group_index = 0  # Track which group we're in for alternating formats
+        while i < len(data):
+            # Get values for merge columns in current row
+            merge_values = tuple(data[i][col] for col in merge_columns)
+            start_row = row
+            j = i
+            # Determine the formatting for this group
+            group_format_dict = None
+            if alternating_group_formats:
+                # Use comprehensive alternating formats
+                format_index = group_index % 2
+                group_format_dict = alternating_group_formats[format_index]
+            elif alternating_row_colors:
+                # Use simple color alternation
+                color_index = group_index % 2
+                group_format_dict = {"bg_color": alternating_row_colors[color_index]}
+            # Find consecutive rows with same merge column values
+            while j < len(data):
+                current_merge_values = tuple(data[j][col] for col in merge_columns)
+                if current_merge_values != merge_values:
+                    break
+                # Determine if this is the last row in the group
+                is_last_row_in_group = (j + 1 >= len(data) or
+                                       tuple(data[j + 1][col] for col in merge_columns) != merge_values)
+                # Write all non-merge columns for this row
+                for col_name in columns:
+                    if col_name not in merge_columns:
+                        col_idx = col_indices[col_name]
+                        value = data[j][col_name]
+                        # Prepare format with group formatting and/or border
+                        combined_format = {}
+                        # Add column-specific format if exists
+                        if col_name in column_formats:
+                            combined_format.update(column_formats[col_name])
+                        # Add group format (colors, borders, font, etc.)
+                        if group_format_dict:
+                            combined_format.update(group_format_dict)
+                        # Add group border if this is the last row
+                        if is_last_row_in_group and group_border_format:
+                            combined_format.update(group_border_format)
+                        # Create and apply the format
+                        if combined_format:
+                            cell_format = workbook.add_format(combined_format)
+                        else:
+                            cell_format = fmt_columns.get(col_name)
+                        worksheet.write(row, col_idx, value, cell_format)
+                row += 1
+                j += 1
+            # Write/merge the merge columns
+            end_row = row - 1
+            for col_name in merge_columns:
+                col_idx = col_indices[col_name]
+                value = data[i][col_name]
+                # Prepare format for merged cells
+                combined_merge_format = default_merge_format.copy()
+                # Add group format if specified
+                if group_format_dict:
+                    combined_merge_format.update(group_format_dict)
+                # Add border if group_border_format is specified
+                if group_border_format:
+                    combined_merge_format.update(group_border_format)
+                current_fmt = workbook.add_format(combined_merge_format)
+                if end_row > start_row:
+                    # Multiple rows - merge cells
+                    worksheet.merge_range(start_row, col_idx, end_row, col_idx,
+                                        value, current_fmt)
+                else:
+                    # Single row - no merge needed
+                    worksheet.write(start_row, col_idx, value, current_fmt)
+            i = j
+            group_index += 1  # Move to next group for format alternation
+    # Close workbook to save file
+    try:
+        workbook.close()
+    except Exception as e:
+        raise IOError(f"Failed to write Excel file to {file_path}: {e}") from e