csrlite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csrlite/__init__.py +50 -0
- csrlite/ae/__init__.py +1 -0
- csrlite/ae/ae_listing.py +492 -0
- csrlite/ae/ae_specific.py +478 -0
- csrlite/ae/ae_summary.py +399 -0
- csrlite/ae/ae_utils.py +132 -0
- csrlite/common/count.py +199 -0
- csrlite/common/parse.py +308 -0
- csrlite/common/plan.py +353 -0
- csrlite/common/utils.py +33 -0
- csrlite/common/yaml_loader.py +71 -0
- csrlite/disposition/__init__.py +2 -0
- csrlite/disposition/disposition.py +301 -0
- csrlite-0.1.0.dist-info/METADATA +68 -0
- csrlite-0.1.0.dist-info/RECORD +17 -0
- csrlite-0.1.0.dist-info/WHEEL +5 -0
- csrlite-0.1.0.dist-info/top_level.txt +1 -0
csrlite/common/count.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# pyre-strict
|
|
2
|
+
import polars as pl
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _to_pop(
    population: pl.DataFrame,
    id: str,
    group: str,
    total: bool = True,
    missing_group: str = "error",
) -> pl.DataFrame:
    """
    Reduce a population DataFrame to its id/group columns, validate it, and
    cast the group column to an Enum for stable categorical ordering.

    Args:
        population: DataFrame with one row per subject; must contain the
            'id' and 'group' columns.
        id: Subject ID column name (must be unique).
        group: Treatment group column name.
        total: When True, every row is duplicated under a synthetic "Total"
            group so downstream counts include an all-groups column.
        missing_group: "error" raises on null group values; other values
            leave nulls in place.
            NOTE(review): with nulls retained, the Enum cast below may fail —
            confirm intended handling for non-"error" modes.

    Returns:
        pl.DataFrame: id/group frame with 'group' cast to pl.Enum.

    Raises:
        ValueError: On duplicate ids, or on null groups when
            missing_group == "error".
    """
    pop = population.select(id, group)

    if pop[id].is_duplicated().any():
        raise ValueError(f"The '{id}' column in the population DataFrame is not unique.")

    if missing_group == "error" and pop[group].is_null().any():
        raise ValueError(
            f"Missing values found in the '{group}' column of the population DataFrame, "
            "and 'missing_group' is set to 'error'."
        )

    # Sorted distinct values define the Enum category order.
    group_levels = pop[group].unique().sort().to_list()

    if not total:
        return pop.with_columns(pl.col(group).cast(pl.Enum(group_levels)))

    # Append a copy of every row relabelled "Total"; "Total" sorts last
    # because it is appended after the observed levels.
    total_rows = pop.with_columns(pl.lit("Total").alias(group))
    return pl.concat([pop, total_rows]).with_columns(
        pl.col(group).cast(pl.Enum(group_levels + ["Total"]))
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def count_subject(
    population: pl.DataFrame,
    id: str,
    group: str,
    total: bool = True,
    missing_group: str = "error",
) -> pl.DataFrame:
    """
    Count subjects per treatment group, optionally including a 'Total' group.

    Args:
        population (pl.DataFrame): Subject-level population data; must contain
            the 'id' and 'group' columns.
        id (str): Subject ID column name (e.g., "USUBJID").
        group (str): Treatment group column name (e.g., "TRT01A").
        total (bool, optional): When True, an extra 'Total' group holds the
            count across all groups. Defaults to True.
        missing_group (str, optional): Handling of null group values;
            "error" raises a ValueError. Defaults to "error".

    Returns:
        pl.DataFrame: One row per group with the subject count in
            'n_subj_pop', sorted by group.
    """
    # Validation and Enum casting (plus the optional "Total" rows) are
    # delegated to the shared helper.
    pop = _to_pop(
        population=population,
        id=id,
        group=group,
        total=total,
        missing_group=missing_group,
    )

    counts = pop.group_by(group).agg(pl.len().alias("n_subj_pop"))
    return counts.sort(group)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def count_subject_with_observation(
    population: pl.DataFrame,
    observation: pl.DataFrame,
    id: str,
    group: str,
    variable: str,
    total: bool = True,
    missing_group: str = "error",
    pct_digit: int = 1,
    max_n_width: int | None = None,
) -> pl.DataFrame:
    """
    Counts subjects and observations by group and a specified variable,
    calculating percentages based on population denominators.

    Args:
        population (pl.DataFrame): DataFrame containing subject population data,
            must include 'id' and 'group' columns.
        observation (pl.DataFrame): DataFrame containing observation data,
            must include 'id' and 'variable' columns.
        id (str): The name of the subject ID column (e.g., "USUBJID").
        group (str): The name of the treatment group column (e.g., "TRT01A").
        variable (str): The name of the variable to count observations for (e.g., "AESOC").
        total (bool, optional): If True, adds a 'Total' group with counts across
            all groups. Defaults to True.
        missing_group (str, optional): How to handle missing values in the group column.
            "error" will raise a ValueError. Defaults to "error".
        pct_digit (int, optional): Number of decimal places for percentage formatting.
            Defaults to 1.
        max_n_width (int, optional): Fixed width for subject count formatting. If None,
            inferred from data. Defaults to None.

    Returns:
        pl.DataFrame: A DataFrame with counts and percentages of subjects and observations
            grouped by 'group' and 'variable'.

    Raises:
        ValueError: If the observation DataFrame contains '{id}' values absent
            from the population DataFrame.
    """
    # prepare data (validation, Enum cast, optional "Total" rows)
    pop = _to_pop(
        population=population,
        id=id,
        group=group,
        total=total,
        missing_group=missing_group,
    )

    obs = observation.select(id, variable).join(pop, on=id, how="left")

    # Every observed subject must belong to the population. The id list is
    # materialized once (previously it was built twice: in the membership
    # check and again in the filter below).
    pop_ids = pop[id].to_list()
    if not obs[id].is_in(pop_ids).all():
        # Get IDs that are in obs but not in pop
        missing_ids = (
            obs.filter(~pl.col(id).is_in(pop_ids))
            .select(id)
            .unique()
            .to_series()
            .to_list()
        )
        raise ValueError(
            f"Some '{id}' values in the observation DataFrame are not present in the population "
            f"DataFrame: {missing_ids}"
        )

    # Denominators: subjects per group in the full population
    df_pop = count_subject(
        population=population,
        id=id,
        group=group,
        total=total,
        missing_group=missing_group,
    )

    # Count observations and subjects by group and variable
    df_obs_counts = obs.group_by(group, variable).agg(
        pl.len().alias("n_obs"), pl.n_unique(id).alias("n_subj")
    )

    # Create all combinations of groups and variables to ensure no missing groups
    unique_groups = df_pop.select(group)
    unique_variables = obs.select(variable).unique()

    # Cross join to get all combinations
    all_combinations = unique_groups.join(unique_variables, how="cross")

    # Left join to preserve all combinations, filling missing counts with 0
    df_obs = (
        all_combinations.join(df_obs_counts, on=[group, variable], how="left")
        .join(df_pop, on=group, how="left")
        .with_columns([pl.col("n_obs").fill_null(0), pl.col("n_subj").fill_null(0)])
        .with_columns(pct_subj=(pl.col("n_subj") / pl.col("n_subj_pop") * 100))
        .with_columns(
            pct_subj_fmt=(
                # Zero-denominator groups yield null/NaN percentages; show 0
                pl.when(pl.col("pct_subj").is_null() | pl.col("pct_subj").is_nan())
                .then(0.0)
                .otherwise(pl.col("pct_subj"))
                .round(pct_digit, mode="half_away_from_zero")
                .cast(pl.String)
            )
        )
    )

    # Calculate max widths for proper alignment
    if max_n_width is None:
        max_n_width = df_obs.select(pl.col("n_subj").cast(pl.String).str.len_chars().max()).item()

    # Widest percentage is "100" (3 chars) plus "." and pct_digit decimals
    max_pct_width = 3 if pct_digit == 0 else 4 + pct_digit

    # Format with padding for alignment, e.g. " 12 ( 5.4)"
    df_obs = (
        df_obs.with_columns(
            [
                pl.col("pct_subj_fmt").str.pad_start(max_pct_width, " "),
                pl.col("n_subj")
                .cast(pl.String)
                .str.pad_start(max_n_width, " ")
                .alias("n_subj_fmt"),
            ]
        )
        .with_columns(
            n_pct_subj_fmt=pl.concat_str(
                [pl.col("n_subj_fmt"), pl.lit(" ("), pl.col("pct_subj_fmt"), pl.lit(")")]
            )
        )
        .sort(group, variable)
    )

    return df_obs
|
csrlite/common/parse.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# pyre-strict
|
|
2
|
+
"""
|
|
3
|
+
StudyPlan Parsing Utilities
|
|
4
|
+
|
|
5
|
+
This module provides utilities for parsing and extracting information from StudyPlan objects,
|
|
6
|
+
including filter conversion, parameter parsing, and keyword resolution.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import polars as pl
|
|
13
|
+
|
|
14
|
+
from .plan import StudyPlan
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_filter_to_sql(filter_str: str) -> str:
    """
    Parse custom filter syntax to SQL WHERE clause.

    Converts:
    - "adsl:saffl == 'Y'" -> "SAFFL = 'Y'"
    - "adae:trtemfl == 'Y' and adae:aeser == 'Y'" -> "TRTEMFL = 'Y' AND AESER = 'Y'"
    - "adae:aerel in ['A', 'B']" -> "AEREL IN ('A', 'B')"

    Args:
        filter_str: Custom filter string with dataset:column format

    Returns:
        SQL WHERE clause string
    """
    if not filter_str or filter_str.strip() == "":
        return "1=1"  # Always true

    # Remove dataset prefixes (adsl:, adae:)
    sql = re.sub(r"\w+:", "", filter_str)

    # Convert Python syntax to SQL keywords/operators.
    # Fix: a duplicated ' and ' replacement was removed — str.replace already
    # substitutes every occurrence in a single pass.
    sql = sql.replace("==", "=")  # Python equality to SQL
    sql = sql.replace(" and ", " AND ")  # Python to SQL
    sql = sql.replace(" or ", " OR ")  # Python to SQL
    sql = sql.replace(" in ", " IN ")  # Python to SQL

    # Convert Python list syntax to SQL IN: ['A', 'B'] -> ('A', 'B')
    sql = sql.replace("[", "(").replace("]", ")")

    # Uppercase column names (assuming ADaM standard): an identifier
    # immediately followed by a comparison operator or ' IN'. Quoted literal
    # values are untouched because they are followed by a closing quote.
    sql = re.sub(
        r"\b([a-z]\w*)\b(?=\s*[=<>!]|\s+IN)", lambda m: m.group(1).upper(), sql, flags=re.IGNORECASE
    )

    return sql
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def apply_filter_sql(df: pl.DataFrame, filter_str: str) -> pl.DataFrame:
    """
    Filter a DataFrame using the custom filter syntax.

    The filter is first translated to a SQL WHERE clause and applied via
    pl.sql_expr(); if that path fails for any reason, a best-effort manual
    translation to a Polars expression is used instead.

    Args:
        df: DataFrame to filter
        filter_str: Custom filter string

    Returns:
        Filtered DataFrame
    """
    # An empty filter means "keep everything".
    if not filter_str or not filter_str.strip():
        return df

    where_clause = parse_filter_to_sql(filter_str)

    try:
        # Fast path: hand the WHERE clause straight to Polars' SQL engine.
        # The filter call stays inside the try so that expression-evaluation
        # errors (e.g. unknown columns) also trigger the fallback.
        return df.filter(pl.sql_expr(where_clause))
    except Exception as e:
        print(f"Warning: SQL filter failed ({e}), using fallback method")
        return df.filter(_parse_filter_expr(filter_str))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _parse_filter_expr(filter_str: str) -> Any:
    """
    Fallback filter parser using Polars expressions.
    Used if SQL parsing fails.

    Rewrites the custom filter string, piece by piece, into the source of a
    Polars expression and evaluates it.

    Args:
        filter_str: Filter string

    Returns:
        Polars expression
    """
    if not filter_str or filter_str.strip() == "":
        return pl.lit(True)

    # Remove dataset prefixes (e.g. "adsl:", "adae:")
    filter_str = re.sub(r"\w+:", "", filter_str)

    # Handle 'in' operator: column in ['A', 'B'] -> pl.col('COLUMN').is_in(['A', 'B'])
    in_pattern = r"(\w+)\s+in\s+\[([^\]]+)\]"

    # Renamed from _parse_between: it rewrites membership tests, not BETWEEN.
    def _in_to_expr(match: re.Match[str]) -> str:
        col = match.group(1).upper()
        values = match.group(2)
        return f"(pl.col('{col}').is_in([{values}]))"

    filter_str = re.sub(in_pattern, _in_to_expr, filter_str)

    # Handle comparisons against a quoted literal. '>=' / '<=' still match
    # correctly despite following '>' / '<' in the alternation: the engine
    # backtracks when the expected opening quote is not found.
    cmp_pattern = r"(\w+)\s*(==|!=|>|<|>=|<=)\s*'([^']+)'"

    # Renamed from _parse_like: it rewrites comparison operators, not LIKE.
    def _cmp_to_expr(match: re.Match[str]) -> str:
        col = match.group(1).upper()
        op = match.group(2)
        val = match.group(3)
        return f"(pl.col('{col}') {op} '{val}')"

    filter_str = re.sub(cmp_pattern, _cmp_to_expr, filter_str)

    # Replace 'and'/'or' with Polars' boolean operators
    filter_str = filter_str.replace(" and ", " & ")
    filter_str = filter_str.replace(" or ", " | ")

    # SECURITY: eval() executes arbitrary code. The generated source only
    # ever references 'pl', so evaluate with a namespace restricted to it
    # (and no builtins) to limit what a malicious filter string could do.
    return eval(filter_str, {"__builtins__": {}, "pl": pl})
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def parse_parameter(parameter_str: str) -> list[str]:
    """
    Split a parameter keyword string on semicolons.

    Args:
        parameter_str: Single parameter or semicolon-separated (e.g., "any;rel;ser")

    Returns:
        List of parameter names. An empty/falsy input yields an empty list;
        a semicolon-free input is returned as a one-element list unchanged
        (no whitespace stripping on that path, matching historical behavior).
    """
    if not parameter_str:
        return []
    if ";" not in parameter_str:
        return [parameter_str]
    return [part.strip() for part in parameter_str.split(";")]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class StudyPlanParser:
    """
    Parser class for extracting and resolving information from StudyPlan objects.

    Wraps a StudyPlan and translates its keyword definitions (populations,
    observations, parameters, groups) into analysis-ready values such as SQL
    WHERE clauses, labels, and filtered datasets.
    """

    def __init__(self, study_plan: StudyPlan) -> None:
        """
        Initialize parser with a StudyPlan object.

        Args:
            study_plan: StudyPlan object with loaded datasets and keywords
        """
        self.study_plan = study_plan

    def get_population_filter(self, population: str) -> str:
        """
        Resolve a population keyword to a SQL WHERE clause.

        Args:
            population: Population keyword name

        Returns:
            SQL WHERE clause string

        Raises:
            ValueError: If population keyword not found
        """
        pop = self.study_plan.keywords.get_population(population)
        if pop is None:
            raise ValueError(f"Population '{population}' not found")
        return parse_filter_to_sql(pop.filter)

    def get_observation_filter(self, observation: str | None) -> str | None:
        """
        Resolve an observation keyword to a SQL WHERE clause.

        Args:
            observation: Optional observation keyword name

        Returns:
            SQL WHERE clause string, or None when no observation is given
            or the keyword cannot be resolved.
        """
        if not observation:
            return None
        obs = self.study_plan.keywords.get_observation(observation)
        # Truthiness check kept from the original (not an `is None` test).
        if not obs:
            return None
        return parse_filter_to_sql(obs.filter)

    def get_parameter_info(
        self, parameter: str
    ) -> tuple[list[str], list[str], list[str], list[int]]:
        """
        Resolve a (possibly semicolon-separated) parameter keyword.

        Args:
            parameter: Parameter keyword, can be semicolon-separated (e.g., "any;rel;ser")

        Returns:
            Tuple of (parameter_names, parameter_filters, parameter_labels, parameter_indents)

        Raises:
            ValueError: If any parameter keyword not found
        """
        names = parse_parameter(parameter)
        filters: list[str] = []
        labels: list[str] = []
        indents: list[int] = []

        for param_name in names:
            param = self.study_plan.keywords.get_parameter(param_name)
            if param is None:
                raise ValueError(f"Parameter '{param_name}' not found")
            filters.append(parse_filter_to_sql(param.filter))
            labels.append(param.label or param_name)
            indents.append(param.indent)

        return names, filters, labels, indents

    def get_single_parameter_info(self, parameter: str) -> tuple[str, str]:
        """
        Resolve a single parameter keyword (NOT semicolon-separated).

        Args:
            parameter: Single parameter keyword name

        Returns:
            Tuple of (parameter_filter, parameter_label)

        Raises:
            ValueError: If parameter keyword not found
        """
        param = self.study_plan.keywords.get_parameter(parameter)
        if param is None:
            raise ValueError(f"Parameter '{parameter}' not found")
        label = param.label or parameter
        return parse_filter_to_sql(param.filter), label

    def get_group_info(self, group: str) -> tuple[str, list[str]]:
        """
        Resolve a group keyword to its variable name and labels.

        Args:
            group: Group keyword name

        Returns:
            Tuple of (group_variable, group_labels)

        Raises:
            ValueError: If group keyword not found
        """
        grp = self.study_plan.keywords.get_group(group)
        if grp is None:
            raise ValueError(f"Group '{group}' not found")

        # "dataset:variable" -> bare upper-case variable name
        variable = grp.variable.split(":")[-1].upper()
        labels = grp.group_label or []
        return variable, labels

    def get_datasets(self, *dataset_names: str) -> tuple[pl.DataFrame, ...]:
        """
        Fetch several datasets from the StudyPlan at once.

        Args:
            *dataset_names: Names of datasets to retrieve (e.g., "adsl", "adae")

        Returns:
            Tuple of DataFrames in the order requested

        Raises:
            ValueError: If any dataset not found
        """
        found = []
        for name in dataset_names:
            ds = self.study_plan.datasets.get(name)
            if ds is None:
                raise ValueError(f"Dataset '{name}' not found in study plan")
            found.append(ds)
        return tuple(found)

    def get_population_data(self, population: str, group: str) -> tuple[pl.DataFrame, str]:
        """
        Apply a population filter to ADSL and resolve the group variable.

        Args:
            population: Population keyword name
            group: Group keyword name

        Returns:
            Tuple of (filtered_adsl, group_variable)
        """
        (adsl,) = self.get_datasets("adsl")
        adsl_pop = apply_filter_sql(adsl, self.get_population_filter(population))
        group_var, _ = self.get_group_info(group)
        return adsl_pop, group_var
|