csrlite 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csrlite/__init__.py CHANGED
@@ -1,18 +1,19 @@
- from .ae.ae_listing import (
-     # AE listing functions
+ import logging
+ import sys
+
+ from .ae.ae_listing import (  # AE listing functions
      ae_listing,
      study_plan_to_ae_listing,
  )
- from .ae.ae_specific import (
-     # AE specific functions
+ from .ae.ae_specific import (  # AE specific functions
      ae_specific,
      study_plan_to_ae_specific,
  )
- from .ae.ae_summary import (
-     # AE summary functions
+ from .ae.ae_summary import (  # AE summary functions
      ae_summary,
      study_plan_to_ae_summary,
  )
+ from .common.config import config
  from .common.count import (
      count_subject,
      count_subject_with_observation,
@@ -21,12 +22,19 @@ from .common.parse import (
      StudyPlanParser,
      parse_filter_to_sql,
  )
- from .common.plan import (
-     # Core classes
+ from .common.plan import (  # Core classes
      load_plan,
  )
  from .disposition.disposition import study_plan_to_disposition_summary
  
+ # Configure logging
+ logging.basicConfig(
+     level=config.logging_level,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+     stream=sys.stdout,
+ )
+ logger = logging.getLogger("csrlite")
+
  # Main exports for common usage
  __all__ = [
      # Primary user interface
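Since 0.2.0 wires a package-level logger and a global config object at import time, a minimal usage sketch (behavior inferred from the lines above; nothing beyond the module-level names `config` and `logger` is assumed):

    import csrlite

    # logging.basicConfig has already run with config.logging_level (default "INFO"),
    # writing "%(asctime)s - %(name)s - %(levelname)s - %(message)s" records to stdout.
    csrlite.logger.info("csrlite imported")

    # The same global configuration instance is reachable from the package root.
    print(csrlite.config.logging_level)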
csrlite/ae/ae_listing.py CHANGED
@@ -71,6 +71,8 @@ def ae_listing_ard(
          parameter_filter=parameter_filter,
      )
  
+     assert observation_to_filter is not None
+
      # Filter observation to include only subjects in filtered population
      observation_filtered = observation_to_filter.filter(
          pl.col(id_var_name).is_in(population_filtered[id_var_name].to_list())
csrlite/ae/ae_specific.py CHANGED
@@ -24,8 +24,9 @@ from rtflite import RTFDocument
  from ..common.count import count_subject, count_subject_with_observation
  from ..common.parse import StudyPlanParser
  from ..common.plan import StudyPlan
+ from ..common.rtf import create_rtf_table_n_pct
  from ..common.utils import apply_common_filters
- from .ae_utils import create_ae_rtf_table, get_ae_parameter_row_labels, get_ae_parameter_title
+ from .ae_utils import get_ae_parameter_row_labels, get_ae_parameter_title
  
  
  def ae_specific_ard(
@@ -80,6 +81,8 @@ def ae_specific_ard(
          parameter_filter=parameter_filter,
      )
  
+     assert observation_to_filter is not None
+
      # Filter observation to include only subjects in filtered population
      observation_filtered = observation_to_filter.filter(
          pl.col(id_var_name).is_in(population_filtered[id_var_name].to_list())
@@ -114,7 +117,9 @@ def ae_specific_ard(
  
      # Get population with event indicator
      pop_with_indicator = population_filtered.with_columns(
-         pl.col(id_var_name).is_in(subjects_with_events[id_var_name]).alias("__has_event__")
+         pl.col(id_var_name)
+         .is_in(subjects_with_events[id_var_name].to_list())
+         .alias("__has_event__")
      )
  
      # Count subjects with and without events using count_subject_with_observation
@@ -129,7 +134,7 @@ def ae_specific_ard(
      )
  
      # Extract 'with' counts
-     n_with = event_counts.filter(pl.col("__has_event__")).select(
+     n_with = event_counts.filter(pl.col("__has_event__") == "true").select(
          [
              pl.lit(n_with_label).alias("__index__"),
              pl.col(group_var_name).cast(pl.String).alias("__group__"),
@@ -138,7 +143,7 @@ def ae_specific_ard(
      )
  
      # Extract 'without' counts
-     n_without = event_counts.filter(~pl.col("__has_event__")).select(
+     n_without = event_counts.filter(pl.col("__has_event__") == "false").select(
          [
              pl.lit(n_without_label).alias("__index__"),
              pl.col(group_var_name).cast(pl.String).alias("__group__"),
@@ -254,7 +259,7 @@ def ae_specific_rtf(
      else:
          col_widths = col_rel_width
  
-     return create_ae_rtf_table(
+     return create_rtf_table_n_pct(
          df=df_rtf,
          col_header_1=col_header_1,
          col_header_2=col_header_2,
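The switch from a boolean test on `__has_event__` to comparing against the strings "true"/"false" follows from the counting refactor further down: count_summary_data now casts every analysis variable to pl.String, so a boolean indicator comes back as text. A tiny polars illustration (not package code):

    import polars as pl

    # Casting a Boolean column to String yields the literals "true"/"false",
    # which is why the filters above compare against strings.
    s = pl.Series("__has_event__", [True, False]).cast(pl.String)
    print(s.to_list())  # ['true', 'false']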
csrlite/ae/ae_summary.py CHANGED
@@ -21,8 +21,8 @@ from rtflite import RTFDocument
  from ..common.count import count_subject, count_subject_with_observation
  from ..common.parse import StudyPlanParser
  from ..common.plan import StudyPlan
+ from ..common.rtf import create_rtf_table_n_pct
  from ..common.utils import apply_common_filters
- from .ae_utils import create_ae_rtf_table
  
  
  def study_plan_to_ae_summary(
@@ -258,6 +258,8 @@ def ae_summary_ard(
          observation_filter=observation_filter,
      )
  
+     assert observation_to_filter is not None
+
      # Filter observation data to include only subjects in the filtered population
      # Process all variables in the list
      observation_filtered_list = []
@@ -388,7 +390,7 @@ def ae_summary_rtf(
      else:
          col_widths = col_rel_width
  
-     return create_ae_rtf_table(
+     return create_rtf_table_n_pct(
          df=df_rtf,
          col_header_1=col_header_1,
          col_header_2=col_header_2,
csrlite/ae/ae_utils.py CHANGED
@@ -1,9 +1,5 @@
- # pyre-strict
  from typing import Any
  
- import polars as pl
- from rtflite import RTFBody, RTFColumnHeader, RTFDocument, RTFFootnote, RTFPage, RTFSource, RTFTitle
-
  
  def get_ae_parameter_title(param: Any, prefix: str = "Participants With") -> str:
      """
@@ -64,69 +60,3 @@ def get_ae_parameter_row_labels(param: Any) -> tuple[str, str]:
      without_label = " " + " ".join(without_label.split())
  
      return (with_label, without_label)
-
-
- def create_ae_rtf_table(
-     df: pl.DataFrame,
-     col_header_1: list[str],
-     col_header_2: list[str] | None,
-     col_widths: list[float] | None,
-     title: list[str] | str,
-     footnote: list[str] | str | None,
-     source: list[str] | str | None,
-     borders_2: bool = True,
-     orientation: str = "landscape",
- ) -> RTFDocument:
-     """
-     Create a standardized RTF table document with 1 or 2 header rows.
-     """
-     n_cols = len(df.columns)
-
-     # Calculate column widths if None - simple default
-     if col_widths is None:
-         col_widths = [1] * n_cols
-
-     # Normalize metadata
-     title_list = [title] if isinstance(title, str) else title
-     footnote_list = [footnote] if isinstance(footnote, str) else (footnote or [])
-     source_list = [source] if isinstance(source, str) else (source or [])
-
-     headers = [
-         RTFColumnHeader(
-             text=col_header_1,
-             col_rel_width=col_widths,
-             text_justification=["l"] + ["c"] * (n_cols - 1),
-         )
-     ]
-
-     if col_header_2:
-         h2_kwargs = {
-             "text": col_header_2,
-             "col_rel_width": col_widths,
-             "text_justification": ["l"] + ["c"] * (n_cols - 1),
-         }
-         if borders_2:
-             h2_kwargs["border_left"] = ["single"]
-             h2_kwargs["border_top"] = [""]
-
-         headers.append(RTFColumnHeader(**h2_kwargs))
-
-     rtf_components: dict[str, Any] = {
-         "df": df,
-         "rtf_page": RTFPage(orientation=orientation),
-         "rtf_title": RTFTitle(text=title_list),
-         "rtf_column_header": headers,
-         "rtf_body": RTFBody(
-             col_rel_width=col_widths,
-             text_justification=["l"] + ["c"] * (n_cols - 1),
-             border_left=["single"] * n_cols,
-         ),
-     }
-
-     if footnote_list:
-         rtf_components["rtf_footnote"] = RTFFootnote(text=footnote_list)
-
-     if source_list:
-         rtf_components["rtf_source"] = RTFSource(text=source_list)
-
-     return RTFDocument(**rtf_components)
csrlite/common/config.py ADDED
@@ -0,0 +1,34 @@
+ # pyre-strict
+ """
+ Central configuration for csrlite.
+ """
+
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ class CsrLiteConfig(BaseModel):
+     """
+     Global configuration for csrlite library.
+     """
+
+     # Column Name Defaults
+     id_col: str = Field(default="USUBJID", description="Subject Identifier Column")
+     group_col: Optional[str] = Field(default=None, description="Treatment Group Column")
+
+     # Missing Value Handling
+     missing_str: str = Field(
+         default="__missing__", description="String to represent missing string values"
+     )
+
+     # Logging
+     logging_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(
+         default="INFO", description="Default logging level"
+     )
+
+     model_config = ConfigDict(validate_assignment=True)
+
+
+ # Global configuration instance
+ config = CsrLiteConfig()
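A minimal sketch of using the new global configuration (the values shown are the defaults declared above):

    from csrlite import config

    config.id_col        # "USUBJID"
    config.missing_str   # "__missing__"

    # validate_assignment=True means pydantic re-validates on assignment:
    config.logging_level = "DEBUG"      # accepted
    # config.logging_level = "VERBOSE"  # would raise a ValidationError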
csrlite/common/count.py CHANGED
@@ -1,6 +1,8 @@
  # pyre-strict
  import polars as pl
  
+ from .config import config
+
  
  def _to_pop(
      population: pl.DataFrame,
@@ -48,14 +50,11 @@ def count_subject(
      Counts subjects by group and optionally includes a 'Total' column.
  
      Args:
-         population (pl.DataFrame): DataFrame containing subject population data,
-             must include 'id' and 'group' columns.
-         id (str): The name of the subject ID column (e.g., "USUBJID").
-         group (str): The name of the treatment group column (e.g., "TRT01A").
-         total (bool, optional): If True, adds a 'Total' group with counts across all groups.
-             Defaults to True.
-         missing_group (str, optional): How to handle missing values in the group column.
-             "error" will raise a ValueError. Defaults to "error".
+         population (pl.DataFrame): DataFrame containing subject population data.
+         id (str): The name of the subject ID column.
+         group (str): The name of the treatment group column.
+         total (bool, optional): If True, adds a 'Total' group. Defaults to True.
+         missing_group (str, optional): How to handle missing values ("error", "ignore").
  
      Returns:
          pl.DataFrame: A DataFrame with subject counts ('n_subj_pop') for each group.
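For reference, a minimal call against this signature (toy data; the column names mirror the examples removed from the docstring):

    import polars as pl
    from csrlite import count_subject

    pop = pl.DataFrame({"USUBJID": ["01", "02", "03"], "TRT01A": ["A", "B", "B"]})

    # One 'n_subj_pop' row per treatment group, plus a 'Total' group when total=True.
    count_subject(population=pop, id="USUBJID", group="TRT01A", total=True)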
@@ -72,41 +71,33 @@
      return pop.group_by(group).agg(pl.len().alias("n_subj_pop")).sort(group)
  
  
- def count_subject_with_observation(
+ def count_summary_data(
      population: pl.DataFrame,
      observation: pl.DataFrame,
      id: str,
      group: str,
-     variable: str,
+     variable: str | list[str],
      total: bool = True,
      missing_group: str = "error",
-     pct_digit: int = 1,
-     max_n_width: int | None = None,
  ) -> pl.DataFrame:
      """
-     Counts subjects and observations by group and a specified variable,
-     calculating percentages based on population denominators.
-
-     Args:
-         population (pl.DataFrame): DataFrame containing subject population data,
-             must include 'id' and 'group' columns.
-         observation (pl.DataFrame): DataFrame containing observation data,
-             must include 'id' and 'variable' columns.
-         id (str): The name of the subject ID column (e.g., "USUBJID").
-         group (str): The name of the treatment group column (e.g., "TRT01A").
-         variable (str): The name of the variable to count observations for (e.g., "AESOC").
-         total (bool, optional): Not yet implemented. Defaults to True.
-         missing_group (str, optional): How to handle missing values in the group column.
-             "error" will raise a ValueError. Defaults to "error".
-         pct_digit (int, optional): Number of decimal places for percentage formatting.
-             Defaults to 1.
-         max_n_width (int, optional): Fixed width for subject count formatting. If None, inferred
-             from data. Defaults to None.
+     Generates numeric summary data (counts and percentages) for observations.
+     Does NOT perform string formatting.
  
      Returns:
-         pl.DataFrame: A DataFrame with counts and percentages of subjects and observations
-             grouped by 'group' and 'variable'.
+         pl.DataFrame: DataFrame with columns:
+             - [group]: Group column
+             - [variable]: Variable columns
+             - n_obs: Count of observations
+             - n_subj: Count of unique subjects with observation
+             - n_subj_pop: Total subjects in group
+             - pct_subj: Percentage of subjects (0-100)
      """
+     # Normalize variable to list
+     if isinstance(variable, str):
+         variables = [variable]
+     else:
+         variables = variable
  
      # prepare data
      pop = _to_pop(
@@ -117,10 +108,14 @@
          missing_group=missing_group,
      )
  
-     obs = observation.select(id, variable).join(pop, on=id, how="left")
+     # Select all required columns (id + all variables)
+     obs = observation.select(id, *variables).join(pop, on=id, how="left")
+
+     for var in variables:
+         obs = obs.with_columns(pl.col(var).cast(pl.String).fill_null(config.missing_str))
  
+     # Check for IDs in observation that are not in population
      if not obs[id].is_in(pop[id].to_list()).all():
-         # Get IDs that are in obs but not in pop
          missing_ids = (
              obs.filter(~pl.col(id).is_in(pop[id].to_list()))
              .select(id)
@@ -129,8 +124,8 @@
              .to_list()
          )
          raise ValueError(
-             f"Some '{id}' values in the observation DataFrame are not present in the population "
-             f"DataFrame: {missing_ids}"
+             f"Some '{id}' values in the observation DataFrame are not present in the population: "
+             f"{missing_ids}"
          )
  
      df_pop = count_subject(
@@ -141,59 +136,158 @@
          missing_group=missing_group,
      )
  
-     # Count observations and subjects by group and variable
-     df_obs_counts = obs.group_by(group, variable).agg(
-         pl.len().alias("n_obs"), pl.n_unique(id).alias("n_subj")
-     )
+     all_levels_df = []
  
-     # Create all combinations of groups and variables to ensure no missing groups
-     unique_groups = df_pop.select(group)
-     unique_variables = obs.select(variable).unique()
-
-     # Cross join to get all combinations
-     all_combinations = unique_groups.join(unique_variables, how="cross")
-
-     # Left join to preserve all combinations, filling missing counts with 0
-     df_obs = (
-         all_combinations.join(df_obs_counts, on=[group, variable], how="left")
-         .join(df_pop, on=group, how="left")
-         .with_columns([pl.col("n_obs").fill_null(0), pl.col("n_subj").fill_null(0)])
-         .with_columns(pct_subj=(pl.col("n_subj") / pl.col("n_subj_pop") * 100))
-         .with_columns(
-             pct_subj_fmt=(
-                 pl.when(pl.col("pct_subj").is_null() | pl.col("pct_subj").is_nan())
-                 .then(0.0)
-                 .otherwise(pl.col("pct_subj"))
-                 .round(pct_digit, mode="half_away_from_zero")
-                 .cast(pl.String)
-             )
+     # Iterate through hierarchies
+     for i in range(1, len(variables) + 1):
+         current_vars = variables[:i]
+
+         # Aggregation
+         df_obs_counts = obs.group_by(group, *current_vars).agg(
+             pl.len().alias("n_obs"), pl.n_unique(id).alias("n_subj")
+         )
+
+         # Cross join for all combinations
+         unique_groups = df_pop.select(group)
+         unique_variables = obs.select(current_vars).unique()
+         all_combinations = unique_groups.join(unique_variables, how="cross")
+
+         # Join back
+         df_level = (
+             all_combinations.join(df_obs_counts, on=[group, *current_vars], how="left")
+             .join(df_pop, on=group, how="left")
+             .with_columns([pl.col("n_obs").fill_null(0), pl.col("n_subj").fill_null(0)])
+         )
+
+         df_level = df_level.with_columns([pl.col(c).cast(pl.String) for c in current_vars])
+
+         # Add missing columns with "__all__"
+         for var in variables:
+             if var not in df_level.columns:
+                 df_level = df_level.with_columns(pl.lit("__all__").cast(pl.String).alias(var))
+
+         all_levels_df.append(df_level)
+
+     # Stack
+     df_obs = pl.concat(all_levels_df, how="diagonal")
+
+     # Calculate percentage
+     df_obs = df_obs.with_columns(pct_subj=(pl.col("n_subj") / pl.col("n_subj_pop") * 100))
+
+     return df_obs
+
+
+ def format_summary_table(
+     df: pl.DataFrame,
+     group: str,
+     variable: str | list[str],
+     pct_digit: int = 1,
+     max_n_width: int | None = None,
+ ) -> pl.DataFrame:
+     """
+     Formats numeric summary data into display strings (e.g., "n ( pct)").
+     Adds indentation and sorting.
+     """
+     if isinstance(variable, str):
+         variables = [variable]
+     else:
+         variables = variable
+
+     df_fmt = df.with_columns(
+         pct_subj_fmt=(
+             pl.when(pl.col("pct_subj").is_null() | pl.col("pct_subj").is_nan())
+             .then(0.0)
+             .otherwise(pl.col("pct_subj"))
+             .round(pct_digit, mode="half_away_from_zero")
+             .cast(pl.String)
          )
      )
  
-     # Calculate max widths for proper alignment
      if max_n_width is None:
-         max_n_width = df_obs.select(pl.col("n_subj").cast(pl.String).str.len_chars().max()).item()
+         max_n_width = df_fmt.select(pl.col("n_subj").cast(pl.String).str.len_chars().max()).item()
  
-     # Infer max percentage width from pct_digit
      max_pct_width = 3 if pct_digit == 0 else 4 + pct_digit
  
-     # Format with padding for alignment
-     df_obs = (
-         df_obs.with_columns(
-             [
-                 pl.col("pct_subj_fmt").str.pad_start(max_pct_width, " "),
-                 pl.col("n_subj")
-                 .cast(pl.String)
-                 .str.pad_start(max_n_width, " ")
-                 .alias("n_subj_fmt"),
-             ]
+     df_fmt = df_fmt.with_columns(
+         [
+             pl.col("pct_subj_fmt").str.pad_start(max_pct_width, " "),
+             pl.col("n_subj").cast(pl.String).str.pad_start(max_n_width, " ").alias("n_subj_fmt"),
+         ]
+     ).with_columns(
+         n_pct_subj_fmt=pl.concat_str(
+             [pl.col("n_subj_fmt"), pl.lit(" ("), pl.col("pct_subj_fmt"), pl.lit(")")]
          )
-         .with_columns(
-             n_pct_subj_fmt=pl.concat_str(
-                 [pl.col("n_subj_fmt"), pl.lit(" ("), pl.col("pct_subj_fmt"), pl.lit(")")]
-             )
+     )
+
+     # Sorting Logic
+     sort_exprs = [pl.col(group)]
+     for var in variables:
+         # 0 for __all__, 1 for values, 2 for config.missing_str
+         sort_key_col = f"__sort_key_{var}__"
+         df_fmt = df_fmt.with_columns(
+             pl.when(pl.col(var) == "__all__")
+             .then(0)
+             .when(pl.col(var) == config.missing_str)
+             .then(2)
+             .otherwise(1)
+             .alias(sort_key_col)
+         )
+         sort_exprs.append(pl.col(sort_key_col))
+         sort_exprs.append(pl.col(var))
+
+     df_fmt = df_fmt.sort(sort_exprs).select(pl.exclude(r"^__sort_key_.*$"))
+
+     # Indentation logic
+     if len(variables) > 0:
+         var_expr = (
+             pl.when(pl.col(variables[0]) == config.missing_str)
+             .then(pl.lit("Missing"))
+             .otherwise(pl.col(variables[0]))
         )
-         .sort(group, variable)
+
+         for i in range(1, len(variables)):
+             var_expr = (
+                 pl.when(pl.col(variables[i]) == "__all__")
+                 .then(var_expr)
+                 .when(pl.col(variables[i]) == config.missing_str)
+                 .then(pl.lit(" " * 4 * i) + pl.lit("Missing"))
+                 .otherwise(pl.lit(" " * 4 * i) + pl.col(variables[i]))
+             )
+         df_fmt = df_fmt.with_columns(var_expr.alias("__variable__"))
+
+     df_fmt = df_fmt.with_row_index(name="__id__", offset=1)
+     return df_fmt
+
+
+ def count_subject_with_observation(
+     population: pl.DataFrame,
+     observation: pl.DataFrame,
+     id: str,
+     group: str,
+     variable: str | list[str],
+     total: bool = True,
+     missing_group: str = "error",
+     pct_digit: int = 1,
+     max_n_width: int | None = None,
+ ) -> pl.DataFrame:
+     """
+     Legacy wrapper for backward compatibility (mostly for tests that rely on the old signature),
+     but now strictly composing the new functions.
+     """
+     df_raw = count_summary_data(
+         population=population,
+         observation=observation,
+         id=id,
+         group=group,
+         variable=variable,
+         total=total,
+         missing_group=missing_group,
      )
  
-     return df_obs
+     return format_summary_table(
+         df=df_raw,
+         group=group,
+         variable=variable,
+         pct_digit=pct_digit,
+         max_n_width=max_n_width,
+     )
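A minimal sketch of the new two-step flow, including the hierarchical variable support added here (toy data and column names are illustrative; the exact rows returned depend on the total/missing handling shown above):

    import polars as pl
    from csrlite.common.count import count_summary_data, format_summary_table

    population = pl.DataFrame({"USUBJID": ["01", "02", "03"], "TRTA": ["A", "A", "B"]})
    observation = pl.DataFrame(
        {"USUBJID": ["01", "03"], "AESOC": ["Cardiac", "Hepatic"], "AEDECOD": ["Angina", "Jaundice"]}
    )

    # Step 1: numeric counts (n_obs, n_subj, n_subj_pop, pct_subj); with a list of
    # variables, each prefix of the hierarchy is stacked, with "__all__" marking rollup rows.
    raw = count_summary_data(
        population=population,
        observation=observation,
        id="USUBJID",
        group="TRTA",
        variable=["AESOC", "AEDECOD"],
    )

    # Step 2: "n ( pct)" display strings, sorting, and 4-space indentation per hierarchy level.
    formatted = format_summary_table(raw, group="TRTA", variable=["AESOC", "AEDECOD"])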