csrlite 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csrlite/__init__.py +16 -8
- csrlite/ae/ae_listing.py +2 -0
- csrlite/ae/ae_specific.py +10 -5
- csrlite/ae/ae_summary.py +4 -2
- csrlite/ae/ae_utils.py +0 -70
- csrlite/common/config.py +34 -0
- csrlite/common/count.py +174 -80
- csrlite/common/plan.py +79 -67
- csrlite/common/rtf.py +85 -0
- csrlite/common/utils.py +4 -4
- csrlite/disposition/disposition.py +126 -95
- {csrlite-0.1.0.dist-info → csrlite-0.2.0.dist-info}/METADATA +7 -7
- csrlite-0.2.0.dist-info/RECORD +19 -0
- csrlite-0.1.0.dist-info/RECORD +0 -17
- {csrlite-0.1.0.dist-info → csrlite-0.2.0.dist-info}/WHEEL +0 -0
- {csrlite-0.1.0.dist-info → csrlite-0.2.0.dist-info}/top_level.txt +0 -0
csrlite/__init__.py
CHANGED
|
@@ -1,18 +1,19 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from .ae.ae_listing import ( # AE listing functions
|
|
3
5
|
ae_listing,
|
|
4
6
|
study_plan_to_ae_listing,
|
|
5
7
|
)
|
|
6
|
-
from .ae.ae_specific import (
|
|
7
|
-
# AE specific functions
|
|
8
|
+
from .ae.ae_specific import ( # AE specific functions
|
|
8
9
|
ae_specific,
|
|
9
10
|
study_plan_to_ae_specific,
|
|
10
11
|
)
|
|
11
|
-
from .ae.ae_summary import (
|
|
12
|
-
# AE summary functions
|
|
12
|
+
from .ae.ae_summary import ( # AE summary functions
|
|
13
13
|
ae_summary,
|
|
14
14
|
study_plan_to_ae_summary,
|
|
15
15
|
)
|
|
16
|
+
from .common.config import config
|
|
16
17
|
from .common.count import (
|
|
17
18
|
count_subject,
|
|
18
19
|
count_subject_with_observation,
|
|
@@ -21,12 +22,19 @@ from .common.parse import (
|
|
|
21
22
|
StudyPlanParser,
|
|
22
23
|
parse_filter_to_sql,
|
|
23
24
|
)
|
|
24
|
-
from .common.plan import (
|
|
25
|
-
# Core classes
|
|
25
|
+
from .common.plan import ( # Core classes
|
|
26
26
|
load_plan,
|
|
27
27
|
)
|
|
28
28
|
from .disposition.disposition import study_plan_to_disposition_summary
|
|
29
29
|
|
|
30
|
+
# Configure logging
|
|
31
|
+
logging.basicConfig(
|
|
32
|
+
level=config.logging_level,
|
|
33
|
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
|
34
|
+
stream=sys.stdout,
|
|
35
|
+
)
|
|
36
|
+
logger = logging.getLogger("csrlite")
|
|
37
|
+
|
|
30
38
|
# Main exports for common usage
|
|
31
39
|
__all__ = [
|
|
32
40
|
# Primary user interface
|
csrlite/ae/ae_listing.py
CHANGED
|
@@ -71,6 +71,8 @@ def ae_listing_ard(
|
|
|
71
71
|
parameter_filter=parameter_filter,
|
|
72
72
|
)
|
|
73
73
|
|
|
74
|
+
assert observation_to_filter is not None
|
|
75
|
+
|
|
74
76
|
# Filter observation to include only subjects in filtered population
|
|
75
77
|
observation_filtered = observation_to_filter.filter(
|
|
76
78
|
pl.col(id_var_name).is_in(population_filtered[id_var_name].to_list())
|
csrlite/ae/ae_specific.py
CHANGED
|
@@ -24,8 +24,9 @@ from rtflite import RTFDocument
|
|
|
24
24
|
from ..common.count import count_subject, count_subject_with_observation
|
|
25
25
|
from ..common.parse import StudyPlanParser
|
|
26
26
|
from ..common.plan import StudyPlan
|
|
27
|
+
from ..common.rtf import create_rtf_table_n_pct
|
|
27
28
|
from ..common.utils import apply_common_filters
|
|
28
|
-
from .ae_utils import
|
|
29
|
+
from .ae_utils import get_ae_parameter_row_labels, get_ae_parameter_title
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
def ae_specific_ard(
|
|
@@ -80,6 +81,8 @@ def ae_specific_ard(
|
|
|
80
81
|
parameter_filter=parameter_filter,
|
|
81
82
|
)
|
|
82
83
|
|
|
84
|
+
assert observation_to_filter is not None
|
|
85
|
+
|
|
83
86
|
# Filter observation to include only subjects in filtered population
|
|
84
87
|
observation_filtered = observation_to_filter.filter(
|
|
85
88
|
pl.col(id_var_name).is_in(population_filtered[id_var_name].to_list())
|
|
@@ -114,7 +117,9 @@ def ae_specific_ard(
|
|
|
114
117
|
|
|
115
118
|
# Get population with event indicator
|
|
116
119
|
pop_with_indicator = population_filtered.with_columns(
|
|
117
|
-
pl.col(id_var_name)
|
|
120
|
+
pl.col(id_var_name)
|
|
121
|
+
.is_in(subjects_with_events[id_var_name].to_list())
|
|
122
|
+
.alias("__has_event__")
|
|
118
123
|
)
|
|
119
124
|
|
|
120
125
|
# Count subjects with and without events using count_subject_with_observation
|
|
@@ -129,7 +134,7 @@ def ae_specific_ard(
|
|
|
129
134
|
)
|
|
130
135
|
|
|
131
136
|
# Extract 'with' counts
|
|
132
|
-
n_with = event_counts.filter(pl.col("__has_event__")).select(
|
|
137
|
+
n_with = event_counts.filter(pl.col("__has_event__") == "true").select(
|
|
133
138
|
[
|
|
134
139
|
pl.lit(n_with_label).alias("__index__"),
|
|
135
140
|
pl.col(group_var_name).cast(pl.String).alias("__group__"),
|
|
@@ -138,7 +143,7 @@ def ae_specific_ard(
|
|
|
138
143
|
)
|
|
139
144
|
|
|
140
145
|
# Extract 'without' counts
|
|
141
|
-
n_without = event_counts.filter(
|
|
146
|
+
n_without = event_counts.filter(pl.col("__has_event__") == "false").select(
|
|
142
147
|
[
|
|
143
148
|
pl.lit(n_without_label).alias("__index__"),
|
|
144
149
|
pl.col(group_var_name).cast(pl.String).alias("__group__"),
|
|
@@ -254,7 +259,7 @@ def ae_specific_rtf(
|
|
|
254
259
|
else:
|
|
255
260
|
col_widths = col_rel_width
|
|
256
261
|
|
|
257
|
-
return
|
|
262
|
+
return create_rtf_table_n_pct(
|
|
258
263
|
df=df_rtf,
|
|
259
264
|
col_header_1=col_header_1,
|
|
260
265
|
col_header_2=col_header_2,
|
csrlite/ae/ae_summary.py
CHANGED
|
@@ -21,8 +21,8 @@ from rtflite import RTFDocument
|
|
|
21
21
|
from ..common.count import count_subject, count_subject_with_observation
|
|
22
22
|
from ..common.parse import StudyPlanParser
|
|
23
23
|
from ..common.plan import StudyPlan
|
|
24
|
+
from ..common.rtf import create_rtf_table_n_pct
|
|
24
25
|
from ..common.utils import apply_common_filters
|
|
25
|
-
from .ae_utils import create_ae_rtf_table
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def study_plan_to_ae_summary(
|
|
@@ -258,6 +258,8 @@ def ae_summary_ard(
|
|
|
258
258
|
observation_filter=observation_filter,
|
|
259
259
|
)
|
|
260
260
|
|
|
261
|
+
assert observation_to_filter is not None
|
|
262
|
+
|
|
261
263
|
# Filter observation data to include only subjects in the filtered population
|
|
262
264
|
# Process all variables in the list
|
|
263
265
|
observation_filtered_list = []
|
|
@@ -388,7 +390,7 @@ def ae_summary_rtf(
|
|
|
388
390
|
else:
|
|
389
391
|
col_widths = col_rel_width
|
|
390
392
|
|
|
391
|
-
return
|
|
393
|
+
return create_rtf_table_n_pct(
|
|
392
394
|
df=df_rtf,
|
|
393
395
|
col_header_1=col_header_1,
|
|
394
396
|
col_header_2=col_header_2,
|
csrlite/ae/ae_utils.py
CHANGED
|
@@ -1,9 +1,5 @@
|
|
|
1
|
-
# pyre-strict
|
|
2
1
|
from typing import Any
|
|
3
2
|
|
|
4
|
-
import polars as pl
|
|
5
|
-
from rtflite import RTFBody, RTFColumnHeader, RTFDocument, RTFFootnote, RTFPage, RTFSource, RTFTitle
|
|
6
|
-
|
|
7
3
|
|
|
8
4
|
def get_ae_parameter_title(param: Any, prefix: str = "Participants With") -> str:
|
|
9
5
|
"""
|
|
@@ -64,69 +60,3 @@ def get_ae_parameter_row_labels(param: Any) -> tuple[str, str]:
|
|
|
64
60
|
without_label = " " + " ".join(without_label.split())
|
|
65
61
|
|
|
66
62
|
return (with_label, without_label)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def create_ae_rtf_table(
|
|
70
|
-
df: pl.DataFrame,
|
|
71
|
-
col_header_1: list[str],
|
|
72
|
-
col_header_2: list[str] | None,
|
|
73
|
-
col_widths: list[float] | None,
|
|
74
|
-
title: list[str] | str,
|
|
75
|
-
footnote: list[str] | str | None,
|
|
76
|
-
source: list[str] | str | None,
|
|
77
|
-
borders_2: bool = True,
|
|
78
|
-
orientation: str = "landscape",
|
|
79
|
-
) -> RTFDocument:
|
|
80
|
-
"""
|
|
81
|
-
Create a standardized RTF table document with 1 or 2 header rows.
|
|
82
|
-
"""
|
|
83
|
-
n_cols = len(df.columns)
|
|
84
|
-
|
|
85
|
-
# Calculate column widths if None - simple default
|
|
86
|
-
if col_widths is None:
|
|
87
|
-
col_widths = [1] * n_cols
|
|
88
|
-
|
|
89
|
-
# Normalize metadata
|
|
90
|
-
title_list = [title] if isinstance(title, str) else title
|
|
91
|
-
footnote_list = [footnote] if isinstance(footnote, str) else (footnote or [])
|
|
92
|
-
source_list = [source] if isinstance(source, str) else (source or [])
|
|
93
|
-
|
|
94
|
-
headers = [
|
|
95
|
-
RTFColumnHeader(
|
|
96
|
-
text=col_header_1,
|
|
97
|
-
col_rel_width=col_widths,
|
|
98
|
-
text_justification=["l"] + ["c"] * (n_cols - 1),
|
|
99
|
-
)
|
|
100
|
-
]
|
|
101
|
-
|
|
102
|
-
if col_header_2:
|
|
103
|
-
h2_kwargs = {
|
|
104
|
-
"text": col_header_2,
|
|
105
|
-
"col_rel_width": col_widths,
|
|
106
|
-
"text_justification": ["l"] + ["c"] * (n_cols - 1),
|
|
107
|
-
}
|
|
108
|
-
if borders_2:
|
|
109
|
-
h2_kwargs["border_left"] = ["single"]
|
|
110
|
-
h2_kwargs["border_top"] = [""]
|
|
111
|
-
|
|
112
|
-
headers.append(RTFColumnHeader(**h2_kwargs))
|
|
113
|
-
|
|
114
|
-
rtf_components: dict[str, Any] = {
|
|
115
|
-
"df": df,
|
|
116
|
-
"rtf_page": RTFPage(orientation=orientation),
|
|
117
|
-
"rtf_title": RTFTitle(text=title_list),
|
|
118
|
-
"rtf_column_header": headers,
|
|
119
|
-
"rtf_body": RTFBody(
|
|
120
|
-
col_rel_width=col_widths,
|
|
121
|
-
text_justification=["l"] + ["c"] * (n_cols - 1),
|
|
122
|
-
border_left=["single"] * n_cols,
|
|
123
|
-
),
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
if footnote_list:
|
|
127
|
-
rtf_components["rtf_footnote"] = RTFFootnote(text=footnote_list)
|
|
128
|
-
|
|
129
|
-
if source_list:
|
|
130
|
-
rtf_components["rtf_source"] = RTFSource(text=source_list)
|
|
131
|
-
|
|
132
|
-
return RTFDocument(**rtf_components)
|
csrlite/common/config.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# pyre-strict
|
|
2
|
+
"""
|
|
3
|
+
Central configuration for csrlite.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Literal, Optional
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CsrLiteConfig(BaseModel):
    """
    Library-wide settings for csrlite, held as a validated pydantic model.

    Attributes are documented through their ``Field`` descriptions below.
    """

    # Pydantic setting: re-run field validation whenever an attribute is
    # assigned after construction, not only when the model is created.
    model_config = ConfigDict(validate_assignment=True)

    # --- Default column names ---
    id_col: str = Field("USUBJID", description="Subject Identifier Column")
    group_col: Optional[str] = Field(None, description="Treatment Group Column")

    # --- Placeholder used for missing string values ---
    missing_str: str = Field(
        "__missing__",
        description="String to represent missing string values",
    )

    # --- Logging verbosity (restricted to the listed level names) ---
    logging_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(
        "INFO",
        description="Default logging level",
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Global configuration instance
|
|
34
|
+
config = CsrLiteConfig()
|
csrlite/common/count.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# pyre-strict
|
|
2
2
|
import polars as pl
|
|
3
3
|
|
|
4
|
+
from .config import config
|
|
5
|
+
|
|
4
6
|
|
|
5
7
|
def _to_pop(
|
|
6
8
|
population: pl.DataFrame,
|
|
@@ -48,14 +50,11 @@ def count_subject(
|
|
|
48
50
|
Counts subjects by group and optionally includes a 'Total' column.
|
|
49
51
|
|
|
50
52
|
Args:
|
|
51
|
-
population (pl.DataFrame): DataFrame containing subject population data
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
Defaults to True.
|
|
57
|
-
missing_group (str, optional): How to handle missing values in the group column.
|
|
58
|
-
"error" will raise a ValueError. Defaults to "error".
|
|
53
|
+
population (pl.DataFrame): DataFrame containing subject population data.
|
|
54
|
+
id (str): The name of the subject ID column.
|
|
55
|
+
group (str): The name of the treatment group column.
|
|
56
|
+
total (bool, optional): If True, adds a 'Total' group. Defaults to True.
|
|
57
|
+
missing_group (str, optional): How to handle missing values ("error", "ignore").
|
|
59
58
|
|
|
60
59
|
Returns:
|
|
61
60
|
pl.DataFrame: A DataFrame with subject counts ('n_subj_pop') for each group.
|
|
@@ -72,41 +71,33 @@ def count_subject(
|
|
|
72
71
|
return pop.group_by(group).agg(pl.len().alias("n_subj_pop")).sort(group)
|
|
73
72
|
|
|
74
73
|
|
|
75
|
-
def
|
|
74
|
+
def count_summary_data(
|
|
76
75
|
population: pl.DataFrame,
|
|
77
76
|
observation: pl.DataFrame,
|
|
78
77
|
id: str,
|
|
79
78
|
group: str,
|
|
80
|
-
variable: str,
|
|
79
|
+
variable: str | list[str],
|
|
81
80
|
total: bool = True,
|
|
82
81
|
missing_group: str = "error",
|
|
83
|
-
pct_digit: int = 1,
|
|
84
|
-
max_n_width: int | None = None,
|
|
85
82
|
) -> pl.DataFrame:
|
|
86
83
|
"""
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
Args:
|
|
91
|
-
population (pl.DataFrame): DataFrame containing subject population data,
|
|
92
|
-
must include 'id' and 'group' columns.
|
|
93
|
-
observation (pl.DataFrame): DataFrame containing observation data,
|
|
94
|
-
must include 'id' and 'variable' columns.
|
|
95
|
-
id (str): The name of the subject ID column (e.g., "USUBJID").
|
|
96
|
-
group (str): The name of the treatment group column (e.g., "TRT01A").
|
|
97
|
-
variable (str): The name of the variable to count observations for (e.g., "AESOC").
|
|
98
|
-
total (bool, optional): Not yet implemented. Defaults to True.
|
|
99
|
-
missing_group (str, optional): How to handle missing values in the group column.
|
|
100
|
-
"error" will raise a ValueError. Defaults to "error".
|
|
101
|
-
pct_digit (int, optional): Number of decimal places for percentage formatting.
|
|
102
|
-
Defaults to 1.
|
|
103
|
-
max_n_width (int, optional): Fixed width for subject count formatting. If None, inferred
|
|
104
|
-
from data. Defaults to None.
|
|
84
|
+
Generates numeric summary data (counts and percentages) for observations.
|
|
85
|
+
Does NOT perform string formatting.
|
|
105
86
|
|
|
106
87
|
Returns:
|
|
107
|
-
pl.DataFrame:
|
|
108
|
-
|
|
88
|
+
pl.DataFrame: DataFrame with columns:
|
|
89
|
+
- [group]: Group column
|
|
90
|
+
- [variable]: Variable columns
|
|
91
|
+
- n_obs: Count of observations
|
|
92
|
+
- n_subj: Count of unique subjects with observation
|
|
93
|
+
- n_subj_pop: Total subjects in group
|
|
94
|
+
- pct_subj: Percentage of subjects (0-100)
|
|
109
95
|
"""
|
|
96
|
+
# Normalize variable to list
|
|
97
|
+
if isinstance(variable, str):
|
|
98
|
+
variables = [variable]
|
|
99
|
+
else:
|
|
100
|
+
variables = variable
|
|
110
101
|
|
|
111
102
|
# prepare data
|
|
112
103
|
pop = _to_pop(
|
|
@@ -117,10 +108,14 @@ def count_subject_with_observation(
|
|
|
117
108
|
missing_group=missing_group,
|
|
118
109
|
)
|
|
119
110
|
|
|
120
|
-
|
|
111
|
+
# Select all required columns (id + all variables)
|
|
112
|
+
obs = observation.select(id, *variables).join(pop, on=id, how="left")
|
|
113
|
+
|
|
114
|
+
for var in variables:
|
|
115
|
+
obs = obs.with_columns(pl.col(var).cast(pl.String).fill_null(config.missing_str))
|
|
121
116
|
|
|
117
|
+
# Check for IDs in observation that are not in population
|
|
122
118
|
if not obs[id].is_in(pop[id].to_list()).all():
|
|
123
|
-
# Get IDs that are in obs but not in pop
|
|
124
119
|
missing_ids = (
|
|
125
120
|
obs.filter(~pl.col(id).is_in(pop[id].to_list()))
|
|
126
121
|
.select(id)
|
|
@@ -129,8 +124,8 @@ def count_subject_with_observation(
|
|
|
129
124
|
.to_list()
|
|
130
125
|
)
|
|
131
126
|
raise ValueError(
|
|
132
|
-
f"Some '{id}' values in the observation DataFrame are not present in the population "
|
|
133
|
-
f"
|
|
127
|
+
f"Some '{id}' values in the observation DataFrame are not present in the population: "
|
|
128
|
+
f"{missing_ids}"
|
|
134
129
|
)
|
|
135
130
|
|
|
136
131
|
df_pop = count_subject(
|
|
@@ -141,59 +136,158 @@ def count_subject_with_observation(
|
|
|
141
136
|
missing_group=missing_group,
|
|
142
137
|
)
|
|
143
138
|
|
|
144
|
-
|
|
145
|
-
df_obs_counts = obs.group_by(group, variable).agg(
|
|
146
|
-
pl.len().alias("n_obs"), pl.n_unique(id).alias("n_subj")
|
|
147
|
-
)
|
|
139
|
+
all_levels_df = []
|
|
148
140
|
|
|
149
|
-
#
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
.
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
141
|
+
# Iterate through hierarchies
|
|
142
|
+
for i in range(1, len(variables) + 1):
|
|
143
|
+
current_vars = variables[:i]
|
|
144
|
+
|
|
145
|
+
# Aggregation
|
|
146
|
+
df_obs_counts = obs.group_by(group, *current_vars).agg(
|
|
147
|
+
pl.len().alias("n_obs"), pl.n_unique(id).alias("n_subj")
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# Cross join for all combinations
|
|
151
|
+
unique_groups = df_pop.select(group)
|
|
152
|
+
unique_variables = obs.select(current_vars).unique()
|
|
153
|
+
all_combinations = unique_groups.join(unique_variables, how="cross")
|
|
154
|
+
|
|
155
|
+
# Join back
|
|
156
|
+
df_level = (
|
|
157
|
+
all_combinations.join(df_obs_counts, on=[group, *current_vars], how="left")
|
|
158
|
+
.join(df_pop, on=group, how="left")
|
|
159
|
+
.with_columns([pl.col("n_obs").fill_null(0), pl.col("n_subj").fill_null(0)])
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
df_level = df_level.with_columns([pl.col(c).cast(pl.String) for c in current_vars])
|
|
163
|
+
|
|
164
|
+
# Add missing columns with "__all__"
|
|
165
|
+
for var in variables:
|
|
166
|
+
if var not in df_level.columns:
|
|
167
|
+
df_level = df_level.with_columns(pl.lit("__all__").cast(pl.String).alias(var))
|
|
168
|
+
|
|
169
|
+
all_levels_df.append(df_level)
|
|
170
|
+
|
|
171
|
+
# Stack
|
|
172
|
+
df_obs = pl.concat(all_levels_df, how="diagonal")
|
|
173
|
+
|
|
174
|
+
# Calculate percentage
|
|
175
|
+
df_obs = df_obs.with_columns(pct_subj=(pl.col("n_subj") / pl.col("n_subj_pop") * 100))
|
|
176
|
+
|
|
177
|
+
return df_obs
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def format_summary_table(
    df: pl.DataFrame,
    group: str,
    variable: str | list[str],
    pct_digit: int = 1,
    max_n_width: int | None = None,
) -> pl.DataFrame:
    """
    Turn numeric summary counts into padded display strings, then sort and label rows.

    Args:
        df (pl.DataFrame): Summary data. The columns read here are ``n_subj``,
            ``pct_subj``, the ``group`` column and every column named in ``variable``.
        group (str): Name of the group column; it is the first sort key.
        variable (str | list[str]): One variable column name, or several ordered
            from outermost to innermost level.
        pct_digit (int, optional): Decimal places used when rounding ``pct_subj``.
            Defaults to 1.
        max_n_width (int | None, optional): Width to which ``n_subj`` is left-padded.
            When None, the longest ``n_subj`` string in ``df`` is used.

    Returns:
        pl.DataFrame: ``df`` with the added columns ``pct_subj_fmt``, ``n_subj_fmt``,
        ``n_pct_subj_fmt``, ``__variable__`` (only when at least one variable is
        given) and a 1-based ``__id__`` row index.
    """
    variables = [variable] if isinstance(variable, str) else variable

    # Null/NaN percentages are displayed as 0.0 before rounding and stringifying.
    pct_col = pl.col("pct_subj")
    pct_as_text = (
        pl.when(pct_col.is_null() | pct_col.is_nan())
        .then(0.0)
        .otherwise(pct_col)
        .round(pct_digit, mode="half_away_from_zero")
        .cast(pl.String)
    )
    df_fmt = df.with_columns(pct_subj_fmt=pct_as_text)

    n_as_text = pl.col("n_subj").cast(pl.String)
    if max_n_width is None:
        # Infer the count width from the longest rendered count in the data.
        max_n_width = df_fmt.select(n_as_text.str.len_chars().max()).item()

    # Three characters without decimals; otherwise 3 + decimal point + pct_digit digits.
    max_pct_width = 3 if pct_digit == 0 else 4 + pct_digit

    df_fmt = df_fmt.with_columns(
        [
            pl.col("pct_subj_fmt").str.pad_start(max_pct_width, " "),
            n_as_text.str.pad_start(max_n_width, " ").alias("n_subj_fmt"),
        ]
    )
    df_fmt = df_fmt.with_columns(
        n_pct_subj_fmt=pl.concat_str(
            [pl.col("n_subj_fmt"), pl.lit(" ("), pl.col("pct_subj_fmt"), pl.lit(")")]
        )
    )

    # Sort by group, then per variable: "__all__" rows first (key 0), regular
    # values next (key 1), the missing placeholder last (key 2).
    ordering = [pl.col(group)]
    for name in variables:
        key_name = f"__sort_key_{name}__"
        rank = (
            pl.when(pl.col(name) == "__all__")
            .then(0)
            .when(pl.col(name) == config.missing_str)
            .then(2)
            .otherwise(1)
        )
        df_fmt = df_fmt.with_columns(rank.alias(key_name))
        ordering += [pl.col(key_name), pl.col(name)]

    # The temporary sort-key columns are dropped once the rows are ordered.
    df_fmt = df_fmt.sort(ordering).select(pl.exclude(r"^__sort_key_.*$"))

    # Build the display label: the first variable is shown as-is (placeholder ->
    # "Missing"); each deeper variable replaces the label, indented by 4 * level
    # copies of the pad string, unless it is "__all__", which keeps the outer label.
    if variables:
        first = variables[0]
        label = (
            pl.when(pl.col(first) == config.missing_str)
            .then(pl.lit("Missing"))
            .otherwise(pl.col(first))
        )
        for depth, name in enumerate(variables[1:], start=1):
            pad = pl.lit(" " * 4 * depth)
            label = (
                pl.when(pl.col(name) == "__all__")
                .then(label)
                .when(pl.col(name) == config.missing_str)
                .then(pad + pl.lit("Missing"))
                .otherwise(pad + pl.col(name))
            )
        df_fmt = df_fmt.with_columns(label.alias("__variable__"))

    return df_fmt.with_row_index(name="__id__", offset=1)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def count_subject_with_observation(
    population: pl.DataFrame,
    observation: pl.DataFrame,
    id: str,
    group: str,
    variable: str | list[str],
    total: bool = True,
    missing_group: str = "error",
    pct_digit: int = 1,
    max_n_width: int | None = None,
) -> pl.DataFrame:
    """
    Backward-compatible entry point that keeps the old single-call signature.

    It computes the numeric summary with ``count_summary_data`` and passes the
    result straight to ``format_summary_table``; it adds no logic of its own.

    Args:
        population (pl.DataFrame): Forwarded to ``count_summary_data``.
        observation (pl.DataFrame): Forwarded to ``count_summary_data``.
        id (str): Subject ID column name, forwarded to ``count_summary_data``.
        group (str): Group column name, forwarded to both functions.
        variable (str | list[str]): Variable column name(s), forwarded to both functions.
        total (bool, optional): Forwarded to ``count_summary_data``. Defaults to True.
        missing_group (str, optional): Forwarded to ``count_summary_data``.
            Defaults to "error".
        pct_digit (int, optional): Forwarded to ``format_summary_table``. Defaults to 1.
        max_n_width (int | None, optional): Forwarded to ``format_summary_table``.
            Defaults to None.

    Returns:
        pl.DataFrame: The output of ``format_summary_table``.
    """
    return format_summary_table(
        df=count_summary_data(
            population=population,
            observation=observation,
            id=id,
            group=group,
            variable=variable,
            total=total,
            missing_group=missing_group,
        ),
        group=group,
        variable=variable,
        pct_digit=pct_digit,
        max_n_width=max_n_width,
    )
|