csrlite 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,292 @@
1
+ # pyre-strict
2
+ """
3
+ Inclusion/Exclusion (IE) Table Analysis Functions
4
+
5
+ This module provides a pipeline for IE summary analysis:
6
+ - ie_ard: Generate Analysis Results Data (ARD)
7
+ - ie_df: Transform ARD to display format
8
+ - ie_rtf: Generate formatted RTF output
9
+ - study_plan_to_ie_summary: Batch generation from StudyPlan
10
+ """
11
+
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import polars as pl
16
+
17
+ from ..common.parse import StudyPlanParser
18
+ from ..common.plan import StudyPlan
19
+ from ..common.rtf import create_rtf_table_n_pct
20
+ from ..common.utils import apply_common_filters
21
+
22
+
23
def study_plan_to_ie_summary(
    study_plan: StudyPlan,
) -> list[str]:
    """
    Generate IE Summary Table outputs for all analyses defined in StudyPlan.

    Args:
        study_plan: Study plan providing datasets, analysis plans and the
            output directory.

    Returns:
        Paths of the RTF files that were generated (one per matching plan).
    """
    # Meta data
    analysis_type = "ie_summary"
    output_dir = study_plan.output_dir
    title = "Summary of Protocol Deviations (Inclusion/Exclusion)"

    # Defaults
    criteria_df_name = "adie"

    # Ensure output directory exists
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Initialize parser
    parser = StudyPlanParser(study_plan)

    # Get expanded plan (manual expansion to avoid AttributeError)
    plans = study_plan.study_data.get("plans", [])
    all_specs = []
    for plan_data in plans:
        expanded = study_plan.expander.expand_plan(plan_data)
        for p in expanded:
            all_specs.append(study_plan.expander.create_analysis_spec(p))

    plan_df = pl.DataFrame(all_specs)

    # Keep only the plans for this analysis type (none if column is absent).
    if "analysis" in plan_df.columns:
        ie_plans = plan_df.filter(pl.col("analysis") == analysis_type)
    else:
        ie_plans = pl.DataFrame()

    generated_files = []

    # Iterate over analyses
    for analysis in ie_plans.iter_rows(named=True):
        # Note: IE analysis needs both ADSL (for population/group) and
        # ADIE (for criteria).
        pop_name = analysis.get("population", "enrolled")
        group_kw = analysis.get("group")  # Can be None

        try:
            if group_kw:
                # Load filtered population (ADSL) with group
                adsl, group_col = parser.get_population_data(pop_name, group_kw)
                group_col = group_col.upper()
                grp_suffix = group_col
            else:
                # Load filtered population (ADSL) without group.
                # Manual load + filter since get_population_data requires group.
                (adsl_raw,) = parser.get_datasets("adsl")
                pop_filter = parser.get_population_filter(pop_name)

                adsl, _ = apply_common_filters(
                    population=adsl_raw,
                    observation=None,
                    population_filter=pop_filter,
                    observation_filter=None,
                )

                group_col = None
                grp_suffix = "total"

        except ValueError as e:
            print(f"Error loading population: {e}")
            continue

        # Load ADIE
        try:
            (adie,) = parser.get_datasets(criteria_df_name)
        except ValueError as e:
            print(f"Error loading datasets: {e}")
            continue

        # Output filename.
        # BUG FIX: filename was computed but never used — the path was built
        # with a literal placeholder, so every analysis wrote to the same file.
        filename = f"{analysis_type}_{pop_name}_{grp_suffix}.rtf".lower()
        output_path = f"{output_dir}/{filename}"

        # Generate ARD
        ard = ie_ard(adsl=adsl, adie=adie, group_col=group_col)

        # Generate DF
        df = ie_df(ard)

        # Generate RTF
        ie_rtf(df, output_path, title=title)

        generated_files.append(output_path)

    return generated_files
117
+
118
+
119
def ie_ard(adsl: pl.DataFrame, adie: pl.DataFrame, group_col: str | None = None) -> pl.DataFrame:
    """
    Build the Analysis Results Data (ARD) for the IE summary table.

    Rows produced, in order:
      - Total Screening Failures
      - Exclusion Criteria Met (header) followed by one detail row per PARAM
      - Inclusion Criteria Not Met (header) followed by one detail row per PARAM

    Args:
        adsl: Subject-level data; must contain USUBJID and, when group_col is
            given, that grouping column.
        adie: Criteria records (USUBJID / PARAMCAT / PARAM).
        group_col: Optional grouping column; when None a single dummy
            "Total" group is used.

    Returns:
        DataFrame with label / indent / is_header columns plus count_<g> and
        pct_<g> columns for every group value. Percentages are relative to the
        number of unique screening-failure subjects in each group.
    """
    # Use a dummy "Total" group when no grouping is requested.
    grp: str = group_col if group_col else "Total"

    # Attach group info to the criteria records via an inner join on subject.
    keep_cols = ["USUBJID"] + ([group_col] if group_col else [])
    merged: pl.DataFrame = adie.join(adsl.select(keep_cols), on="USUBJID", how="inner")

    if not group_col:
        # Materialize the dummy group column.
        merged = merged.with_columns(pl.lit("Total").alias("Total"))

    rows: list[dict[str, Any]] = []

    # Distinct group values from ADSL (sorted, nulls dropped).
    grp_values: list[str]
    if group_col:
        raw_values: list[str | None] = sorted(adsl.select(group_col).unique().to_series().to_list())
        grp_values = [v for v in raw_values if v is not None]
    else:
        grp_values = ["Total"]

    # Denominator per group: unique subjects present in ADIE
    # (a subject may match several criteria, hence n_unique).
    denominators: dict[str, int] = {
        rec[grp]: rec["count"]
        for rec in merged.group_by(grp)
        .agg(pl.col("USUBJID").n_unique().alias("count"))
        .iter_rows(named=True)
    }

    def emit(
        label: str, filter_expr: pl.Expr | None = None, is_header: bool = False, indent: int = 0
    ) -> None:
        # Append one ARD record: label plus n and pct for every group.
        record: dict[str, Any] = {"label": label, "indent": indent, "is_header": is_header}

        for value in grp_values:
            subset = merged.filter(pl.col(grp) == value)
            if filter_expr is not None:
                subset = subset.filter(filter_expr)

            n = subset.select("USUBJID").n_unique()
            denom = denominators.get(value, 0)

            record[f"count_{value}"] = n
            record[f"pct_{value}"] = (n / denom * 100) if denom > 0 else 0.0

        rows.append(record)

    # 1. Total Screening Failures
    emit("Total Screening Failures")

    # 2. Exclusion Criteria Met + per-criterion detail rows
    excl = pl.col("PARAMCAT") == "EXCLUSION CRITERIA MET"
    emit("Exclusion Criteria Met", excl, is_header=True, indent=1)
    for param in merged.filter(excl).select("PARAM").unique().sort("PARAM").to_series().to_list():
        emit(param, excl & (pl.col("PARAM") == param), indent=2)

    # 3. Inclusion Criteria Not Met + per-criterion detail rows
    incl = pl.col("PARAMCAT") == "INCLUSION CRITERIA NOT MET"
    emit("Inclusion Criteria Not Met", incl, is_header=True, indent=1)
    for param in merged.filter(incl).select("PARAM").unique().sort("PARAM").to_series().to_list():
        emit(param, incl & (pl.col("PARAM") == param), indent=2)

    return pl.DataFrame(rows)
219
+
220
+
221
def ie_df(ard: pl.DataFrame) -> pl.DataFrame:
    """
    Transform the ARD into a display-ready DataFrame.

    Produces a "Criteria" label column (indented with spaces per indent
    level) plus one "n (pct)" string column per group.
    """
    # Recover group names from the count_* columns.
    groups = [c.replace("count_", "") for c in ard.columns if c.startswith("count_")]

    # Label column: prefix the label with spaces according to its indent level.
    exprs = [
        (pl.lit(" ").repeat_by(pl.col("indent")).list.join("") + pl.col("label")).alias("Criteria")
    ]

    for name in groups:
        # Polars expressions have no f-string formatting, so build the
        # "n (pct)" string by casting and concatenating pieces.
        n_expr = pl.col(f"count_{name}").cast(pl.Utf8)
        pct_expr = pl.col(f"pct_{name}").map_elements(lambda x: f"{x:.1f}", return_dtype=pl.Utf8)
        exprs.append((n_expr + " (" + pct_expr + ")").alias(name))

    return ard.select(exprs)
261
+
262
+
263
def ie_rtf(df: pl.DataFrame, output_path: str, title: str = "") -> None:
    """Write the display DataFrame to an RTF table at output_path."""
    total_cols = len(df.columns)

    # First header row: the actual column names.
    header_top = list(df.columns)
    # Second header row: blank under the label column, "n (%)" under groups.
    header_bottom = [""] + ["n (%)"] * (total_cols - 1)
    # Label column takes the combined width of all group columns.
    widths = [float(total_cols - 1)] + [1.0] * (total_cols - 1)

    doc = create_rtf_table_n_pct(
        df=df,
        col_header_1=header_top,
        col_header_2=header_bottom,
        col_widths=widths,
        title=title,
        footnote=None,
        source=None,
    )

    doc.write_rtf(output_path)
@@ -0,0 +1,209 @@
1
+ # pyre-strict
2
+ """
3
+ Medical History (MH) Listing Analysis Functions
4
+ """
5
+
6
+ from pathlib import Path
7
+
8
+ import polars as pl
9
+
10
+ from ..common.parse import StudyPlanParser
11
+ from ..common.plan import StudyPlan
12
+ from ..common.rtf import create_rtf_listing
13
+ from ..common.utils import apply_common_filters
14
+
15
+
16
def mh_listing(
    population: pl.DataFrame,
    observation: pl.DataFrame,
    population_filter: str | None = "SAFFL = 'Y'",
    observation_filter: str | None = "MHOCCUR = 'Y'",
    id: tuple[str, str] = ("USUBJID", "Subject ID"),
    title: list[str] | None = None,
    footnote: list[str] | None = None,
    source: list[str] | None = None,
    output_file: str = "mh_listing.rtf",
    population_columns: list[tuple[str, str]] | None = None,
    observation_columns: list[tuple[str, str]] | None = None,
    sort_columns: list[str] | None = None,
) -> str:
    """
    Generate a Medical History listing RTF and return its output path.

    Pipeline: build the display DataFrame (mh_listing_df) then render it
    to RTF (mh_listing_rtf).
    """
    effective_title = ["Listing of Medical History"] if title is None else title

    # Build the display DataFrame.
    listing = mh_listing_df(
        population=population,
        observation=observation,
        population_filter=population_filter,
        observation_filter=observation_filter,
        id_col=id[0],
        pop_cols=population_columns,
        obs_cols=observation_columns,
        sort_cols=sort_columns,
    )

    # Render to RTF.
    mh_listing_rtf(
        df=listing,
        output_path=output_file,
        title=effective_title,
        footnote=footnote,
        source=source,
    )

    return output_file
52
+
53
+
54
def mh_listing_df(
    population: pl.DataFrame,
    observation: pl.DataFrame,
    population_filter: str | None,
    observation_filter: str | None,
    id_col: str,
    pop_cols: list[tuple[str, str]] | None,
    obs_cols: list[tuple[str, str]] | None,
    sort_cols: list[str] | None,
) -> pl.DataFrame:
    """
    Build the display DataFrame for the MH listing.

    Filters population/observation, joins them on id_col, optionally sorts,
    and selects id + population + observation columns (in that order).

    Raises:
        ValueError: If the filtered observation data is missing.
    """
    # Default display columns when not supplied by the caller.
    pop_cols = [("TRT01A", "Treatment"), ("AGE", "Age"), ("SEX", "Sex")] if pop_cols is None else pop_cols
    obs_cols = (
        [
            ("MHSEQ", "Seq"),
            ("MHBODSYS", "System Organ Class"),
            ("MHDECOD", "Preferred Term"),
            ("MHSTDTC", "Start Date"),
            ("MHENRTPT", "Status"),
        ]
        if obs_cols is None
        else obs_cols
    )

    # Apply the shared population/observation filters.
    adsl, obs_df = apply_common_filters(
        population=population,
        observation=observation,
        population_filter=population_filter,
        observation_filter=observation_filter,
    )

    if obs_df is None:
        raise ValueError("Observation data is missing")

    # Columns to pull from ADSL; make sure the join key is included.
    selected = [name for name, _ in pop_cols]
    if id_col not in selected:
        selected = [id_col] + selected

    merged = obs_df.join(adsl.select(selected), on=id_col, how="inner")

    # Sort by any requested columns that actually exist after the join.
    if sort_cols:
        usable = [c for c in sort_cols if c in merged.columns]
        if usable:
            merged = merged.sort(usable)

    # Final display order: id, then population columns, then observation
    # columns; silently drop any that are absent from the joined frame.
    ordered = [id_col] + [n for n, _ in pop_cols if n != id_col] + [n for n, _ in obs_cols]
    return merged.select([c for c in ordered if c in merged.columns])
117
+
118
+
119
def mh_listing_rtf(
    df: pl.DataFrame,
    output_path: str,
    title: list[str] | str,
    footnote: list[str] | None,
    source: list[str] | None,
) -> None:
    """Render the listing DataFrame to an RTF file; no-op for empty data."""
    # Nothing to render.
    if df.is_empty():
        return

    # Column headers are the DataFrame's own column names (mh_listing_df
    # does not emit display labels), with uniform column widths.
    headers = list(df.columns)

    doc = create_rtf_listing(
        df=df,
        col_header=headers,
        col_widths=[1.0] * len(headers),
        title=title,
        footnote=footnote,
        source=source,
    )

    doc.write_rtf(output_path)
153
+
154
+
155
def study_plan_to_mh_listing(study_plan: StudyPlan) -> list[str]:
    """
    Batch generate MH listings for every matching plan in the StudyPlan.

    Args:
        study_plan: Study plan providing datasets, analysis plans and the
            output directory.

    Returns:
        Paths of the RTF files that were generated (one per matching plan).
    """
    analysis_type = "mh_listing"
    output_dir = study_plan.output_dir

    parser = StudyPlanParser(study_plan)

    # Manual plan expansion (mirrors study_plan_to_ie_summary).
    plans = study_plan.study_data.get("plans", [])
    all_specs = []
    for plan_data in plans:
        expanded = study_plan.expander.expand_plan(plan_data)
        for p in expanded:
            all_specs.append(study_plan.expander.create_analysis_spec(p))

    plan_df = pl.DataFrame(all_specs)

    # Keep only the plans for this analysis type (none if column is absent).
    if "analysis" in plan_df.columns:
        mh_plans = plan_df.filter(pl.col("analysis") == analysis_type)
    else:
        mh_plans = pl.DataFrame()

    generated_files = []

    for analysis in mh_plans.iter_rows(named=True):
        pop_name = analysis.get("population", "enrolled")

        try:
            # Load population; the group argument is required by the API but
            # unused here, hence the dummy "trt01a".
            adsl, _ = parser.get_population_data(pop_name, "trt01a")

            (admh,) = parser.get_datasets("admh")

            # BUG FIX: filename was computed but never used — the path was
            # built with a literal placeholder, so every plan wrote to the
            # same bogus file.
            filename = f"{analysis_type}_{pop_name}.rtf".lower()
            output_path = f"{output_dir}/{filename}"
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            mh_listing(
                population=adsl,
                observation=admh,
                population_filter=None,
                observation_filter=None,  # show all records
                output_file=output_path,
                title=["Listing of Medical History", f"({pop_name} Population)"],
                source=["Source: ADSL, ADMH"],
            )

            generated_files.append(output_path)

        except Exception as e:
            # Best-effort batch: report and continue with remaining plans.
            print(f"Error generating MH listing: {e}")
            continue

    return generated_files