csrlite 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -1,332 +1,332 @@
- # pyre-strict
- """
- Disposition Table 1.1 Analysis Functions
-
- This module provides a pipeline for Disposition Table 1.1 summary analysis:
- - disposition_ard: Generate Analysis Results Data (ARD)
- - disposition_df: Transform ARD to display format
- - disposition_rtf: Generate formatted RTF output
- - disposition: Complete pipeline wrapper
- - study_plan_to_disposition_summary: Batch generation from StudyPlan
- """
-
- from pathlib import Path
-
- import polars as pl
- from rtflite import RTFDocument
-
- from ..common.count import count_subject, count_subject_with_observation
- from ..common.parse import StudyPlanParser
- from ..common.plan import StudyPlan
- from ..common.rtf import create_rtf_table_n_pct
- from ..common.utils import apply_common_filters
-
-
- def study_plan_to_disposition_summary(
-     study_plan: StudyPlan,
- ) -> list[str]:
-     """
-     Generate Disposition Summary Table outputs for all analyses defined in StudyPlan.
-     """
-     # Meta data
-     analysis_type = "disposition_summary"
-     output_dir = study_plan.output_dir
-     title = "Disposition of Participants"
-     footnote = ["Percentages are based on the number of enrolled participants."]
-     source = None
-
-     population_df_name = "adsl"
-
-     id = ("USUBJID", "Subject ID")
-     ds_term = ("EOSSTT", "Disposition Status")
-     dist_reason_term = ("DCSREAS", "Discontinued Reason")
-
-     total = True
-     missing_group = "error"
-
-     # Create output directory
-     Path(output_dir).mkdir(parents=True, exist_ok=True)
-
-     # Initialize parser
-     parser = StudyPlanParser(study_plan)
-
-     # Get expanded plan DataFrame
-     plan_df = study_plan.get_plan_df()
-
-     # Filter for disposition analyses
-     disp_plans = plan_df.filter(pl.col("analysis") == analysis_type)
-
-     rtf_files = []
-
-     for row in disp_plans.iter_rows(named=True):
-         population = row["population"]
-         group = row.get("group")
-         title_text = title
-
-         # Get datasets
-         (population_df,) = parser.get_datasets(population_df_name)
-
-         # Get filters
-         population_filter = parser.get_population_filter(population)
-
-         # Get group info (optional)
-         if group is not None:
-             group_var_name, group_labels = parser.get_group_info(group)
-             group_var_label = group_labels[0] if group_labels else group_var_name
-             group_tuple = (group_var_name, group_var_label)
-         else:
-             # When no group specified, use a dummy group column for overall counts
-             group_tuple = None
-
-         # Build title
-         title_parts = [title_text]
-         pop_kw = study_plan.keywords.populations.get(population)
-         if pop_kw and pop_kw.label:
-             title_parts.append(pop_kw.label)
-
-         # Build output filename
-         group_suffix = f"_{group}" if group else ""
-         filename = f"{analysis_type}_{population}{group_suffix}.rtf"
-         output_file = str(Path(output_dir) / filename)
-
-         rtf_path = disposition(
-             population=population_df,
-             population_filter=population_filter,
-             id=id,
-             group=group_tuple,
-             ds_term=ds_term,
-             dist_reason_term=dist_reason_term,
-             title=title_parts,
-             footnote=footnote,
-             source=source,
-             output_file=output_file,
-             total=total,
-             missing_group=missing_group,
-         )
-         rtf_files.append(rtf_path)
-
-     return rtf_files
-
-
- def disposition(
-     population: pl.DataFrame,
-     population_filter: str | None,
-     id: tuple[str, str],
-     group: tuple[str, str] | None,
-     ds_term: tuple[str, str],
-     dist_reason_term: tuple[str, str],
-     title: list[str],
-     footnote: list[str] | None,
-     source: list[str] | None,
-     output_file: str,
-     total: bool = True,
-     col_rel_width: list[float] | None = None,
-     missing_group: str = "error",
- ) -> str:
-     """
-     Complete Disposition Summary Table pipeline wrapper.
-     """
-     # Step 1: Generate ARD
-     ard = disposition_ard(
-         population=population,
-         population_filter=population_filter,
-         id=id,
-         group=group,
-         ds_term=ds_term,
-         dist_reason_term=dist_reason_term,
-         total=total,
-         missing_group=missing_group,
-     )
-
-     # Step 2: Transform to display format
-     df = disposition_df(ard)
-
-     # Step 3: Generate RTF
-     rtf_doc = disposition_rtf(
-         df=df,
-         title=title,
-         footnote=footnote,
-         source=source,
-         col_rel_width=col_rel_width,
-     )
-     rtf_doc.write_rtf(output_file)
-
-     return output_file
-
-
- def _validate_disposition_data(df: pl.DataFrame, ds_var: str, reason_var: str) -> None:
-     """
-     Validate disposition data integrity.
-
-     Rules:
-     1. ds_var must be {Completed, Ongoing, Discontinued} and non-null.
-     2. If ds_var is Completed/Ongoing, reason_var must be the same as ds_var or null.
-     3. If ds_var is Discontinued, reason_var must be non-null and not Completed/Ongoing.
-     """
-     # Rule 1: Valid Statuses
-     valid_statuses = ["Completed", "Ongoing", "Discontinued"]
-     if df[ds_var].is_null().any():
-         raise ValueError(f"Found null values in disposition status column '{ds_var}'")
-
-     invalid_status = df.filter(~pl.col(ds_var).is_in(valid_statuses))
-     if not invalid_status.is_empty():
-         bad_values = invalid_status[ds_var].unique().to_list()
-         raise ValueError(
-             f"Invalid disposition statuses found: {bad_values}. Must be one of {valid_statuses}"
-         )
-
-     # Rule 2: Completed/Ongoing implies Reason is Null OR equal to Status
-     inconsistent_completed = df.filter(
-         (pl.col(ds_var).is_in(["Completed", "Ongoing"]))
-         & (~pl.col(reason_var).is_null())
-         & (pl.col(reason_var) != pl.col(ds_var))
-     )
-     if not inconsistent_completed.is_empty():
-         raise ValueError(
-             f"Found subjects with status 'Completed' or 'Ongoing' with mismatched "
-             f"discontinuation reason in '{reason_var}'. Reason must be Null or match Status."
-         )
-
-     # Rule 3: Discontinued implies Reason is NOT Null AND NOT {Completed, Ongoing}
-     invalid_discontinued = df.filter(
-         (pl.col(ds_var) == "Discontinued")
-         & ((pl.col(reason_var).is_null()) | (pl.col(reason_var).is_in(["Completed", "Ongoing"])))
-     )
-     if not invalid_discontinued.is_empty():
-         raise ValueError(
-             f"Found subjects with status 'Discontinued' but missing or invalid "
-             f"discontinuation reason in '{reason_var}'"
-         )
-
-
- def disposition_ard(
-     population: pl.DataFrame,
-     population_filter: str | None,
-     id: tuple[str, str],
-     group: tuple[str, str] | None,
-     ds_term: tuple[str, str],
-     dist_reason_term: tuple[str, str],
-     total: bool,
-     missing_group: str,
-     pop_var_name: str = "Enrolled",
- ) -> pl.DataFrame:
-     """
-     Generate ARD for Summary Table.
-     """
-     # Unpack variables
-     ds_var_name, _ = ds_term
-     dist_reason_var_name, _ = dist_reason_term
-     id_var_name, _ = id
-
-     # Validate Data
-     _validate_disposition_data(population, ds_var_name, dist_reason_var_name)
-
-     # Apply common filters
-     population_filtered, _ = apply_common_filters(
-         population=population,
-         observation=None,
-         population_filter=population_filter,
-         observation_filter=None,
-     )
-
-     if group:
-         group_var_name, _ = group
-     else:
-         # Create dummy group for overall analysis
-         group_var_name = "Overall"
-         total = False
-         population_filtered = population_filtered.with_columns(
-             pl.lit("Overall").alias(group_var_name)
-         )
-
-     # Enrolled Subjects
-     n_pop_counts = count_subject(
-         population=population_filtered,
-         id=id_var_name,
-         group=group_var_name,
-         total=total,
-         missing_group=missing_group,
-     )
-
-     n_pop = n_pop_counts.select(
-         pl.lit(pop_var_name).alias("__index__"),
-         pl.col(group_var_name).cast(pl.String).alias("__group__"),
-         pl.col("n_subj_pop").cast(pl.String).alias("__value__"),
-     )
-
-     # Hierarchical Counts for Status and Reason
-     # Level 1: Status (Completed, Ongoing, Discontinued)
-     # Level 2: Status + Reason (Only relevant for Discontinued)
-     n_dict = count_subject_with_observation(
-         population=population_filtered,
-         observation=population_filtered,
-         id=id_var_name,
-         group=group_var_name,
-         variable=[ds_var_name, dist_reason_var_name],
-         total=total,
-         missing_group=missing_group,
-     )
-
-     # Filter and format
-     # Identify rows:
-     # 1. Status rows: Where reason is "__all__"
-     # 2. Reason rows: Where reason is specific value (indented)
-     n_dict = n_dict.unique([group_var_name, ds_var_name, dist_reason_var_name, "__id__"])
-
-     # Filter out redundant nested rows (e.g., "Completed" under "Completed")
-     n_dict = n_dict.filter(pl.col(dist_reason_var_name) != pl.col(ds_var_name))
-
-     n_final = n_dict.sort("__id__").select(
-         pl.col("__variable__").alias("__index__"),
-         pl.col(group_var_name).cast(pl.String).alias("__group__"),
-         pl.col("n_pct_subj_fmt").cast(pl.String).alias("__value__"),
-     )
-
-     return pl.concat([n_pop, n_final])
-
-
- def disposition_df(ard: pl.DataFrame) -> pl.DataFrame:
-     """
-     Transform ARD to display format.
-     """
-     # Pivot
-     # Pivot from long to wide format
-     df_wide = ard.pivot(index="__index__", on="__group__", values="__value__")
-
-     # Rename __index__ to display column name
-     df_wide = df_wide.rename({"__index__": "Term"}).select(pl.col("Term"), pl.exclude("Term"))
-
-     return df_wide
-
-
- def disposition_rtf(
-     df: pl.DataFrame,
-     title: list[str],
-     footnote: list[str] | None,
-     source: list[str] | None,
-     col_rel_width: list[float] | None = None,
- ) -> RTFDocument:
-     """
-     Generate RTF.
-     """
-     # Reuse generic table creation
-     # Columns: Disposition Status, Group 1, Group 2, ... Total
-
-     n_cols = len(df.columns)
-     col_header_1 = [""] + list(df.columns[1:])
-     col_header_2 = [""] + ["n (%)"] * (n_cols - 1)
-
-     if col_rel_width is None:
-         col_widths = [2.5] + [1] * (n_cols - 1)
-     else:
-         col_widths = col_rel_width
-
-     return create_rtf_table_n_pct(
-         df=df,
-         col_header_1=col_header_1,
-         col_header_2=col_header_2,
-         col_widths=col_widths,
-         title=title,
-         footnote=footnote,
-         source=source,
-     )
+ # pyre-strict
+ """
+ Disposition Table 1.1 Analysis Functions
+
+ This module provides a pipeline for Disposition Table 1.1 summary analysis:
+ - disposition_ard: Generate Analysis Results Data (ARD)
+ - disposition_df: Transform ARD to display format
+ - disposition_rtf: Generate formatted RTF output
+ - disposition: Complete pipeline wrapper
+ - study_plan_to_disposition_summary: Batch generation from StudyPlan
+ """
+
+ from pathlib import Path
+
+ import polars as pl
+ from rtflite import RTFDocument
+
+ from ..common.count import count_subject, count_subject_with_observation
+ from ..common.parse import StudyPlanParser
+ from ..common.plan import StudyPlan
+ from ..common.rtf import create_rtf_table_n_pct
+ from ..common.utils import apply_common_filters
+
+
+ def study_plan_to_disposition_summary(
+     study_plan: StudyPlan,
+ ) -> list[str]:
+     """
+     Generate Disposition Summary Table outputs for all analyses defined in StudyPlan.
+     """
+     # Meta data
+     analysis_type = "disposition_summary"
+     output_dir = study_plan.output_dir
+     title = "Disposition of Participants"
+     footnote = ["Percentages are based on the number of enrolled participants."]
+     source = None
+
+     population_df_name = "adsl"
+
+     id = ("USUBJID", "Subject ID")
+     ds_term = ("EOSSTT", "Disposition Status")
+     dist_reason_term = ("DCSREAS", "Discontinued Reason")
+
+     total = True
+     missing_group = "error"
+
+     # Create output directory
+     Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+     # Initialize parser
+     parser = StudyPlanParser(study_plan)
+
+     # Get expanded plan DataFrame
+     plan_df = study_plan.get_plan_df()
+
+     # Filter for disposition analyses
+     disp_plans = plan_df.filter(pl.col("analysis") == analysis_type)
+
+     rtf_files = []
+
+     for row in disp_plans.iter_rows(named=True):
+         population = row["population"]
+         group = row.get("group")
+         title_text = title
+
+         # Get datasets
+         (population_df,) = parser.get_datasets(population_df_name)
+
+         # Get filters
+         population_filter = parser.get_population_filter(population)
+
+         # Get group info (optional)
+         if group is not None:
+             group_var_name, group_labels = parser.get_group_info(group)
+             group_var_label = group_labels[0] if group_labels else group_var_name
+             group_tuple = (group_var_name, group_var_label)
+         else:
+             # When no group specified, use a dummy group column for overall counts
+             group_tuple = None
+
+         # Build title
+         title_parts = [title_text]
+         pop_kw = study_plan.keywords.populations.get(population)
+         if pop_kw and pop_kw.label:
+             title_parts.append(pop_kw.label)
+
+         # Build output filename
+         group_suffix = f"_{group}" if group else ""
+         filename = f"{analysis_type}_{population}{group_suffix}.rtf"
+         output_file = str(Path(output_dir) / filename)
+
+         rtf_path = disposition(
+             population=population_df,
+             population_filter=population_filter,
+             id=id,
+             group=group_tuple,
+             ds_term=ds_term,
+             dist_reason_term=dist_reason_term,
+             title=title_parts,
+             footnote=footnote,
+             source=source,
+             output_file=output_file,
+             total=total,
+             missing_group=missing_group,
+         )
+         rtf_files.append(rtf_path)
+
+     return rtf_files
+
+
+ def disposition(
+     population: pl.DataFrame,
+     population_filter: str | None,
+     id: tuple[str, str],
+     group: tuple[str, str] | None,
+     ds_term: tuple[str, str],
+     dist_reason_term: tuple[str, str],
+     title: list[str],
+     footnote: list[str] | None,
+     source: list[str] | None,
+     output_file: str,
+     total: bool = True,
+     col_rel_width: list[float] | None = None,
+     missing_group: str = "error",
+ ) -> str:
+     """
+     Complete Disposition Summary Table pipeline wrapper.
+     """
+     # Step 1: Generate ARD
+     ard = disposition_ard(
+         population=population,
+         population_filter=population_filter,
+         id=id,
+         group=group,
+         ds_term=ds_term,
+         dist_reason_term=dist_reason_term,
+         total=total,
+         missing_group=missing_group,
+     )
+
+     # Step 2: Transform to display format
+     df = disposition_df(ard)
+
+     # Step 3: Generate RTF
+     rtf_doc = disposition_rtf(
+         df=df,
+         title=title,
+         footnote=footnote,
+         source=source,
+         col_rel_width=col_rel_width,
+     )
+     rtf_doc.write_rtf(output_file)
+
+     return output_file
+
+
+ def _validate_disposition_data(df: pl.DataFrame, ds_var: str, reason_var: str) -> None:
+     """
+     Validate disposition data integrity.
+
+     Rules:
+     1. ds_var must be {Completed, Ongoing, Discontinued} and non-null.
+     2. If ds_var is Completed/Ongoing, reason_var must be the same as ds_var or null.
+     3. If ds_var is Discontinued, reason_var must be non-null and not Completed/Ongoing.
+     """
+     # Rule 1: Valid Statuses
+     valid_statuses = ["Completed", "Ongoing", "Discontinued"]
+     if df[ds_var].is_null().any():
+         raise ValueError(f"Found null values in disposition status column '{ds_var}'")
+
+     invalid_status = df.filter(~pl.col(ds_var).is_in(valid_statuses))
+     if not invalid_status.is_empty():
+         bad_values = invalid_status[ds_var].unique().to_list()
+         raise ValueError(
+             f"Invalid disposition statuses found: {bad_values}. Must be one of {valid_statuses}"
+         )
+
+     # Rule 2: Completed/Ongoing implies Reason is Null OR equal to Status
+     inconsistent_completed = df.filter(
+         (pl.col(ds_var).is_in(["Completed", "Ongoing"]))
+         & (~pl.col(reason_var).is_null())
+         & (pl.col(reason_var) != pl.col(ds_var))
+     )
+     if not inconsistent_completed.is_empty():
+         raise ValueError(
+             f"Found subjects with status 'Completed' or 'Ongoing' with mismatched "
+             f"discontinuation reason in '{reason_var}'. Reason must be Null or match Status."
+         )
+
+     # Rule 3: Discontinued implies Reason is NOT Null AND NOT {Completed, Ongoing}
+     invalid_discontinued = df.filter(
+         (pl.col(ds_var) == "Discontinued")
+         & ((pl.col(reason_var).is_null()) | (pl.col(reason_var).is_in(["Completed", "Ongoing"])))
+     )
+     if not invalid_discontinued.is_empty():
+         raise ValueError(
+             f"Found subjects with status 'Discontinued' but missing or invalid "
+             f"discontinuation reason in '{reason_var}'"
+         )
+
+
+ def disposition_ard(
+     population: pl.DataFrame,
+     population_filter: str | None,
+     id: tuple[str, str],
+     group: tuple[str, str] | None,
+     ds_term: tuple[str, str],
+     dist_reason_term: tuple[str, str],
+     total: bool,
+     missing_group: str,
+     pop_var_name: str = "Enrolled",
+ ) -> pl.DataFrame:
+     """
+     Generate ARD for Summary Table.
+     """
+     # Unpack variables
+     ds_var_name, _ = ds_term
+     dist_reason_var_name, _ = dist_reason_term
+     id_var_name, _ = id
+
+     # Validate Data
+     _validate_disposition_data(population, ds_var_name, dist_reason_var_name)
+
+     # Apply common filters
+     population_filtered, _ = apply_common_filters(
+         population=population,
+         observation=None,
+         population_filter=population_filter,
+         observation_filter=None,
+     )
+
+     if group:
+         group_var_name, _ = group
+     else:
+         # Create dummy group for overall analysis
+         group_var_name = "Overall"
+         total = False
+         population_filtered = population_filtered.with_columns(
+             pl.lit("Overall").alias(group_var_name)
+         )
+
+     # Enrolled Subjects
+     n_pop_counts = count_subject(
+         population=population_filtered,
+         id=id_var_name,
+         group=group_var_name,
+         total=total,
+         missing_group=missing_group,
+     )
+
+     n_pop = n_pop_counts.select(
+         pl.lit(pop_var_name).alias("__index__"),
+         pl.col(group_var_name).cast(pl.String).alias("__group__"),
+         pl.col("n_subj_pop").cast(pl.String).alias("__value__"),
+     )
+
+     # Hierarchical Counts for Status and Reason
+     # Level 1: Status (Completed, Ongoing, Discontinued)
+     # Level 2: Status + Reason (Only relevant for Discontinued)
+     n_dict = count_subject_with_observation(
+         population=population_filtered,
+         observation=population_filtered,
+         id=id_var_name,
+         group=group_var_name,
+         variable=[ds_var_name, dist_reason_var_name],
+         total=total,
+         missing_group=missing_group,
+     )
+
+     # Filter and format
+     # Identify rows:
+     # 1. Status rows: Where reason is "__all__"
+     # 2. Reason rows: Where reason is specific value (indented)
+     n_dict = n_dict.unique([group_var_name, ds_var_name, dist_reason_var_name, "__id__"])
+
+     # Filter out redundant nested rows (e.g., "Completed" under "Completed")
+     n_dict = n_dict.filter(pl.col(dist_reason_var_name) != pl.col(ds_var_name))
+
+     n_final = n_dict.sort("__id__").select(
+         pl.col("__variable__").alias("__index__"),
+         pl.col(group_var_name).cast(pl.String).alias("__group__"),
+         pl.col("n_pct_subj_fmt").cast(pl.String).alias("__value__"),
+     )
+
+     return pl.concat([n_pop, n_final])
+
+
+ def disposition_df(ard: pl.DataFrame) -> pl.DataFrame:
+     """
+     Transform ARD to display format.
+     """
+     # Pivot
+     # Pivot from long to wide format
+     df_wide = ard.pivot(index="__index__", on="__group__", values="__value__")
+
+     # Rename __index__ to display column name
+     df_wide = df_wide.rename({"__index__": "Term"}).select(pl.col("Term"), pl.exclude("Term"))
+
+     return df_wide
+
+
+ def disposition_rtf(
+     df: pl.DataFrame,
+     title: list[str],
+     footnote: list[str] | None,
+     source: list[str] | None,
+     col_rel_width: list[float] | None = None,
+ ) -> RTFDocument:
+     """
+     Generate RTF.
+     """
+     # Reuse generic table creation
+     # Columns: Disposition Status, Group 1, Group 2, ... Total
+
+     n_cols = len(df.columns)
+     col_header_1 = [""] + list(df.columns[1:])
+     col_header_2 = [""] + ["n (%)"] * (n_cols - 1)
+
+     if col_rel_width is None:
+         col_widths = [2.5] + [1] * (n_cols - 1)
+     else:
+         col_widths = col_rel_width
+
+     return create_rtf_table_n_pct(
+         df=df,
+         col_header_1=col_header_1,
+         col_header_2=col_header_2,
+         col_widths=col_widths,
+         title=title,
+         footnote=footnote,
+         source=source,
+     )
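
For orientation, here is a minimal usage sketch of the disposition pipeline defined in this file. It is not part of the package diff: the import path and the toy ADSL-style data are assumptions, and only the function signature and validation rules shown above are relied on.

# Hypothetical usage sketch; not part of csrlite's published content.
import polars as pl

from csrlite.disposition import disposition  # assumed import path; not shown in this diff

# Toy ADSL-style data that satisfies _validate_disposition_data:
# EOSSTT is one of {Completed, Ongoing, Discontinued} and non-null;
# DCSREAS is null unless the subject discontinued.
adsl = pl.DataFrame(
    {
        "USUBJID": ["01-001", "01-002", "01-003", "01-004"],
        "TRT01P": ["Placebo", "Placebo", "Drug A", "Drug A"],
        "EOSSTT": ["Completed", "Discontinued", "Ongoing", "Discontinued"],
        "DCSREAS": [None, "Adverse Event", None, "Withdrawal by Subject"],
    }
)

rtf_path = disposition(
    population=adsl,
    population_filter=None,  # or a filter string understood by apply_common_filters
    id=("USUBJID", "Subject ID"),
    group=("TRT01P", "Planned Treatment"),
    ds_term=("EOSSTT", "Disposition Status"),
    dist_reason_term=("DCSREAS", "Discontinued Reason"),
    title=["Disposition of Participants"],
    footnote=["Percentages are based on the number of enrolled participants."],
    source=None,
    output_file="disposition_summary.rtf",
)
print(rtf_path)  # path of the written RTF file

Internally this runs disposition_ard, disposition_df, and disposition_rtf in sequence and writes the result via rtflite's RTFDocument.write_rtf, returning the output path.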