csrlite-0.2.1-py3-none-any.whl → csrlite-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,209 @@
+ # pyre-strict
+ """
+ Medical History (MH) Listing Analysis Functions
+ """
+
+ from pathlib import Path
+
+ import polars as pl
+
+ from ..common.parse import StudyPlanParser
+ from ..common.plan import StudyPlan
+ from ..common.rtf import create_rtf_listing
+ from ..common.utils import apply_common_filters
+
+
+ def mh_listing(
+     population: pl.DataFrame,
+     observation: pl.DataFrame,
+     population_filter: str | None = "SAFFL = 'Y'",
+     observation_filter: str | None = "MHOCCUR = 'Y'",
+     id: tuple[str, str] = ("USUBJID", "Subject ID"),
+     title: list[str] | None = None,
+     footnote: list[str] | None = None,
+     source: list[str] | None = None,
+     output_file: str = "mh_listing.rtf",
+     population_columns: list[tuple[str, str]] | None = None,
+     observation_columns: list[tuple[str, str]] | None = None,
+     sort_columns: list[str] | None = None,
+ ) -> str:
+     """
+     Generate Medical History Listing.
+     """
+     if title is None:
+         title = ["Listing of Medical History"]
+
+     # Generate DF
+     df = mh_listing_df(
+         population=population,
+         observation=observation,
+         population_filter=population_filter,
+         observation_filter=observation_filter,
+         id_col=id[0],
+         pop_cols=population_columns,
+         obs_cols=observation_columns,
+         sort_cols=sort_columns,
+     )
+
+     # Generate RTF
+     mh_listing_rtf(df=df, output_path=output_file, title=title, footnote=footnote, source=source)
+
+     return output_file
+
+
+ def mh_listing_df(
+     population: pl.DataFrame,
+     observation: pl.DataFrame,
+     population_filter: str | None,
+     observation_filter: str | None,
+     id_col: str,
+     pop_cols: list[tuple[str, str]] | None,
+     obs_cols: list[tuple[str, str]] | None,
+     sort_cols: list[str] | None,
+ ) -> pl.DataFrame:
+     # Defaults
+     if pop_cols is None:
+         # Default columns from ADSL
+         pop_cols = [("TRT01A", "Treatment"), ("AGE", "Age"), ("SEX", "Sex")]
+
+     if obs_cols is None:
+         # Default columns from ADMH
+         obs_cols = [
+             ("MHSEQ", "Seq"),
+             ("MHBODSYS", "System Organ Class"),
+             ("MHDECOD", "Preferred Term"),
+             ("MHSTDTC", "Start Date"),
+             ("MHENRTPT", "Status"),
+         ]
+
+     # Apply filters
+     adsl, adq = apply_common_filters(
+         population=population,
+         observation=observation,
+         population_filter=population_filter,
+         observation_filter=observation_filter,
+     )
+
+     if adq is None:
+         raise ValueError("Observation data is missing")
+
+     # Join
+     # Select the requested columns from ADSL
+     pop_col_names = [c[0] for c in pop_cols]
+     # Ensure the ID column is included
+     if id_col not in pop_col_names:
+         pop_col_names = [id_col] + pop_col_names
+
+     adsl_sub = adsl.select(pop_col_names)
+
+     joined = adq.join(adsl_sub, on=id_col, how="inner")
+
+     # Sort
+     if sort_cols:
+         # Only sort by columns that exist in the joined data
+         valid_sorts = [c for c in sort_cols if c in joined.columns]
+         if valid_sorts:
+             joined = joined.sort(valid_sorts)
+
+     # Select display columns (id + pop + obs)
+     display_cols = [id_col] + [c[0] for c in pop_cols if c[0] != id_col] + [c[0] for c in obs_cols]
+     final_df = joined.select([c for c in display_cols if c in joined.columns])
+
+     # Columns keep their raw names here; create_rtf_listing takes a col_header
+     # list, so a label mapping could be applied at the RTF stage if needed.
+     # (mh_listing_rtf currently uses the raw column names as headers.)
+
+     return final_df
+
+
+ def mh_listing_rtf(
+     df: pl.DataFrame,
+     output_path: str,
+     title: list[str] | str,
+     footnote: list[str] | None,
+     source: list[str] | None,
+ ) -> None:
+     if df.is_empty():
+         return
+
+     # mh_listing_df does not return display headers, so the raw column names
+     # are used as headers here for simplicity; a predefined label mapping
+     # could be passed instead. Column order is assumed to be preserved from
+     # mh_listing_df.
+
+     headers = list(df.columns)
+
+     # Approximate weighted widths considered: ID 1, TRT 1.5, AGE 0.5, SEX 0.5,
+     # SEQ 0.5, SOC 2, PT 2, DATE 1, STATUS 1 (about 10 units in total).
+     # A weighted layout would read better, but uniform widths are used here
+     # for simplicity.
+     n_cols = len(headers)
+     col_widths = [1.0] * n_cols
+
+     rtf_doc = create_rtf_listing(
+         df=df,
+         col_header=headers,
+         col_widths=col_widths,
+         title=title,
+         footnote=footnote,
+         source=source,
+     )
+
+     rtf_doc.write_rtf(output_path)
+
+
+ def study_plan_to_mh_listing(study_plan: StudyPlan) -> list[str]:
+     """
+     Batch generate MH listings.
+     """
+     analysis_type = "mh_listing"
+     output_dir = study_plan.output_dir
+
+     parser = StudyPlanParser(study_plan)
+
+     plans = study_plan.study_data.get("plans", [])
+     all_specs = []
+     for plan_data in plans:
+         expanded = study_plan.expander.expand_plan(plan_data)
+         for p in expanded:
+             all_specs.append(study_plan.expander.create_analysis_spec(p))
+
+     plan_df = pl.DataFrame(all_specs)
+
+     if "analysis" in plan_df.columns:
+         mh_plans = plan_df.filter(pl.col("analysis") == analysis_type)
+     else:
+         mh_plans = pl.DataFrame()
+
+     generated_files = []
+
+     for analysis in mh_plans.iter_rows(named=True):
+         pop_name = analysis.get("population", "enrolled")
+
+         try:
+             # Load Population
+             adsl, _ = parser.get_population_data(pop_name, "trt01a")  # placeholder group; not used for listings
+
+             (admh,) = parser.get_datasets("admh")
+
+             filename = f"{analysis_type}_{pop_name}.rtf".lower()
+             output_path = f"{output_dir}/{filename}"
+             Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+             mh_listing(
+                 population=adsl,
+                 observation=admh,
+                 population_filter=None,
+                 observation_filter=None,  # no filter: list all medical history records
+                 output_file=output_path,
+                 title=["Listing of Medical History", f"({pop_name} Population)"],
+                 source=["Source: ADSL, ADMH"],
+             )
+
+             generated_files.append(output_path)
+
+         except Exception as e:
+             print(f"Error generating MH listing: {e}")
+             continue
+
+     return generated_files
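
Taken together, mh_listing filters ADSL and ADMH, joins them on the subject ID, and writes an RTF listing via create_rtf_listing. A minimal usage sketch on hypothetical data follows; the import path is an assumption, since the diff does not show where the module lives inside the package.

    import polars as pl

    from csrlite.mh_listing import mh_listing  # hypothetical import path

    # Hypothetical ADSL / ADMH extracts (CDISC ADaM-style column names)
    adsl = pl.DataFrame({
        "USUBJID": ["01-001", "01-002"],
        "TRT01A": ["Drug A", "Placebo"],
        "AGE": [54, 61],
        "SEX": ["F", "M"],
        "SAFFL": ["Y", "Y"],
    })
    admh = pl.DataFrame({
        "USUBJID": ["01-001", "01-001", "01-002"],
        "MHSEQ": [1, 2, 1],
        "MHBODSYS": ["Cardiac disorders", "Nervous system disorders", "Vascular disorders"],
        "MHDECOD": ["Atrial fibrillation", "Migraine", "Hypertension"],
        "MHSTDTC": ["2020-01-15", "2019-06-02", "2018-11-30"],
        "MHENRTPT": ["ONGOING", "ONGOING", "ONGOING"],
        "MHOCCUR": ["Y", "Y", "Y"],
    })

    # Writes mh_listing.rtf, sorted by subject and sequence number
    mh_listing(
        population=adsl,
        observation=admh,
        sort_columns=["USUBJID", "MHSEQ"],
    )
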
@@ -0,0 +1,333 @@
+ # pyre-strict
+ """
+ Medical History (MH) Summary Analysis Functions
+ """
+
+ from pathlib import Path
+ from typing import Any
+
+ import polars as pl
+
+ from ..common.parse import StudyPlanParser
+ from ..common.plan import StudyPlan
+ from ..common.rtf import create_rtf_table_n_pct
+ from ..common.utils import apply_common_filters
+
+
+ def mh_summary(
+     population: pl.DataFrame,
+     observation: pl.DataFrame,
+     population_filter: str | None = "SAFFL = 'Y'",
+     observation_filter: str | None = "MHOCCUR = 'Y'",
+     id: tuple[str, str] = ("USUBJID", "Subject ID"),
+     group: tuple[str, str] = ("TRT01A", "Treatment"),
+     variables: list[tuple[str, str]] | None = None,
+     title: list[str] | None = None,
+     footnote: list[str] | None = None,
+     source: list[str] | None = None,
+     output_file: str = "mh_summary.rtf",
+ ) -> str:
+     """
+     Generate Medical History Summary Table.
+     """
+     if title is None:
+         title = ["Summary of Medical History by Body System and Preferred Term"]
+
+     if variables is None:
+         # Default hierarchy: Body System -> Preferred Term
+         variables = [("MHBODSYS", "System Organ Class"), ("MHDECOD", "Preferred Term")]
+
+     # Generate ARD
+     ard = mh_summary_ard(
+         population=population,
+         observation=observation,
+         population_filter=population_filter,
+         observation_filter=observation_filter,
+         group_col=group[0],
+         id_col=id[0],
+         variables=variables,
+     )
+
+     # Transform to Display DF
+     df = mh_summary_df(ard)
+
+     # Generate RTF
+     mh_summary_rtf(df=df, output_path=output_file, title=title, footnote=footnote, source=source)
+
+     return output_file
+
+
+ def mh_summary_ard(
+     population: pl.DataFrame,
+     observation: pl.DataFrame,
+     population_filter: str | None,
+     observation_filter: str | None,
+     group_col: str,
+     id_col: str,
+     variables: list[tuple[str, str]],
+ ) -> pl.DataFrame:
+     """
+     Generate ARD for MH Summary.
+     Hierarchy is Body System -> Preferred Term.
+     """
+
+     # Apply filters
+     adsl, adq = apply_common_filters(
+         population=population,
+         observation=observation,
+         population_filter=population_filter,
+         observation_filter=observation_filter,
+     )
+
+     if adq is None:
+         # Should not happen, since an observation DataFrame was provided
+         raise ValueError("Observation data is missing")
+
+     # The summary nests MHDECOD (Preferred Term) under MHBODSYS (Body System).
+     # Structure:
+     #   Any Medical History (all records)
+     #     Body System 1
+     #       Term A
+     #       Term B
+
+     # count_subject_with_observation handles a flat list of conditions, so the
+     # nested structure is built manually here rather than by reusing it.
+
+     # A standard two-level nesting (MHBODSYS -> MHDECOD) is assumed;
+     # `variables` is expected to follow this pattern.
+
+     # In the standard usage, variables=[("MHBODSYS", "SOC"), ("MHDECOD", "PT")];
+     # the hierarchy columns below are referenced directly by name.
+
+     # Build a list of specs: (filter expression, label, indent level, is_header)
+
+     specs: list[dict[str, Any]] = []
+
+     # 1. Overall "Any Medical History"
+     specs.append(
+         {"filter": pl.lit(True), "label": "Any Medical History", "indent": 0, "is_header": False}
+     )
+
+     # Get distinct Body Systems
+     bodsys_list: list[str | None] = (
+         adq.select("MHBODSYS").unique().sort("MHBODSYS").to_series().to_list()
+     )
+
+     for sys in bodsys_list:
+         if sys is None:
+             continue
+
+         # Add Body System row
+         specs.append(
+             {
+                 "filter": pl.col("MHBODSYS") == sys,
+                 "label": sys,
+                 "indent": 1,
+                 "is_header": False,  # body-system rows also carry counts
+             }
+         )
+
+         # Get distinct Terms within this Body System
+         terms: list[str | None] = (
+             adq.filter(pl.col("MHBODSYS") == sys)
+             .select("MHDECOD")
+             .unique()
+             .sort("MHDECOD")
+             .to_series()
+             .to_list()
+         )
+
+         for term in terms:
+             if term is None:
+                 continue
+             specs.append(
+                 {
+                     "filter": (pl.col("MHBODSYS") == sys) & (pl.col("MHDECOD") == term),
+                     "label": term,
+                     "indent": 2,
+                     "is_header": False,
+                 }
+             )
+
+     # Now calculate counts for each spec
+     results: list[dict[str, Any]] = []
+
+     # Get total population counts by group
+     pop_counts = adsl.group_by(group_col).count().sort(group_col)
+     groups: list[Any] = pop_counts.select(group_col).to_series().to_list()
+     # Pre-calculate the denominator (total N) per group
+     pop_totals: dict[Any, int] = {
+         row[group_col]: row["count"] for row in pop_counts.iter_rows(named=True)
+     }
+
+     # Helper to calculate one result row
+     def calc_row(
+         spec: dict[str, Any], obs_data: pl.DataFrame, pop_data: pl.DataFrame
+     ) -> dict[str, Any]:
+         row_res = {"label": spec["label"], "indent": spec["indent"], "is_header": spec["is_header"]}
+
+         # Counting follows the same pattern as count_subject_with_observation:
+         # filter the observations, join to the population, then count
+         # distinct subjects per group.
+
+         # 1. Filter the observation data on the spec's criteria
+         filtered_obs = obs_data.filter(spec["filter"])
+
+         # 2. Inner join with the (already filtered) population to attach the
+         #    group column and keep only subjects in the analysis population
+
+         subset = filtered_obs.join(pop_data.select([id_col, group_col]), on=id_col, how="inner")
+
+         # 3. Count distinct subjects per group
+         counts = subset.select(id_col, group_col).unique().group_by(group_col).count()
+         counts_map = {row[group_col]: row["count"] for row in counts.iter_rows(named=True)}
+
+         for g in groups:
+             n = counts_map.get(g, 0)
+             denom = pop_totals.get(g, 0)
+             pct = (n / denom * 100.0) if denom > 0 else 0.0
+             row_res[f"count_{g}"] = n
+             row_res[f"pct_{g}"] = pct
+
+         return row_res
+
+     for spec in specs:
+         results.append(calc_row(spec, adq, adsl))
+
+     return pl.DataFrame(results)
+
+
+ def mh_summary_df(ard: pl.DataFrame) -> pl.DataFrame:
+     """
+     Transform ARD to Display DataFrame.
+     """
+     if ard.is_empty():
+         return pl.DataFrame()
+
+     # Identify group columns
+     cols = ard.columns
+     group_cols = [c for c in cols if c.startswith("count_")]
+     groups = [c.replace("count_", "") for c in group_cols]
+
+     select_exprs = [
+         (pl.lit(" ").repeat_by(pl.col("indent")).list.join("") + pl.col("label")).alias(
+             "Medical History"
+         )
+     ]
+
+     for g in groups:
+         col_n = pl.col(f"count_{g}")
+         col_pct = pl.col(f"pct_{g}")
+
+         fmt = (
+             col_n.cast(pl.Utf8)
+             + " ("
+             + col_pct.map_elements(lambda x: f"{x:.1f}", return_dtype=pl.Utf8)
+             + ")"
+         ).alias(g)
+
+         select_exprs.append(fmt)
+
+     return ard.select(select_exprs)
+
+
+ def mh_summary_rtf(
+     df: pl.DataFrame,
+     output_path: str,
+     title: list[str] | str,
+     footnote: list[str] | None,
+     source: list[str] | None,
+ ) -> None:
+     """
+     Generate RTF document.
+     """
+     if df.is_empty():
+         # Nothing to render for an empty table
+         return
+
+     n_cols = len(df.columns)
+     col_width_first = 2.5
+     remaining_width = 7.0  # approximate usable page width
+     col_width_others = remaining_width / max(n_cols - 1, 1)
+     col_widths = [col_width_first] + [col_width_others] * (n_cols - 1)
+
+     col_header_1 = list(df.columns)
+     col_header_2 = [""] + ["n (%)"] * (n_cols - 1)
+
+     rtf_doc = create_rtf_table_n_pct(
+         df=df,
+         col_header_1=col_header_1,
+         col_header_2=col_header_2,
+         col_widths=col_widths,
+         title=title,
+         footnote=footnote,
+         source=source,
+     )
+
+     rtf_doc.write_rtf(output_path)
+
+
+ def study_plan_to_mh_summary(study_plan: StudyPlan) -> list[str]:
+     """
+     Batch generate MH summaries from study plan.
+     """
+     analysis_type = "mh_summary"
+     output_dir = study_plan.output_dir
+
+     # Initialize parser
+     parser = StudyPlanParser(study_plan)
+
+     # Expand the study plan into individual analysis specs
+     plans = study_plan.study_data.get("plans", [])
+     all_specs = []
+     for plan_data in plans:
+         expanded = study_plan.expander.expand_plan(plan_data)
+         for p in expanded:
+             all_specs.append(study_plan.expander.create_analysis_spec(p))
+
+     plan_df = pl.DataFrame(all_specs)
+
+     if "analysis" in plan_df.columns:
+         mh_plans = plan_df.filter(pl.col("analysis") == analysis_type)
+     else:
+         mh_plans = pl.DataFrame()
+
+     generated_files = []
+
+     for analysis in mh_plans.iter_rows(named=True):
+         pop_name = analysis.get("population", "enrolled")
+         group_kw = analysis.get("group", "trt01a")  # group keyword from the plan
+
+         try:
+             # Load population data and resolve the group column
+             adsl, group_col = parser.get_population_data(pop_name, group_kw)
+
+             # Load MH data
+             # Note: assumes 'admh' is the dataset name in the study plan
+             (admh,) = parser.get_datasets("admh")
+
+             filename = f"{analysis_type}_{pop_name}_{group_kw}.rtf".lower()
+             output_path = f"{output_dir}/{filename}"
+             Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+             mh_summary(
+                 population=adsl,
+                 observation=admh,
+                 population_filter=None,  # already filtered by the parser
+                 observation_filter="MHOCCUR = 'Y'",
+                 group=(group_col, group_col),  # use the resolved column name
+                 output_file=output_path,
+                 title=[
+                     "Summary of Medical History by System Organ Class and Preferred Term",
+                     f"({pop_name} Population)",
+                 ],
+                 source=["Source: ADSL, ADMH"],
+             )
+
+             generated_files.append(output_path)
+
+         except Exception as e:
+             print(f"Error generating MH summary: {e}")
+             continue
+
+     return generated_files
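
The summary path works the same way end to end: mh_summary_ard counts unique subjects per group, mh_summary_df formats the counts as n (%), and mh_summary_rtf writes the table. A minimal usage sketch on hypothetical data, again with an assumed import path; with the default filters, subjects are restricted to SAFFL = 'Y' and records to MHOCCUR = 'Y', and the output has one "Any Medical History" row followed by body-system rows with nested preferred-term rows.

    import polars as pl

    from csrlite.mh_summary import mh_summary  # hypothetical import path

    # Hypothetical ADSL / ADMH extracts
    adsl = pl.DataFrame({
        "USUBJID": ["01-001", "01-002", "01-003"],
        "TRT01A": ["Drug A", "Drug A", "Placebo"],
        "SAFFL": ["Y", "Y", "Y"],
    })
    admh = pl.DataFrame({
        "USUBJID": ["01-001", "01-003"],
        "MHBODSYS": ["Cardiac disorders", "Vascular disorders"],
        "MHDECOD": ["Atrial fibrillation", "Hypertension"],
        "MHOCCUR": ["Y", "Y"],
    })

    # Writes mh_summary.rtf; cells read like "1 (50.0)" per treatment group
    mh_summary(population=adsl, observation=admh, output_file="mh_summary.rtf")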