csrlite 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csrlite/common/parse.py CHANGED
@@ -1,308 +1,308 @@
1
- # pyre-strict
2
- """
3
- StudyPlan Parsing Utilities
4
-
5
- This module provides utilities for parsing and extracting information from StudyPlan objects,
6
- including filter conversion, parameter parsing, and keyword resolution.
7
- """
8
-
9
- import re
10
- from typing import Any
11
-
12
- import polars as pl
13
-
14
- from .plan import StudyPlan
15
-
16
-
17
- def parse_filter_to_sql(filter_str: str) -> str:
18
- """
19
- Parse custom filter syntax to SQL WHERE clause.
20
-
21
- Converts:
22
- - "adsl:saffl == 'Y'" -> "SAFFL = 'Y'"
23
- - "adae:trtemfl == 'Y' and adae:aeser == 'Y'" -> "TRTEMFL = 'Y' AND AESER = 'Y'"
24
- - "adae:aerel in ['A', 'B']" -> "AEREL IN ('A', 'B')"
25
-
26
- Args:
27
- filter_str: Custom filter string with dataset:column format
28
-
29
- Returns:
30
- SQL WHERE clause string
31
- """
32
- if not filter_str or filter_str.strip() == "":
33
- return "1=1" # Always true
34
-
35
- # Remove dataset prefixes (adsl:, adae:)
36
- sql = re.sub(r"\w+:", "", filter_str)
37
-
38
- # Convert Python syntax to SQL
39
- sql = sql.replace("==", "=") # Python equality to SQL
40
- sql = sql.replace(" and ", " AND ") # Python to SQL
41
- sql = sql.replace(" and ", " AND ") # Python to SQL
42
- sql = sql.replace(" or ", " OR ") # Python to SQL
43
- sql = sql.replace(" in ", " IN ") # Python to SQL
44
-
45
- # Convert Python list syntax to SQL IN: ['A', 'B'] -> ('A', 'B')
46
- sql = sql.replace("[", "(").replace("]", ")")
47
-
48
- # Uppercase column names (assuming ADaM standard)
49
- # Match word boundaries before operators
50
- sql = re.sub(
51
- r"\b([a-z]\w*)\b(?=\s*[=<>!]|\s+IN)", lambda m: m.group(1).upper(), sql, flags=re.IGNORECASE
52
- )
53
-
54
- return sql
55
-
56
-
57
- def apply_filter_sql(df: pl.DataFrame, filter_str: str) -> pl.DataFrame:
58
- """
59
- Apply filter using pl.sql_expr() - simpler and faster than SQLContext.
60
-
61
- Args:
62
- df: DataFrame to filter
63
- filter_str: Custom filter string
64
-
65
- Returns:
66
- Filtered DataFrame
67
- """
68
- if not filter_str or filter_str.strip() == "":
69
- return df
70
-
71
- where_clause = parse_filter_to_sql(filter_str)
72
-
73
- try:
74
- # Use pl.sql_expr() - much simpler and faster!
75
- return df.filter(pl.sql_expr(where_clause))
76
- except Exception as e:
77
- # Fallback to manual parsing if SQL fails
78
- print(f"Warning: SQL filter failed ({e}), using fallback method")
79
- return df.filter(_parse_filter_expr(filter_str))
80
-
81
-
82
- def _parse_filter_expr(filter_str: str) -> Any:
83
- """
84
- Fallback filter parser using Polars expressions.
85
- Used if SQL parsing fails.
86
-
87
- Args:
88
- filter_str: Filter string
89
-
90
- Returns:
91
- Polars expression
92
- """
93
- if not filter_str or filter_str.strip() == "":
94
- return pl.lit(True)
95
-
96
- # Remove dataset prefixes
97
- filter_str = re.sub(r"\w+:", "", filter_str)
98
-
99
- # Handle 'in' operator: column in ['A', 'B'] -> pl.col(column).is_in(['A', 'B'])
100
- in_pattern = r"(\w+)\s+in\s+\[([^\]]+)\]"
101
-
102
- def _parse_between(match: re.Match[str]) -> str:
103
- col = match.group(1).upper()
104
- values = match.group(2)
105
- return f"(pl.col('{col}').is_in([{values}]))"
106
-
107
- filter_str = re.sub(in_pattern, _parse_between, filter_str)
108
-
109
- # Handle equality/inequality
110
- eq_pattern = r"(\w+)\s*(==|!=|>|<|>=|<=)\s*'([^']+)'"
111
-
112
- def _parse_like(match: re.Match[str]) -> str:
113
- col = match.group(1).upper()
114
- op = match.group(2)
115
- val = match.group(3)
116
- return f"(pl.col('{col}') {op} '{val}')"
117
-
118
- filter_str = re.sub(eq_pattern, _parse_like, filter_str)
119
-
120
- # Replace 'and'/'or'
121
- filter_str = filter_str.replace(" and ", " & ")
122
- filter_str = filter_str.replace(" or ", " | ")
123
-
124
- return eval(filter_str)
125
-
126
-
127
- def parse_parameter(parameter_str: str) -> list[str]:
128
- """
129
- Parse semicolon-separated parameter string.
130
-
131
- Args:
132
- parameter_str: Single parameter or semicolon-separated (e.g., "any;rel;ser")
133
-
134
- Returns:
135
- List of parameter names
136
- """
137
- if not parameter_str:
138
- return []
139
- if ";" in parameter_str:
140
- return [p.strip() for p in parameter_str.split(";")]
141
- return [parameter_str]
142
-
143
-
144
- class StudyPlanParser:
145
- """
146
- Parser class for extracting and resolving information from StudyPlan objects.
147
-
148
- This class provides methods to extract filters, labels, and other configuration
149
- from StudyPlan keywords and convert them to analysis-ready formats.
150
- """
151
-
152
- def __init__(self, study_plan: StudyPlan) -> None:
153
- """
154
- Initialize parser with a StudyPlan object.
155
-
156
- Args:
157
- study_plan: StudyPlan object with loaded datasets and keywords
158
- """
159
- self.study_plan = study_plan
160
-
161
- def get_population_filter(self, population: str) -> str:
162
- """
163
- Get population filter as SQL WHERE clause.
164
-
165
- Args:
166
- population: Population keyword name
167
-
168
- Returns:
169
- SQL WHERE clause string
170
-
171
- Raises:
172
- ValueError: If population keyword not found
173
- """
174
- pop = self.study_plan.keywords.get_population(population)
175
- if pop is None:
176
- raise ValueError(f"Population '{population}' not found")
177
- return parse_filter_to_sql(pop.filter)
178
-
179
- def get_observation_filter(self, observation: str | None) -> str | None:
180
- """
181
- Get observation filter as SQL WHERE clause.
182
-
183
- Args:
184
- observation: Optional observation keyword name
185
-
186
- Returns:
187
- SQL WHERE clause string or None if observation not specified
188
- """
189
- if not observation:
190
- return None
191
- obs = self.study_plan.keywords.get_observation(observation)
192
- if obs:
193
- return parse_filter_to_sql(obs.filter)
194
- return None
195
-
196
- def get_parameter_info(
197
- self, parameter: str
198
- ) -> tuple[list[str], list[str], list[str], list[int]]:
199
- """
200
- Get parameter names, filters, labels, and indent levels.
201
-
202
- Args:
203
- parameter: Parameter keyword, can be semicolon-separated (e.g., "any;rel;ser")
204
-
205
- Returns:
206
- Tuple of (parameter_names, parameter_filters, parameter_labels, parameter_indents)
207
-
208
- Raises:
209
- ValueError: If any parameter keyword not found
210
- """
211
- param_names = parse_parameter(parameter)
212
- param_labels = []
213
- param_filters = []
214
- param_indents = []
215
-
216
- for param_name in param_names:
217
- param = self.study_plan.keywords.get_parameter(param_name)
218
- if param is None:
219
- raise ValueError(f"Parameter '{param_name}' not found")
220
- param_filters.append(parse_filter_to_sql(param.filter))
221
- param_labels.append(param.label or param_name)
222
- param_indents.append(param.indent)
223
-
224
- return param_names, param_filters, param_labels, param_indents
225
-
226
- def get_single_parameter_info(self, parameter: str) -> tuple[str, str]:
227
- """
228
- Get single parameter filter and label (NOT semicolon-separated).
229
-
230
- Args:
231
- parameter: Single parameter keyword name
232
-
233
- Returns:
234
- Tuple of (parameter_filter, parameter_label)
235
-
236
- Raises:
237
- ValueError: If parameter keyword not found
238
- """
239
- param = self.study_plan.keywords.get_parameter(parameter)
240
- if param is None:
241
- raise ValueError(f"Parameter '{parameter}' not found")
242
- return parse_filter_to_sql(param.filter), param.label or parameter
243
-
244
- def get_group_info(self, group: str) -> tuple[str, list[str]]:
245
- """
246
- Get group variable name and labels.
247
-
248
- Args:
249
- group: Group keyword name
250
-
251
- Returns:
252
- Tuple of (group_variable, group_labels)
253
-
254
- Raises:
255
- ValueError: If group keyword not found
256
- """
257
- grp = self.study_plan.keywords.get_group(group)
258
- if grp is None:
259
- raise ValueError(f"Group '{group}' not found")
260
-
261
- group_var = grp.variable.split(":")[-1].upper()
262
- group_labels = grp.group_label if grp.group_label else []
263
-
264
- return group_var, group_labels
265
-
266
- def get_datasets(self, *dataset_names: str) -> tuple[pl.DataFrame, ...]:
267
- """
268
- Get multiple datasets from StudyPlan.
269
-
270
- Args:
271
- *dataset_names: Names of datasets to retrieve (e.g., "adsl", "adae")
272
-
273
- Returns:
274
- Tuple of DataFrames in the order requested
275
-
276
- Raises:
277
- ValueError: If any dataset not found
278
- """
279
- datasets = []
280
- for name in dataset_names:
281
- ds = self.study_plan.datasets.get(name)
282
- if ds is None:
283
- raise ValueError(f"Dataset '{name}' not found in study plan")
284
- datasets.append(ds)
285
- return tuple(datasets)
286
-
287
- def get_population_data(self, population: str, group: str) -> tuple[pl.DataFrame, str]:
288
- """
289
- Get filtered population dataset and group variable.
290
-
291
- Args:
292
- population: Population keyword name
293
- group: Group keyword name
294
-
295
- Returns:
296
- Tuple of (filtered_adsl, group_variable)
297
- """
298
- # Get ADSL dataset
299
- (adsl,) = self.get_datasets("adsl")
300
-
301
- # Apply population filter
302
- pop_filter = self.get_population_filter(population)
303
- adsl_pop = apply_filter_sql(adsl, pop_filter)
304
-
305
- # Get group variable
306
- group_var, _ = self.get_group_info(group)
307
-
308
- return adsl_pop, group_var
1
+ # pyre-strict
2
+ """
3
+ StudyPlan Parsing Utilities
4
+
5
+ This module provides utilities for parsing and extracting information from StudyPlan objects,
6
+ including filter conversion, parameter parsing, and keyword resolution.
7
+ """
8
+
9
+ import re
10
+ from typing import Any
11
+
12
+ import polars as pl
13
+
14
+ from .plan import StudyPlan
15
+
16
+
17
def parse_filter_to_sql(filter_str: str) -> str:
    """
    Parse custom filter syntax to SQL WHERE clause.

    Converts:
    - "adsl:saffl == 'Y'" -> "SAFFL = 'Y'"
    - "adae:trtemfl == 'Y' and adae:aeser == 'Y'" -> "TRTEMFL = 'Y' AND AESER = 'Y'"
    - "adae:aerel in ['A', 'B']" -> "AEREL IN ('A', 'B')"
    - "adae:aerel not in ['A']" -> "AEREL NOT IN ('A')"

    Quoted string literals are copied through untouched, so values that
    contain ":", "==", " and ", "[" or "]" are not altered by the rewrite.

    Args:
        filter_str: Custom filter string with dataset:column format

    Returns:
        SQL WHERE clause string; "1=1" when the filter is empty or blank
    """
    if not filter_str or not filter_str.strip():
        return "1=1"  # Always true

    def _code_to_sql(code: str) -> str:
        """Rewrite one stretch of filter text that lies outside any quoted literal."""
        # Remove dataset prefixes (adsl:, adae:)
        code = re.sub(r"\w+:", "", code)
        # Python equality to SQL
        code = code.replace("==", "=")
        # Python boolean / membership keywords to their SQL spelling
        code = re.sub(r"\b(and|or|not|in)\b", lambda m: m.group(1).upper(), code)
        # Python list syntax to SQL IN list: [...] -> (...)
        code = code.replace("[", "(").replace("]", ")")
        # Uppercase column names (assuming ADaM standard): any identifier that
        # is directly followed by a comparison operator, IN, or NOT IN.
        return re.sub(
            r"\b([a-z]\w*)\b(?=\s*[=<>!]|\s+(?:NOT\s+)?IN\b)",
            lambda m: m.group(1).upper(),
            code,
            flags=re.IGNORECASE,
        )

    # The capturing group makes re.split keep the literals: even indexes are
    # code, odd indexes are quoted literals ('' inside single quotes is an
    # escaped quote).
    literal_pattern = r"('(?:[^']|'')*'|" + r'"[^"]*")'
    pieces = re.split(literal_pattern, filter_str)

    return "".join(
        piece if index % 2 else _code_to_sql(piece) for index, piece in enumerate(pieces)
    )
55
+
56
+
57
def apply_filter_sql(df: pl.DataFrame, filter_str: str) -> pl.DataFrame:
    """
    Apply filter using pl.sql_expr() - simpler and faster than SQLContext.

    The filter is first converted with parse_filter_to_sql() and evaluated via
    pl.sql_expr(). If that raises, a RuntimeWarning is issued and the filter is
    re-evaluated through _parse_filter_expr() on the original filter string.

    Args:
        df: DataFrame to filter
        filter_str: Custom filter string

    Returns:
        Filtered DataFrame; `df` itself when the filter is empty or blank
    """
    import warnings

    if not filter_str or not filter_str.strip():
        return df

    where_clause = parse_filter_to_sql(filter_str)

    try:
        # Use pl.sql_expr() - much simpler and faster!
        return df.filter(pl.sql_expr(where_clause))
    except Exception as e:
        # Deliberately broad: any failure of the SQL path triggers the
        # best-effort fallback. Reported through the warnings machinery so
        # library diagnostics do not end up on stdout.
        warnings.warn(
            f"SQL filter failed ({e}), using fallback method",
            RuntimeWarning,
            stacklevel=2,
        )
        return df.filter(_parse_filter_expr(filter_str))
80
+
81
+
82
def _parse_filter_expr(filter_str: str) -> Any:
    """
    Fallback filter parser using Polars expressions.
    Used if SQL parsing fails.

    The filter text is rewritten into Python source built from pl.col(...)
    comparisons and is_in(...) calls, joined with & and |, and then evaluated.
    Single-quoted literals are left untouched by the prefix and and/or rewrites.

    Args:
        filter_str: Filter string

    Returns:
        Polars expression; pl.lit(True) when the filter is empty or blank
    """
    if not filter_str or not filter_str.strip():
        return pl.lit(True)

    # The capturing group makes re.split keep the literals: even indexes are
    # code, odd indexes are single-quoted literals.
    literal_pattern = r"('[^']*')"

    # Remove dataset prefixes, but only outside quoted literals
    pieces = re.split(literal_pattern, filter_str)
    pieces[::2] = [re.sub(r"\w+:", "", code) for code in pieces[::2]]
    expr_src = "".join(pieces)

    # Handle 'in' operator: column in ['A', 'B'] -> pl.col(column).is_in(['A', 'B'])
    def _in_to_polars(match: re.Match[str]) -> str:
        col = match.group(1).upper()
        values = match.group(2)
        return f"(pl.col('{col}').is_in([{values}]))"

    expr_src = re.sub(r"(\w+)\s+in\s+\[([^\]]+)\]", _in_to_polars, expr_src)

    # Handle equality/inequality against a quoted value. Two-character
    # operators are listed first, and an empty literal ('') is accepted.
    def _comparison_to_polars(match: re.Match[str]) -> str:
        col = match.group(1).upper()
        op = match.group(2)
        val = match.group(3)
        return f"(pl.col('{col}') {op} '{val}')"

    expr_src = re.sub(
        r"(\w+)\s*(==|!=|>=|<=|>|<)\s*'([^']*)'", _comparison_to_polars, expr_src
    )

    # Replace 'and'/'or' with the Polars operators, again outside literals only
    pieces = re.split(literal_pattern, expr_src)
    pieces[::2] = [
        code.replace(" and ", " & ").replace(" or ", " | ") for code in pieces[::2]
    ]
    expr_src = "".join(pieces)

    # NOTE(security): eval() executes the generated source, so filter strings
    # must come from trusted input. Builtins are withheld and only `pl` is
    # exposed to narrow what the expression can reach; this is not a sandbox.
    return eval(expr_src, {"__builtins__": {}}, {"pl": pl})
125
+
126
+
127
def parse_parameter(parameter_str: str) -> list[str]:
    """
    Parse semicolon-separated parameter string.

    Every name is stripped of surrounding whitespace, whether or not the input
    contains a semicolon, and empty segments (from a trailing or doubled
    semicolon) are dropped.

    Args:
        parameter_str: Single parameter or semicolon-separated (e.g., "any;rel;ser")

    Returns:
        List of parameter names; empty list for an empty or blank input
    """
    if not parameter_str:
        return []
    return [name for part in parameter_str.split(";") if (name := part.strip())]
142
+
143
+
144
class StudyPlanParser:
    """
    Resolve StudyPlan keywords into analysis-ready pieces.

    Wraps a StudyPlan and exposes lookups that turn its population,
    observation, parameter and group keywords into SQL filter strings,
    labels, variable names and datasets.
    """

    def __init__(self, study_plan: StudyPlan) -> None:
        """
        Store the StudyPlan that all lookups are resolved against.

        Args:
            study_plan: StudyPlan object with loaded datasets and keywords
        """
        self.study_plan = study_plan

    def get_population_filter(self, population: str) -> str:
        """
        Resolve a population keyword to its SQL WHERE clause.

        Args:
            population: Population keyword name

        Returns:
            SQL WHERE clause string

        Raises:
            ValueError: If population keyword not found
        """
        keyword = self.study_plan.keywords.get_population(population)
        if keyword is not None:
            return parse_filter_to_sql(keyword.filter)
        raise ValueError(f"Population '{population}' not found")

    def get_observation_filter(self, observation: str | None) -> str | None:
        """
        Resolve an optional observation keyword to its SQL WHERE clause.

        Args:
            observation: Optional observation keyword name

        Returns:
            SQL WHERE clause string, or None when no observation is given or
            the keyword lookup yields nothing
        """
        if observation:
            keyword = self.study_plan.keywords.get_observation(observation)
            if keyword:
                return parse_filter_to_sql(keyword.filter)
        return None

    def get_parameter_info(
        self, parameter: str
    ) -> tuple[list[str], list[str], list[str], list[int]]:
        """
        Resolve one or more parameter keywords.

        Args:
            parameter: Parameter keyword, can be semicolon-separated (e.g., "any;rel;ser")

        Returns:
            Tuple of (parameter_names, parameter_filters, parameter_labels, parameter_indents),
            with the three derived lists in the same order as the names

        Raises:
            ValueError: If any parameter keyword not found
        """
        names = parse_parameter(parameter)
        filters: list[str] = []
        labels: list[str] = []
        indents: list[int] = []

        for name in names:
            keyword = self.study_plan.keywords.get_parameter(name)
            if keyword is None:
                raise ValueError(f"Parameter '{name}' not found")
            filters.append(parse_filter_to_sql(keyword.filter))
            # Fall back to the keyword name when no label is defined
            labels.append(keyword.label or name)
            indents.append(keyword.indent)

        return names, filters, labels, indents

    def get_single_parameter_info(self, parameter: str) -> tuple[str, str]:
        """
        Resolve exactly one parameter keyword (NOT semicolon-separated).

        Args:
            parameter: Single parameter keyword name

        Returns:
            Tuple of (parameter_filter, parameter_label)

        Raises:
            ValueError: If parameter keyword not found
        """
        keyword = self.study_plan.keywords.get_parameter(parameter)
        if keyword is None:
            raise ValueError(f"Parameter '{parameter}' not found")
        sql_filter = parse_filter_to_sql(keyword.filter)
        label = keyword.label or parameter
        return sql_filter, label

    def get_group_info(self, group: str) -> tuple[str, list[str]]:
        """
        Resolve a group keyword to its variable name and labels.

        Args:
            group: Group keyword name

        Returns:
            Tuple of (group_variable, group_labels); the variable is the part
            after the last ":" upper-cased, and the labels default to []

        Raises:
            ValueError: If group keyword not found
        """
        keyword = self.study_plan.keywords.get_group(group)
        if keyword is None:
            raise ValueError(f"Group '{group}' not found")

        # "dataset:column" -> "COLUMN"
        variable = keyword.variable.split(":")[-1].upper()
        return variable, keyword.group_label or []

    def get_datasets(self, *dataset_names: str) -> tuple[pl.DataFrame, ...]:
        """
        Fetch several datasets from the StudyPlan at once.

        Args:
            *dataset_names: Names of datasets to retrieve (e.g., "adsl", "adae")

        Returns:
            Tuple of DataFrames in the order requested

        Raises:
            ValueError: If any dataset not found
        """
        available = self.study_plan.datasets
        found = []
        for name in dataset_names:
            frame = available.get(name)
            if frame is None:
                raise ValueError(f"Dataset '{name}' not found in study plan")
            found.append(frame)
        return tuple(found)

    def get_population_data(self, population: str, group: str) -> tuple[pl.DataFrame, str]:
        """
        Build the filtered ADSL population together with its group variable.

        Args:
            population: Population keyword name
            group: Group keyword name

        Returns:
            Tuple of (filtered_adsl, group_variable)
        """
        # ADSL is the dataset the population filter is applied to
        (adsl,) = self.get_datasets("adsl")

        # Restrict ADSL to the requested population
        filtered_adsl = apply_filter_sql(adsl, self.get_population_filter(population))

        # Only the variable name is needed here; the labels are discarded
        group_variable, _ = self.get_group_info(group)

        return filtered_adsl, group_variable