cdiscbuilder 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
cdisc_builder/cli.py ADDED
@@ -0,0 +1,42 @@
1
+ import os
2
+ import argparse
3
+ from .odm import parse_odm_to_long_df
4
+ from .sdtm import create_sdtm_datasets
5
+
6
def main():
    """Run the ODM XML -> long CSV -> SDTM datasets pipeline from the command line.

    Command-line options:
        --xml      Path to the input ODM XML file (required).
        --csv      Path of the intermediate long CSV (default ``odm_long.csv``).
        --configs  SDTM configuration directory (default: the ``specs``
                   directory next to this module).
        --output   Output directory for SDTM datasets (default ``sdtm_output``).

    Returns:
        int: 0 when both steps succeed, 1 when either step fails, so that a
        caller such as ``sys.exit(main())`` reports the failure to the shell
        instead of exiting with status 0.
    """
    import sys  # local import: only needed to route error messages to stderr

    parser = argparse.ArgumentParser(description="Convert ODM XML to SDTM Datasets")

    # Default config path lives inside the package, next to this module.
    default_config_path = os.path.join(os.path.dirname(__file__), "specs")

    parser.add_argument("--xml", required=True, help="Path to input ODM XML file")
    parser.add_argument("--csv", default="odm_long.csv", help="Path to intermediate long CSV file")
    parser.add_argument("--configs", default=default_config_path, help="Path to SDTM configuration directory")
    parser.add_argument("--output", default="sdtm_output", help="Path to output SDTM directory")

    args = parser.parse_args()

    # Step 1: ODM XML -> Long CSV
    print(f"--- Step 1: Parsing ODM XML from {args.xml} ---")
    try:
        df = parse_odm_to_long_df(args.xml)
        print(f"Parsed {len(df)} rows.")
        df.to_csv(args.csv, index=False)
        print(f"Saved intermediate data to {args.csv}")
    except Exception as e:
        # Top-level CLI boundary: report and signal failure via the exit status.
        print(f"Error parsing XML: {e}", file=sys.stderr)
        return 1

    # Step 2: Long CSV -> SDTM Datasets
    print(f"\n--- Step 2: Generating SDTM Datasets using configs from {args.configs} ---")
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(args.output, exist_ok=True)

    try:
        create_sdtm_datasets(args.configs, args.csv, args.output)
        print(f"\nSuccess! SDTM datasets created in {args.output}")
    except Exception as e:
        print(f"Error creating SDTM datasets: {e}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
File without changes
File without changes
@@ -0,0 +1,267 @@
1
+ import pandas as pd
2
+ import yaml
3
+ import os
4
+
5
+ from ..functions import extract_value
6
+
7
# Lookup table from function names (as strings) to the callables they refer to.
FUNCTION_MAP = dict(extract_value=extract_value)
11
+
12
class FindingsProcessor:
    """Build findings-style domain frames from the long-format ODM table.

    Only source blocks whose ``columns`` entry is a list containing at least
    one item with a ``function`` key are processed; every other block is
    skipped. For each processed block, columns declared with
    ``function: func_fa`` are extracted with ``extract_value`` and split into:

    * backbone columns (names ending in ``TESTCD``/``OBJ`` or containing one
      of the result markers), which are outer-merged on the join keys and
      determine the rows of the output, and
    * attribute columns, which are left-joined onto the backbone.

    Remaining configured columns are then filled from ``source``, ``literal``,
    key columns, the domain name or the test-code metadata, and every
    configured column is coerced to its declared ``type``.
    """

    # Target columns that carry the ItemOID itself rather than the item Value.
    _TESTCD_COLUMNS = ('FATESTCD', 'LBTESTCD', 'VSTESTCD', 'QSTESTCD')
    # Substrings that mark a target column as result-like (treated as backbone).
    _RESULT_MARKERS = ('ORRES', 'STRES', 'STAT', 'REASND', 'TERM', 'DECOD', 'VISIT', 'DTC')
    # Target columns that default to a key column when nothing else is configured.
    _KEY_FALLBACKS = {
        'STUDYID': 'StudyOID',
        'USUBJID': 'SubjectKey',
        'FASEQ': 'ItemGroupRepeatKey',
    }

    def __init__(self, metadata_path):
        """Load the test-code metadata mapping from ``metadata_path``.

        A missing file only produces a warning and leaves ``self.metadata``
        empty. An empty YAML document also yields an empty mapping, because
        ``yaml.safe_load`` returns ``None`` for it and the later ``.get``
        lookups need a dict.
        """
        self.metadata = {}
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r', encoding='utf-8') as f:
                self.metadata = yaml.safe_load(f) or {}
        else:
            print(f"Warning: Metadata file not found at {metadata_path}")

    def process(self, domain_name, sources, df_long, default_keys):
        """Return one DataFrame per processable source block.

        Args:
            domain_name: Domain code, used to fill a ``DOMAIN`` column.
            sources: Iterable of source-block settings (dicts).
            df_long: Long-format frame; ``FormOID`` is added to the join keys
                when that column is present.
            default_keys: Key columns used when a block has no ``keys`` entry.

        Returns:
            list: DataFrames, each with ``attrs['labels']`` mapping column
            names to the labels given in the configuration.
        """
        domain_dfs = []

        for settings in sources:
            columns_cfg = settings.get('columns', [])
            is_functional = isinstance(columns_cfg, list) and any('function' in c for c in columns_cfg)
            if not is_functional:
                continue

            keys = settings.get('keys', default_keys)

            # Joining on FormOID as well keeps records of different forms that
            # happen to share the same key values from being merged together.
            join_keys = list(keys)
            if 'FormOID' not in join_keys and 'FormOID' in df_long.columns:
                join_keys.append('FormOID')

            primary_df, attribute_dfs = self._extract_columns(columns_cfg, df_long, join_keys)
            if primary_df.empty:
                # Without backbone rows there is nothing to emit for this block.
                continue

            final_df = primary_df
            for att_df in attribute_dfs:
                final_df = pd.merge(final_df, att_df, on=join_keys, how='left')

            for col_def in columns_cfg:
                self._populate_column(final_df, col_def, domain_name, keys)

            # Store variable labels in df.attrs["labels"].
            final_df.attrs['labels'] = {
                col['name']: col.get('label', '') for col in columns_cfg if 'label' in col
            }
            domain_dfs.append(final_df)

        return domain_dfs

    def _extract_columns(self, columns_cfg, df_long, join_keys):
        """Extract every ``func_fa`` column; return (backbone frame, attribute frames)."""
        primary_df = pd.DataFrame()
        attribute_dfs = []

        for col_def in columns_cfg:
            # Columns without a function, or with any other function name,
            # are left for the populate step.
            if col_def.get('function') != 'func_fa':
                continue

            name = col_def.get('name')
            item_oids = col_def.get('itemoid')
            form_oids = col_def.get('formoid')
            if not isinstance(form_oids, list):
                form_oids = [form_oids] if form_oids else []

            return_col = 'ItemOID' if name in self._TESTCD_COLUMNS else 'Value'
            df_res = extract_value(df_long, form_oids, item_oids, return_col=return_col, keys=join_keys)
            if df_res.empty:
                print(f"Warning: No data for {name}")
                continue

            df_res = df_res.rename(columns={return_col: name})

            if self._is_backbone(name):
                if primary_df.empty:
                    primary_df = df_res
                else:
                    # NOTE(review): backbone columns are aligned on the join keys
                    # only, not on ItemOID. When several items share one key
                    # combination this looks like it pairs every value with
                    # every other value -- confirm this is intended.
                    primary_df = pd.merge(primary_df, df_res, on=join_keys, how='outer')
            else:
                attribute_dfs.append(df_res)

        return primary_df, attribute_dfs

    @classmethod
    def _is_backbone(cls, name):
        """Tell whether a target column name is treated as a row-defining column."""
        is_result = any(marker in name for marker in cls._RESULT_MARKERS)
        return name.endswith('TESTCD') or is_result or name.endswith('OBJ')

    def _populate_column(self, final_df, col_def, domain_name, keys):
        """Fill one configured column in place (if needed) and enforce its type."""
        name = col_def.get('name')
        col_type = col_def.get('type', 'str')
        source = col_def.get('source')

        if name in final_df.columns and not source:
            # Already populated by func_fa and no override requested.
            pass
        elif source and source in final_df.columns:
            final_df[name] = final_df[source]
        else:
            final_df[name] = self._fallback_series(final_df, col_def, domain_name, keys)

        # Type enforcement applies to every configured column.
        if col_type == 'int':
            final_df[name] = pd.to_numeric(final_df[name], errors='coerce').astype('Int64')
        elif col_type == 'float':
            final_df[name] = pd.to_numeric(final_df[name], errors='coerce')
        else:
            # The dict form replaces with None in a single pass and does not
            # depend on how a pandas version treats an explicit value=None.
            final_df[name] = final_df[name].astype(str).replace({'nan': None, 'None': None})

    def _fallback_series(self, final_df, col_def, domain_name, keys):
        """Return the values for a column that was not extracted or sourced.

        Always returns a Series so that the caller can create the column; a
        key-derived column whose key is missing from the frame becomes empty.
        """
        name = col_def.get('name')
        literal = col_def.get('literal')

        # "is not None" keeps falsy literals such as 0 or "".
        if literal is not None:
            return pd.Series([literal] * len(final_df), index=final_df.index)

        key_col = self._KEY_FALLBACKS.get(name)
        if key_col in keys and key_col in final_df.columns:
            return final_df[key_col]

        if name == 'DOMAIN':
            return pd.Series([domain_name] * len(final_df), index=final_df.index)

        # Metadata lookup keyed by the test code.
        if name == 'FATEST' and 'FATESTCD' in final_df.columns:
            return final_df['FATESTCD'].apply(lambda code: self.metadata.get(code, {}).get('test', code))
        if name == 'FAOBJ' and 'FATESTCD' in final_df.columns:
            return final_df['FATESTCD'].apply(lambda code: self.metadata.get(code, {}).get('obj', None))

        # Fallback: empty column.
        return pd.Series([None] * len(final_df), index=final_df.index)
@@ -0,0 +1,111 @@
1
+ import pandas as pd
2
+
3
class GeneralProcessor:
    """Pivot one form's long-format records wide and map them to target columns."""

    def process(self, domain_name, sources, df_long, default_keys):
        """Return one mapped DataFrame per usable source block.

        For each block the rows of ``df_long`` whose ``FormOID`` equals the
        block's ``formoid`` are pivoted (index = keys, columns = ``ItemOID``,
        values = ``Value``, first value wins) and each entry of the block's
        ``columns`` mapping becomes one output column.

        Blocks are skipped, with a printed message where applicable, when
        they have no ``formoid``, match no rows, fail to filter or pivot, or
        when ``columns`` is not a mapping.

        Args:
            domain_name: Domain code, used in messages only.
            sources: Iterable of source-block settings (dicts).
            df_long: Long-format frame with FormOID / ItemOID / Value columns.
            default_keys: Pivot index columns when a block has no ``keys``.

        Returns:
            list: One DataFrame per successfully processed block.
        """
        domain_dfs = []

        for settings in sources:
            # 1. Filter by FormOID
            form_oid = settings.get('formoid')
            if not form_oid:
                print(f"Warning: No formoid specified for a block in {domain_name}")
                continue

            try:
                source_df = df_long[df_long['FormOID'] == form_oid].copy()
            except Exception as e:
                print(f"Error filtering for {domain_name} (FormOID={form_oid}): {e}")
                continue

            if source_df.empty:
                continue

            # 2. Key columns for pivoting (block keys or defaults)
            keys = settings.get('keys', default_keys)

            # 3. Pivot
            try:
                pivoted = source_df.pivot_table(
                    index=keys,
                    columns='ItemOID',
                    values='Value',
                    aggfunc='first',
                ).reset_index()
            except Exception as e:
                print(f"Error pivoting for {domain_name}: {e}")
                continue

            # 4. Map columns
            mappings = settings.get('columns', {})
            if not isinstance(mappings, dict):
                # A list-style 'columns' cannot be mapped here; skip the block
                # instead of failing on .items().
                print(f"Warning: 'columns' for {domain_name} is not a mapping; skipping block.")
                continue

            final_df = pd.DataFrame()
            for target_col, col_config in mappings.items():
                series, target_type = self._build_series(domain_name, target_col, col_config, pivoted)
                final_df[target_col] = series

                if isinstance(col_config, dict):
                    self._check_missing(domain_name, target_col, series, target_type,
                                        col_config.get('max_missing_pct'))

            domain_dfs.append(final_df)

        return domain_dfs

    @staticmethod
    def _build_series(domain_name, target_col, col_config, pivoted):
        """Return ``(series, target_type)`` for one configured target column."""
        if isinstance(col_config, dict):
            source_expr = col_config.get('source')
            literal_expr = col_config.get('literal')
            target_type = col_config.get('type')
            value_map = col_config.get('value_mapping')
        else:
            # Short form: the config value is the source column name itself.
            source_expr, literal_expr, target_type, value_map = col_config, None, None, None

        # Extract data
        if literal_expr is not None:
            series = pd.Series([literal_expr] * len(pivoted))
        elif source_expr:
            if source_expr in pivoted.columns:
                series = pivoted[source_expr].copy()
            else:
                print(f"Warning: Source column '{source_expr}' not found for '{domain_name}.{target_col}'. Filling with NaN.")
                series = pd.Series([None] * len(pivoted))
        else:
            print(f"Warning: No source or literal defined for '{domain_name}.{target_col}'. Filling with NaN.")
            series = pd.Series([None] * len(pivoted))

        # Apply value mapping
        if value_map:
            series = series.replace(value_map)

        # Apply type conversion; a failed conversion keeps the unconverted values.
        if target_type:
            try:
                if target_type == 'int':
                    series = pd.to_numeric(series, errors='coerce').astype('Int64')
                elif target_type == 'float':
                    series = pd.to_numeric(series, errors='coerce')
                elif target_type == 'str':
                    series = series.astype(str)
                elif target_type == 'bool':
                    # NOTE(review): astype(bool) treats every non-empty string
                    # as True; map values via value_mapping first if needed.
                    series = series.astype(bool)
            except Exception as e:
                print(f"Error converting {target_col} to {target_type}: {e}")

        return series, target_type

    @staticmethod
    def _check_missing(domain_name, target_col, series, target_type, max_missing):
        """Print a validation warning when the missing share exceeds ``max_missing``."""
        if max_missing is None:
            return

        missing_count = series.isna().sum()
        if target_type == 'str':
            # After astype(str), missing values show up as these strings.
            missing_count += series.isin(['nan', 'None']).sum()

        total = len(series)
        if total > 0:
            pct = (missing_count / total) * 100
            if pct > max_missing:
                print(f"WARNING: [Validation] {domain_name}.{target_col} missing {pct:.2f}% (Limit: {max_missing})")
@@ -0,0 +1,57 @@
1
+ import yaml
2
+ import os
3
+
4
+ from .validate import load_schema, validate_domain_config
5
+
6
def load_config(config_dir):
    """Load and merge all YAML configuration files found in ``config_dir``.

    Files ending in ``.yaml``/``.yml`` are read in sorted filename order, so
    that overrides between files are deterministic. ``schema.yaml`` is
    skipped. ``defaults.yaml`` and any top-level ``defaults`` key are merged
    into ``config['defaults']``; every other top-level key is treated as a
    domain, validated against the schema (a failure only prints a warning),
    and stored in ``config['domains']``.

    Files that fail to parse, are empty, or do not contain a mapping at the
    top level are reported (empty files silently) and skipped.

    Args:
        config_dir: Directory containing the configuration files.

    Returns:
        dict: ``{'domains': {...}, 'defaults': {...}}``; both empty when
        ``config_dir`` is not an existing directory.
    """
    config = {
        'domains': {},
        'defaults': {},
    }

    # Nothing to load (and nothing to validate) without a directory.
    if not os.path.isdir(config_dir):
        return config

    schema = load_schema()

    for filename in sorted(os.listdir(config_dir)):
        if not filename.endswith((".yaml", ".yml")):
            continue

        # Skip the schema itself if present in the same directory.
        if filename == "schema.yaml":
            continue

        file_path = os.path.join(config_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            try:
                data = yaml.safe_load(f)
            except yaml.YAMLError as exc:
                print(f"Error parsing YAML file {filename}: {exc}")
                continue

        if not isinstance(data, dict):
            # safe_load returns None for an empty document.
            if data is not None:
                print(f"Warning: {filename} does not contain a mapping at the top level. Skipping.")
            continue

        if filename == 'defaults.yaml':
            # Defaults file - flat dict
            config['defaults'].update(data)
            continue

        for key, value in data.items():
            if key == 'defaults':
                # Fallback if someone put defaults: inside another file
                config['defaults'].update(value or {})
                continue

            # It's a domain: validate, but keep it even when validation fails.
            if schema and not validate_domain_config(key, value, schema):
                print(f"Warning: {filename} failed schema validation. Proceeding with caution.")
            config['domains'][key] = value

    return config
@@ -0,0 +1,49 @@
1
+ import pandas as pd
2
+
3
def extract_value(df_long, form_oids, item_oids, return_col='Value', keys=None):
    """Select key columns plus one data column for the given forms and items.

    Args:
        df_long: Source long-format dataframe with ``FormOID`` and ``ItemOID``
            columns.
        form_oids: FormOID or list of FormOIDs to keep. A falsy value matches
            no form.
        item_oids: ItemOID or list of ItemOIDs to keep. A falsy value disables
            item filtering.
        return_col: Column to return as data, e.g. ``'Value'`` (default) or
            ``'ItemOID'``.
        keys: Key columns to include in the result.

    Returns:
        DataFrame with the requested key columns followed by ``return_col``
        (not renamed), keeping the original row index. Requested columns that
        are absent from ``df_long`` are left out, and a column named both in
        ``keys`` and as ``return_col`` appears only once. An empty, column-less
        DataFrame is returned when no row matches.
    """
    # 1. Normalize inputs
    if not isinstance(form_oids, list):
        form_oids = [form_oids] if form_oids else []
    if isinstance(item_oids, str):
        item_oids = [item_oids]

    # 2. Filter forms
    subset = df_long[df_long['FormOID'].isin(form_oids)]
    if subset.empty:
        return pd.DataFrame()

    # 3. Filter items
    if item_oids:
        subset = subset[subset['ItemOID'].isin(item_oids)]
        if subset.empty:
            return pd.DataFrame()

    # 4. Select columns: keys first, then the data column. dict.fromkeys drops
    #    repeats while keeping order, so the result never has duplicate labels.
    wanted = list(dict.fromkeys([*(keys or []), return_col]))
    available = [col for col in wanted if col in subset.columns]

    return subset[available].copy()
@@ -0,0 +1,46 @@
1
+ import pandas as pd
2
+ import os
3
+ from .classes.general import GeneralProcessor
4
+ from .classes.findings import FindingsProcessor
5
+
6
def process_domain(domain_name, sources, df_long, default_keys, output_dir):
    """Build one domain from its source blocks and write it as a Parquet file.

    The processor is chosen from the ``type`` of the first block only
    (``'FINDINGS'`` selects ``FindingsProcessor``, anything else
    ``GeneralProcessor``), so all blocks of a domain are assumed to share one
    type. The frames returned by the processor are concatenated and saved as
    ``<output_dir>/<domain_name>.parquet``.

    Args:
        domain_name: Domain code; also the output file stem.
        sources: A single settings dict or a list of them.
        df_long: Long-format source dataframe passed to the processor.
        default_keys: Key columns for blocks that do not define ``keys``.
        output_dir: Directory for the Parquet file; created when missing.

    Returns:
        None. Prints a warning and writes nothing when there is no
        configuration or no resulting data.
    """
    # Normalize to list
    if isinstance(sources, dict):
        sources = [sources]

    if not sources:
        print(f"Warning: No configuration found for {domain_name}")
        return

    # Determine type from the first block
    domain_type = sources[0].get('type', "GENERAL")

    if domain_type == 'FINDINGS':
        # Metadata is looked up two directory levels above this file.
        package_root = os.path.dirname(os.path.dirname(__file__))
        metadata_path = os.path.join(package_root, "metadata", "test_codes.yaml")
        processor = FindingsProcessor(metadata_path)
    else:
        processor = GeneralProcessor()

    domain_dfs = processor.process(domain_name, sources, df_long, default_keys)

    if not domain_dfs:
        print(f"Warning: No data found for domain {domain_name}")
        return

    # Concatenate all sources for this domain.
    # NOTE(review): per-frame attrs (e.g. 'labels') may not survive pd.concat --
    # confirm whether they are needed downstream.
    combined_df = pd.concat(domain_dfs, ignore_index=True)

    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, f"{domain_name}.parquet")
    combined_df.to_parquet(output_path, index=False)
    print(f"Saved {domain_name} to {output_path} (Shape: {combined_df.shape})")