PyPI - cdiscbuilder - Versions diffs - 0.1.1__py3-none-any.whl - Mend

cdiscbuilder 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

cdisc_builder/__init__.py +0 -0
cdisc_builder/cli.py +42 -0
cdisc_builder/engine/__init__.py +0 -0
cdisc_builder/engine/classes/__init__.py +0 -0
cdisc_builder/engine/classes/findings.py +267 -0
cdisc_builder/engine/classes/general.py +111 -0
cdisc_builder/engine/config.py +57 -0
cdisc_builder/engine/functions.py +49 -0
cdisc_builder/engine/processor.py +46 -0
cdisc_builder/engine/validate.py +99 -0
cdisc_builder/metadata/test_codes.yaml +33 -0
cdisc_builder/odm.py +82 -0
cdisc_builder/sdtm.py +29 -0
cdisc_builder/specs/AE.yaml +39 -0
cdisc_builder/specs/CM.yaml +55 -0
cdisc_builder/specs/DM.yaml +36 -0
cdisc_builder/specs/DS.yaml +41 -0
cdisc_builder/specs/FA.yaml +129 -0
cdisc_builder/specs/PE.yaml +24 -0
cdisc_builder/specs/SV.yaml +42 -0
cdisc_builder/specs/VS.yaml +74 -0
cdisc_builder/specs/defaults.yaml +1 -0
cdisc_builder/specs/schema.yaml +47 -0
cdiscbuilder-0.1.1.dist-info/METADATA +10 -0
cdiscbuilder-0.1.1.dist-info/RECORD +29 -0
cdiscbuilder-0.1.1.dist-info/WHEEL +5 -0
cdiscbuilder-0.1.1.dist-info/entry_points.txt +2 -0
cdiscbuilder-0.1.1.dist-info/licenses/LICENSE +21 -0
cdiscbuilder-0.1.1.dist-info/top_level.txt +1 -0

cdisc_builder/__init__.py ADDED Viewed

File without changes

cdisc_builder/cli.py ADDED Viewed

@@ -0,0 +1,42 @@
+import os
+import argparse
+from .odm import parse_odm_to_long_df
+from .sdtm import create_sdtm_datasets
+def main(argv=None):
+    """Run the two-step ODM XML -> long CSV -> SDTM conversion.
+
+    Args:
+        argv: Optional list of command-line argument strings. ``None`` (the
+            default) lets argparse read ``sys.argv[1:]``, exactly as before.
+
+    Returns:
+        int: ``0`` when both steps succeed, ``1`` when either step fails, so
+        a caller can hand the value to ``sys.exit`` and scripts can detect
+        a failed conversion.
+    """
+    parser = argparse.ArgumentParser(description="Convert ODM XML to SDTM Datasets")
+    # The default spec directory is the "specs" folder next to this module.
+    default_config_path = os.path.join(os.path.dirname(__file__), "specs")
+    parser.add_argument("--xml", required=True, help="Path to input ODM XML file")
+    parser.add_argument("--csv", default="odm_long.csv", help="Path to intermediate long CSV file")
+    parser.add_argument("--configs", default=default_config_path, help="Path to SDTM configuration directory")
+    parser.add_argument("--output", default="sdtm_output", help="Path to output SDTM directory")
+    args = parser.parse_args(argv)
+    # Step 1: ODM XML -> Long CSV
+    print(f"--- Step 1: Parsing ODM XML from {args.xml} ---")
+    try:
+        df = parse_odm_to_long_df(args.xml)
+        print(f"Parsed {len(df)} rows.")
+        df.to_csv(args.csv, index=False)
+        print(f"Saved intermediate data to {args.csv}")
+    except Exception as e:
+        # Top-level CLI boundary: report the problem and signal failure
+        # through the return value instead of ending with status 0.
+        print(f"Error parsing XML: {e}")
+        return 1
+    # Step 2: Long CSV -> SDTM Datasets
+    print(f"\n--- Step 2: Generating SDTM Datasets using configs from {args.configs} ---")
+    try:
+        # exist_ok avoids the check-then-create race; keeping the call inside
+        # the try means an unusable output path is reported like any other
+        # step-2 failure.
+        os.makedirs(args.output, exist_ok=True)
+        create_sdtm_datasets(args.configs, args.csv, args.output)
+        print(f"\nSuccess! SDTM datasets created in {args.output}")
+    except Exception as e:
+        print(f"Error creating SDTM datasets: {e}")
+        return 1
+    return 0
+if __name__ == "__main__":
+    # Propagate main()'s result as the process exit status.
+    raise SystemExit(main())

cdisc_builder/engine/__init__.py ADDED Viewed

File without changes

cdisc_builder/engine/classes/__init__.py ADDED Viewed

File without changes

cdisc_builder/engine/classes/findings.py ADDED Viewed

@@ -0,0 +1,267 @@
+import pandas as pd
+import yaml
+import os
+from ..functions import extract_value
+# Map string names to actual functions.
+# NOTE(review): nothing in the visible code looks names up in this map;
+# FindingsProcessor.process calls extract_value directly when a column's
+# 'function' is 'func_fa'. Confirm whether the map is still needed.
+FUNCTION_MAP = {
+    'extract_value': extract_value
+}
+class FindingsProcessor:
+    """Build Findings-domain frames from column specs that name a function.
+
+    For each source block, columns whose 'function' is 'func_fa' are extracted
+    with extract_value and split into "backbone" frames (which define the
+    rows) and "attribute" frames (left-joined afterwards). The remaining
+    columns are then filled from a source column, a literal, the key columns
+    or the test-code metadata loaded in __init__.
+    """
+    def __init__(self, metadata_path):
+        """Load the test-code metadata YAML, or warn and keep an empty dict.
+
+        Args:
+            metadata_path: Path of the YAML file used by the FATEST / FAOBJ
+                lookups in process().
+        """
+        self.metadata = {}
+        if os.path.exists(metadata_path):
+            with open(metadata_path, 'r') as f:
+                # NOTE(review): yaml.safe_load yields None for an empty file,
+                # which would break the self.metadata.get(...) lookups in
+                # process() - confirm the file is never empty.
+                self.metadata = yaml.safe_load(f)
+        else:
+            print(f"Warning: Metadata file not found at {metadata_path}")
+    def process(self, domain_name, sources, df_long, default_keys):
+        """Return one DataFrame per functional source block that has backbone data.
+
+        Args:
+            domain_name: Domain code; used to fill a DOMAIN column.
+            sources: Iterable of source-block dicts ('columns', optional 'keys').
+            df_long: Long-format frame that extract_value filters by FormOID
+                and ItemOID.
+            default_keys: Key column list used when a block has no 'keys' entry.
+
+        Returns:
+            list: The assembled frames, each with its variable labels stored in
+            ``attrs['labels']``. Blocks that are not functional, or that yield
+            no backbone data, contribute nothing.
+        """
+        domain_dfs = []
+        for settings in sources:
+            # A block is "functional" when 'columns' is a list with at least
+            # one entry that has a 'function' key. Anything else (for example
+            # a dict-style 'columns') falls to the else branch at the bottom
+            # and is skipped.
+            columns_cfg = settings.get('columns', [])
+            is_functional = isinstance(columns_cfg, list) and any('function' in c for c in columns_cfg)
+            if is_functional:
+                # --- Functional Strategy ---
+                # Extract one frame per 'func_fa' column, combine the backbone
+                # frames into primary_df, then left-join the attribute frames.
+                config_keys = settings.get('keys', default_keys)
+                keys = config_keys # Alias for compatibility
+                # FormOID is added to the join keys (on a copy, so the
+                # configured list is not mutated) to keep records of different
+                # forms that share the other key values from being joined.
+                join_keys = config_keys.copy()
+                if 'FormOID' not in join_keys and 'FormOID' in df_long.columns:
+                     join_keys.append('FormOID')
+                # Design intent (from the original notes): a backbone
+                # extraction over several items yields several rows per key,
+                # one per ItemOID, while a single-item attribute such as a
+                # date yields one. Left-joining attributes onto the backbone
+                # on the keys therefore repeats the attribute on every
+                # backbone row of that record.
+                primary_df = pd.DataFrame()
+                attribute_dfs = []
+                for col_def in columns_cfg:
+                    name = col_def.get('name')
+                    func_name = col_def.get('function')
+                    if not func_name:
+                        # Columns without a function are filled in the second
+                        # pass further down.
+                        continue
+                    # NOTE(review): only 'func_fa' is handled; a column with
+                    # any other function name is ignored in this loop.
+                    if func_name == 'func_fa':
+                        form_oids = col_def.get('formoid')
+                        item_oids = col_def.get('itemoid')
+                        if name == 'VISIT':
+                             pass # Removed Debug
+                        # Normalize to list
+                        if not isinstance(form_oids, list):
+                            form_oids = [form_oids] if form_oids else []
+                        # NOTE(review): this col_type is not read before the
+                        # second pass assigns it again.
+                        col_type = col_def.get('type', 'str')
+                        # Test-code columns take the ItemOID itself as their
+                        # value; every other column takes the 'Value' column.
+                        return_col = 'Value'
+                        if name in ['FATESTCD', 'LBTESTCD', 'VSTESTCD', 'QSTESTCD']:
+                            return_col = 'ItemOID'
+                        # Call function
+                        df_res = extract_value(df_long, form_oids, item_oids, return_col=return_col, keys=join_keys)
+                        # Removed Debug
+                        if df_res.empty:
+                            print(f"Warning: No data for {name}")
+                            continue
+                        # The result holds the key columns plus return_col;
+                        # rename the latter to the target variable name.
+                        df_res = df_res.rename(columns={return_col: name})
+                        # Backbone heuristic, based on substrings/suffixes of
+                        # the variable name. VISIT and DTC names count as
+                        # "result" here, so such columns are merged into the
+                        # backbone rather than left-joined as attributes.
+                        # NOTE(review): a column spec without 'name' would make
+                        # the `in name` tests raise TypeError.
+                        is_result = 'ORRES' in name or 'STRES' in name or 'STAT' in name or 'REASND' in name or 'TERM' in name or 'DECOD' in name or 'VISIT' in name or 'DTC' in name
+                        is_backbone_col = name.endswith('TESTCD') or is_result or name.endswith('OBJ')
+                        is_list = isinstance(item_oids, list) and len(item_oids) > 1 and is_backbone_col
+                        if is_list:
+                            # Multi-item backbone column. A second extraction
+                            # that keeps ItemOID as a key is prepared so values
+                            # could be aligned with the item they came from.
+                            # NOTE(review): df_res_with_id is built but never
+                            # merged or appended below; df_res (without
+                            # ItemOID) is what gets used, so backbone columns
+                            # are paired by an outer join on join_keys only.
+                            # Confirm whether df_res_with_id was meant to be
+                            # used instead.
+                            df_res_with_id = extract_value(df_long, form_oids, item_oids, return_col=return_col, keys=join_keys + ['ItemOID'])
+                            # Drop the duplicate column that appears when
+                            # return_col is itself one of the requested keys.
+                            df_res_with_id = df_res_with_id.loc[:, ~df_res_with_id.columns.duplicated()]
+                            if return_col in df_res_with_id.columns:
+                                df_res_with_id[name] = df_res_with_id[return_col]
+                                if return_col != 'ItemOID' and return_col != name:
+                                     # 'Value' has been copied to the target
+                                     # column; ItemOID is kept.
+                                     df_res_with_id = df_res_with_id.drop(columns=[return_col])
+                            else:
+                                # Should not happen if func_fa works
+                                print(f"Warning: return column {return_col} missing in func_fa result for {name}")
+                            # is_list already implies is_backbone_col, so the
+                            # else branch below cannot run.
+                            if is_backbone_col:
+                                if primary_df.empty:
+                                    primary_df = df_res
+                                else:
+                                    # Merge logic
+                                    primary_df = pd.merge(primary_df, df_res, on=join_keys, how='outer')
+                            else:
+                                 attribute_dfs.append(df_res)
+                        else:
+                            # Single-item or non-backbone column: df_res was
+                            # already computed and renamed above.
+                            if is_backbone_col:
+                                if primary_df.empty:
+                                    primary_df = df_res
+                                else:
+                                    primary_df = pd.merge(primary_df, df_res, on=join_keys, how='outer')
+                            else:
+                                attribute_dfs.append(df_res)
+                # If we built a primary DF, merge attributes. With no backbone
+                # data the block produces nothing and any attribute frames
+                # are discarded.
+                if not primary_df.empty:
+                    final_df = primary_df
+                    for att_df in attribute_dfs:
+                        # Join on keys only
+                        final_df = pd.merge(final_df, att_df, on=join_keys, how='left')
+                    # Second pass over every configured column: fill what is
+                    # still missing, then enforce the declared type.
+                    for col_def in columns_cfg:
+                        name = col_def.get('name')
+                        col_type = col_def.get('type', 'str')
+                        source = col_def.get('source')
+                        if name in final_df.columns and not source:
+                            # Already populated by func_fa and no override requested
+                            pass
+                        elif source and source in final_df.columns:
+                             # Overwrite or Populate from Source
+                             final_df[name] = final_df[source]
+                        else:
+                            # Populate missing columns
+                            series = None
+                            literal = col_def.get('literal')
+                            # NOTE(review): truthiness test - a literal of 0
+                            # or '' is treated as if no literal were given.
+                            if literal:
+                                 series = pd.Series([literal] * len(final_df), index=final_df.index)
+                            # Defaults Logic (Legacy / Backup): copy key
+                            # columns into the standard identifier variables.
+                            # NOTE(review): when the key is configured but its
+                            # column is absent from final_df, series stays
+                            # None and the column may never be created, in
+                            # which case the type enforcement below would
+                            # raise KeyError - confirm this cannot occur.
+                            elif name == 'STUDYID' and 'StudyOID' in keys:
+                                 if 'StudyOID' in final_df.columns:
+                                     series = final_df['StudyOID']
+                            elif name == 'USUBJID' and 'SubjectKey' in keys:
+                                 if 'SubjectKey' in final_df.columns:
+                                     series = final_df['SubjectKey']
+                            elif name == 'FASEQ' and 'ItemGroupRepeatKey' in keys:
+                                 if 'ItemGroupRepeatKey' in final_df.columns:
+                                     series = final_df['ItemGroupRepeatKey']
+                            elif name == 'DOMAIN':
+                                 series = pd.Series([domain_name] * len(final_df), index=final_df.index)
+                            # Metadata Lookup: FATEST falls back to the code
+                            # itself, FAOBJ to None, when the code is unknown.
+                            elif name == 'FATEST' and 'FATESTCD' in final_df.columns:
+                                 series = final_df['FATESTCD'].apply(lambda x: self.metadata.get(x, {}).get('test', x))
+                            elif name == 'FAOBJ' and 'FATESTCD' in final_df.columns:
+                                 series = final_df['FATESTCD'].apply(lambda x: self.metadata.get(x, {}).get('obj', None))
+                            else:
+                                # Fallback: Empty
+                                series = pd.Series([None] * len(final_df), index=final_df.index)
+                            if series is not None:
+                                final_df[name] = series
+                        # Type Enforcement (Apply to ALL columns, whether func_fa or fallback)
+                        if col_type == 'int':
+                             # Unparseable values are coerced to missing; the
+                             # nullable Int64 dtype can hold them.
+                             final_df[name] = pd.to_numeric(final_df[name], errors='coerce').astype('Int64')
+                        elif col_type == 'float':
+                             final_df[name] = pd.to_numeric(final_df[name], errors='coerce')
+                        else:
+                             # String cleanup: stringify, then map the texts
+                             # 'nan' / 'None' back to missing values.
+                             # NOTE(review): Series.replace(x, None) has behaved
+                             # differently across pandas versions - verify
+                             # against the version this package pins.
+                             final_df[name] = final_df[name].astype(str).replace('nan', None).replace('None', None)
+                    # Store Variable Labels in df.attrs["labels"]
+                    labels = {col['name']: col.get('label', '') for col in columns_cfg if 'label' in col}
+                    final_df.attrs['labels'] = labels
+                    domain_dfs.append(final_df)
+            else:
+                # Non-functional block: nothing is produced for it.
+                continue
+        return domain_dfs

cdisc_builder/engine/classes/general.py ADDED Viewed

@@ -0,0 +1,111 @@
+import pandas as pd
+class GeneralProcessor:
+    """Build domain frames by pivoting a form's long-format rows and mapping columns."""
+    def process(self, domain_name, sources, df_long, default_keys):
+        """Return one mapped DataFrame per source block that has data.
+
+        Args:
+            domain_name: Domain code, used in log messages.
+            sources: Iterable of source-block dicts. Each needs 'formoid' (one
+                FormOID, or a list/tuple of them) and 'columns' (target name ->
+                source ItemOID string, or a dict with 'source', 'literal',
+                'type', 'value_mapping', 'max_missing_pct'); 'keys' is optional.
+            df_long: Long-format frame with FormOID, ItemOID, Value and the
+                key columns.
+            default_keys: Pivot index columns used when a block has no 'keys'.
+
+        Returns:
+            list: One DataFrame per block whose form filter matched rows and
+            whose pivot succeeded. Problems are printed and the block skipped.
+        """
+        domain_dfs = []
+        for settings in sources:
+            # 1. Filter by FormOID
+            form_oid = settings.get('formoid')
+            if not form_oid:
+                print(f"Warning: No formoid specified for a block in {domain_name}")
+                continue
+            try:
+                # A list of FormOIDs selects rows from any of those forms; a
+                # plain `==` against a list would raise or compare element-wise.
+                if isinstance(form_oid, (list, tuple)):
+                    form_mask = df_long['FormOID'].isin(list(form_oid))
+                else:
+                    form_mask = df_long['FormOID'] == form_oid
+                source_df = df_long[form_mask].copy()
+            except Exception as e:
+                print(f"Error filtering for {domain_name} (FormOID={form_oid}): {e}")
+                continue
+            if source_df.empty:
+                continue
+            # 2. Key columns for pivoting (use block keys or defaults)
+            keys = settings.get('keys', default_keys)
+            # 3. Pivot: one row per key combination, one column per ItemOID
+            try:
+                pivoted = source_df.pivot_table(
+                    index=keys,
+                    columns='ItemOID',
+                    values='Value',
+                    aggfunc='first'
+                ).reset_index()
+            except Exception as e:
+                print(f"Error pivoting for {domain_name}: {e}")
+                continue
+            # 4. Map columns
+            final_df = pd.DataFrame()
+            mappings = settings.get('columns', {})
+            for target_col, col_config in mappings.items():
+                # A column is configured either as a dict or as a bare string
+                # naming the source column.
+                if isinstance(col_config, dict):
+                    source_expr = col_config.get('source')
+                    literal_expr = col_config.get('literal')
+                    target_type = col_config.get('type')
+                    value_map = col_config.get('value_mapping')
+                else:
+                    source_expr = col_config
+                    literal_expr = None
+                    target_type = None
+                    value_map = None
+                # Extract Data
+                if literal_expr is not None:
+                    # Explicit literal value, repeated for every pivoted row
+                    series = pd.Series([literal_expr] * len(pivoted))
+                elif source_expr:
+                    if source_expr in pivoted.columns:
+                        series = pivoted[source_expr].copy()
+                    else:
+                        # Source defined but not found.
+                        print(f"Warning: Source column '{source_expr}' not found for '{domain_name}.{target_col}'. Filling with NaN.")
+                        series = pd.Series([None] * len(pivoted))
+                else:
+                    print(f"Warning: No source or literal defined for '{domain_name}.{target_col}'. Filling with NaN.")
+                    series = pd.Series([None] * len(pivoted))
+                # Apply Value Mapping
+                if value_map:
+                    series = series.replace(value_map)
+                # Apply Type Conversion
+                if target_type:
+                    try:
+                        if target_type == 'int':
+                            series = pd.to_numeric(series, errors='coerce').astype('Int64')
+                        elif target_type == 'float':
+                            series = pd.to_numeric(series, errors='coerce')
+                        elif target_type == 'str':
+                            # Stringify real values only: astype(str) alone
+                            # turns missing cells into the texts 'nan'/'None'.
+                            series = series.astype(str).where(series.notna())
+                        elif target_type == 'bool':
+                            # NOTE(review): astype(bool) treats every non-empty
+                            # string (including 'False' and '0') as True -
+                            # confirm this is what 'bool' columns expect.
+                            series = series.astype(bool)
+                    except Exception as e:
+                        print(f"Error converting {target_col} to {target_type}: {e}")
+                final_df[target_col] = series
+                # Validation: max_missing_pct
+                if isinstance(col_config, dict):
+                    max_missing = col_config.get('max_missing_pct')
+                    if max_missing is not None:
+                        missing_count = series.isna().sum()
+                        if target_type == 'str':
+                            # Source text that literally reads 'nan'/'None'
+                            # is still counted as missing.
+                            missing_count += (series.isin(['nan', 'None'])).sum()
+                        total = len(series)
+                        if total > 0:
+                            pct = (missing_count / total) * 100
+                            if pct > max_missing:
+                                print(f"WARNING: [Validation] {domain_name}.{target_col} missing {pct:.2f}% (Limit: {max_missing:})")
+            domain_dfs.append(final_df)
+        return domain_dfs

cdisc_builder/engine/config.py ADDED Viewed

@@ -0,0 +1,57 @@
+import yaml
+import os
+from .validate import load_schema, validate_domain_config
+def load_config(config_dir):
+    """Load and merge every YAML configuration file found in a directory.
+
+    Files are read in sorted name order so that the merged result does not
+    depend on the directory listing order. ``schema.yaml`` is skipped,
+    ``defaults.yaml`` is merged into ``config['defaults']``, and every other
+    top-level key is either a ``defaults`` section or a domain, which is
+    checked with validate_domain_config when a schema was loaded.
+
+    Args:
+        config_dir: Directory containing the ``*.yaml`` / ``*.yml`` files.
+
+    Returns:
+        dict: ``{'domains': {...}, 'defaults': {...}}``; both empty when the
+        directory does not exist. Files that fail to parse, are empty, or do
+        not contain a mapping are skipped.
+    """
+    config = {
+        'domains': {},
+        'defaults': {}
+    }
+    schema = load_schema()
+    if not os.path.exists(config_dir):
+        return config
+    for filename in sorted(os.listdir(config_dir)):
+        if not filename.endswith((".yaml", ".yml")):
+            continue
+        # Skip schema itself if present in same dir
+        if filename == "schema.yaml":
+            continue
+        file_path = os.path.join(config_dir, filename)
+        with open(file_path, "r", encoding="utf-8") as f:
+            try:
+                data = yaml.safe_load(f)
+            except yaml.YAMLError as exc:
+                print(f"Error parsing YAML file {filename}: {exc}")
+                continue
+        # safe_load returns None for an empty file and may return a list or
+        # scalar; neither can be merged, and .items()/.update() would raise.
+        if not isinstance(data, dict):
+            if data is not None:
+                print(f"Warning: {filename} does not contain a mapping. Skipping.")
+            continue
+        if filename == 'defaults.yaml':
+            # Defaults file - likely flat dict
+            config['defaults'].update(data)
+            continue
+        # Merge data
+        for key, value in data.items():
+            if key == 'defaults':
+                # Fallback if someone put defaults: inside another file;
+                # `or {}` tolerates an empty "defaults:" entry.
+                config['defaults'].update(value or {})
+            else:
+                # It's a domain: validate, but keep it even on failure.
+                if schema and not validate_domain_config(key, value, schema):
+                    print(f"Warning: {filename} failed schema validation. Proceeding with caution.")
+                config['domains'][key] = value
+    return config

cdisc_builder/engine/functions.py ADDED Viewed

@@ -0,0 +1,49 @@
+import pandas as pd
+def extract_value(df_long, form_oids, item_oids, return_col='Value', keys=None):
+    """Select the rows of given forms/items and return keys plus one data column.
+
+    Args:
+        df_long: The source long format dataframe (needs FormOID and ItemOID).
+        form_oids: FormOID, or list of FormOIDs, to keep. An empty/None value
+            matches nothing.
+        item_oids: ItemOID (string) or list of ItemOIDs to keep. An empty/None
+            value applies no item filter.
+        return_col: Name of the data column to return, 'Value' (default) or
+            'ItemOID'.
+        keys: List of key columns to include in front of the data column.
+
+    Returns:
+        DataFrame with the available key columns followed by ``return_col``
+        under its original name (the caller renames it). Each column appears
+        once, even when ``return_col`` is also listed in ``keys``. A column-less
+        empty DataFrame is returned when no row matches.
+    """
+    # 1. Normalize inputs
+    if not isinstance(form_oids, list):
+        form_oids = [form_oids] if form_oids else []
+    if isinstance(item_oids, str):
+        item_oids = [item_oids]
+    # 2. Filter Forms
+    subset = df_long[df_long['FormOID'].isin(form_oids)]
+    if subset.empty:
+        return pd.DataFrame()
+    # 3. Filter Items
+    if item_oids:
+        subset = subset[subset['ItemOID'].isin(item_oids)]
+    if subset.empty:
+        return pd.DataFrame()
+    # 4. Select Columns: keys first, then the data column. dict.fromkeys drops
+    # repeats while keeping order, so asking for a return_col that is also a
+    # key does not yield two identically named columns. Names missing from the
+    # frame are ignored.
+    wanted = list(keys) + [return_col] if keys else [return_col]
+    selected = [c for c in dict.fromkeys(wanted) if c in subset.columns]
+    return subset[selected].copy()

cdisc_builder/engine/processor.py ADDED Viewed

@@ -0,0 +1,46 @@
+import pandas as pd
+import os
+from .classes.general import GeneralProcessor
+from .classes.findings import FindingsProcessor
+def process_domain(domain_name, sources, df_long, default_keys, output_dir):
+    """Build one domain from its source blocks and write it as a Parquet file.
+
+    Args:
+        domain_name: Domain code; also the stem of the output file name.
+        sources: A single source-block dict or a list of them.
+        df_long: Long-format data handed to the processor.
+        default_keys: Key columns handed to the processor.
+        output_dir: Directory for ``<domain_name>.parquet``; created if absent.
+
+    Returns:
+        None. A warning is printed and nothing is written when there is no
+        configuration or the processor returns no frames.
+    """
+    # A lone dict is treated as a one-element list of blocks.
+    source_blocks = [sources] if isinstance(sources, dict) else sources
+    if not source_blocks:
+        print(f"Warning: No configuration found for {domain_name}")
+        return
+    # The first block decides the processor for the whole domain.
+    domain_type = source_blocks[0].get('type', "GENERAL")
+    if domain_type == 'FINDINGS':
+        # Two levels up from this module is the package directory that
+        # holds the "metadata" folder.
+        package_root = os.path.dirname(os.path.dirname(__file__))
+        processor = FindingsProcessor(os.path.join(package_root, "metadata", "test_codes.yaml"))
+    else:
+        processor = GeneralProcessor()
+    frames = processor.process(domain_name, source_blocks, df_long, default_keys)
+    if not frames:
+        print(f"Warning: No data found for domain {domain_name}")
+        return
+    # Stack the per-block frames into a single table for the domain.
+    combined_df = pd.concat(frames, ignore_index=True)
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    output_path = os.path.join(output_dir, f"{domain_name}.parquet")
+    combined_df.to_parquet(output_path, index=False)
+    print(f"Saved {domain_name} to {output_path} (Shape: {combined_df.shape})")