cdiscbuilder 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cdiscbuilder-1.2.0/src/cdiscbuilder.egg-info → cdiscbuilder-1.2.1}/PKG-INFO +1 -1
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/pyproject.toml +1 -1
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/__init__.py +1 -1
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/cli.py +15 -1
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/findings.py +6 -7
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/general.py +16 -7
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/processor.py +2 -2
- cdiscbuilder-1.2.1/src/cdiscbuilder/sdtm/engine/tests/test_boundary_standardization.py +135 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/odm_parser.py +40 -14
- cdiscbuilder-1.2.1/src/cdiscbuilder/sdtm/sdtm.py +71 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1/src/cdiscbuilder.egg-info}/PKG-INFO +1 -1
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/SOURCES.txt +1 -0
- cdiscbuilder-1.2.0/src/cdiscbuilder/sdtm/sdtm.py +0 -37
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/LICENSE +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/MANIFEST.in +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/README.md +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/setup.cfg +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/base.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/function_derivation.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/sql_derivation.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/engine.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/functions/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/functions/get_bmi.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/loaders/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/loaders/sdtm_loader.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/tests/test_engine.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/utils/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/utils/logger.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/adam_spec.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/merge_yaml.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/schema_validator.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/organization/adsl_common.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/project/adsl_project.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/schema.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/adam_study1.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/adsl_study1.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/final_adsl_study1.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/adam_study2.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/adsl_study2.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/final_adsl_study2.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_adam_spec.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_merge_yaml.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_schema_validator.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_validation/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_validation/data_validator.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/schema.yaml +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/base.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/events.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/interventions.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/special_purpose.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/config.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/functions.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_config.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_findings.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_general.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_metadata_extractor.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_processor.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_validate.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/validate.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/load.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/tests/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/tests/test_load.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/tlf/__init__.py +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/dependency_links.txt +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/entry_points.txt +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/requires.txt +0 -0
- {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "cdiscbuilder"
|
|
7
|
-
version = "1.2.0"
|
|
7
|
+
version = "1.2.1"
|
|
8
8
|
description = "A package to convert ODM XML to SDTM/ADaM Datasets"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{name = "Ming-Chun Chen", email = "hellomingchun@gmail.com"}]
|
|
@@ -30,10 +30,24 @@ def main():
|
|
|
30
30
|
|
|
31
31
|
args = parser.parse_args()
|
|
32
32
|
|
|
33
|
+
# Load defaults from configs directory if present
|
|
34
|
+
defaults = {}
|
|
35
|
+
if args.configs and os.path.exists(args.configs):
|
|
36
|
+
defaults_path = os.path.join(args.configs, "defaults.yaml")
|
|
37
|
+
if os.path.exists(defaults_path):
|
|
38
|
+
import yaml
|
|
39
|
+
try:
|
|
40
|
+
with open(defaults_path, "r") as f:
|
|
41
|
+
defaults = yaml.safe_load(f) or {}
|
|
42
|
+
except Exception as e:
|
|
43
|
+
print(f"Warning: Failed to load defaults.yaml from {defaults_path}: {e}")
|
|
44
|
+
|
|
45
|
+
xml_mapping = defaults.get("xml_mapping")
|
|
46
|
+
|
|
33
47
|
# Step 1: ODM XML -> Long CSV
|
|
34
48
|
print(f"--- Step 1: Parsing ODM XML from {args.xml} ---")
|
|
35
49
|
try:
|
|
36
|
-
df = parse_odm_to_long_df(args.xml)
|
|
50
|
+
df = parse_odm_to_long_df(args.xml, xml_mapping=xml_mapping)
|
|
37
51
|
print(f"Parsed {len(df)} rows.")
|
|
38
52
|
df.to_csv(args.csv, index=False)
|
|
39
53
|
print(f"Saved intermediate data to {args.csv}")
|
|
@@ -10,7 +10,7 @@ class FindingsProcessor:
|
|
|
10
10
|
def __init__(self):
|
|
11
11
|
self.class_name = "FINDINGS"
|
|
12
12
|
|
|
13
|
-
def process(self, domain_name, sources, df_long, default_keys):
|
|
13
|
+
def process(self, domain_name, sources, df_long, default_keys, custom_to_standard=None):
|
|
14
14
|
domain_dfs = []
|
|
15
15
|
|
|
16
16
|
for settings in sources:
|
|
@@ -20,9 +20,8 @@ class FindingsProcessor:
|
|
|
20
20
|
if form_oid:
|
|
21
21
|
if "FormOID" not in source_df.columns:
|
|
22
22
|
print(
|
|
23
|
-
f"Warning: 'FormOID' column missing in source data
|
|
23
|
+
f"Warning: 'FormOID' column missing in source data. Skipping FormOID filtering."
|
|
24
24
|
)
|
|
25
|
-
source_df = pd.DataFrame(columns=source_df.columns)
|
|
26
25
|
else:
|
|
27
26
|
if isinstance(form_oid, list):
|
|
28
27
|
source_df = source_df[source_df["FormOID"].isin(form_oid)]
|
|
@@ -34,9 +33,8 @@ class FindingsProcessor:
|
|
|
34
33
|
if item_group_match:
|
|
35
34
|
if "ItemGroupOID" not in source_df.columns:
|
|
36
35
|
print(
|
|
37
|
-
f"Warning: 'ItemGroupOID' column missing in source data
|
|
36
|
+
f"Warning: 'ItemGroupOID' column missing in source data. Skipping ItemGroupOID filtering."
|
|
38
37
|
)
|
|
39
|
-
source_df = pd.DataFrame(columns=source_df.columns)
|
|
40
38
|
else:
|
|
41
39
|
source_df = source_df[
|
|
42
40
|
source_df["ItemGroupOID"].str.match(item_group_match, na=False)
|
|
@@ -47,9 +45,8 @@ class FindingsProcessor:
|
|
|
47
45
|
if item_oid_match:
|
|
48
46
|
if "ItemOID" not in source_df.columns:
|
|
49
47
|
print(
|
|
50
|
-
f"Warning: 'ItemOID' column missing in source data
|
|
48
|
+
f"Warning: 'ItemOID' column missing in source data. Skipping ItemOID filtering."
|
|
51
49
|
)
|
|
52
|
-
source_df = pd.DataFrame(columns=source_df.columns)
|
|
53
50
|
else:
|
|
54
51
|
source_df = source_df[
|
|
55
52
|
source_df["ItemOID"].str.match(item_oid_match, na=False)
|
|
@@ -60,6 +57,8 @@ class FindingsProcessor:
|
|
|
60
57
|
|
|
61
58
|
# 3. Create Base DataFrame (No Pivot)
|
|
62
59
|
keys = settings.get("keys", default_keys)
|
|
60
|
+
if custom_to_standard:
|
|
61
|
+
keys = [custom_to_standard.get(k, k) for k in keys]
|
|
63
62
|
|
|
64
63
|
base_cols = keys + ["ItemOID", "Value"]
|
|
65
64
|
if "Question" in source_df.columns:
|
|
@@ -59,7 +59,7 @@ class GeneralProcessor:
|
|
|
59
59
|
|
|
60
60
|
return expanded_list
|
|
61
61
|
|
|
62
|
-
def process(self, domain_name, sources, df_long, default_keys):
|
|
62
|
+
def process(self, domain_name, sources, df_long, default_keys, custom_to_standard=None):
|
|
63
63
|
domain_dfs = []
|
|
64
64
|
|
|
65
65
|
# Pre-expand sources if they contain lists
|
|
@@ -76,25 +76,34 @@ class GeneralProcessor:
|
|
|
76
76
|
form_oid = settings.get("formoid")
|
|
77
77
|
if form_oid:
|
|
78
78
|
try:
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
if "FormOID" in df_long.columns:
|
|
80
|
+
# Filter for specific FormOID(s)
|
|
81
|
+
if isinstance(form_oid, list):
|
|
82
|
+
source_df = df_long[df_long["FormOID"].isin(form_oid)].copy()
|
|
83
|
+
else:
|
|
84
|
+
source_df = df_long[df_long["FormOID"] == form_oid].copy()
|
|
82
85
|
else:
|
|
83
|
-
|
|
86
|
+
print(f"Warning: 'FormOID' column missing in source data. Skipping FormOID filtering.")
|
|
87
|
+
source_df = df_long.copy()
|
|
84
88
|
except Exception as e:
|
|
85
89
|
print(
|
|
86
90
|
f"Error filtering for {domain_name} (FormOID={form_oid}): {e}"
|
|
87
91
|
)
|
|
88
92
|
continue
|
|
89
93
|
else:
|
|
90
|
-
|
|
91
|
-
|
|
94
|
+
if "FormOID" in df_long.columns:
|
|
95
|
+
print(f"Warning: No formoid specified for a block in {domain_name}")
|
|
96
|
+
continue
|
|
97
|
+
else:
|
|
98
|
+
source_df = df_long.copy()
|
|
92
99
|
|
|
93
100
|
if source_df.empty:
|
|
94
101
|
continue
|
|
95
102
|
|
|
96
103
|
# 2. Key columns for pivoting (use block keys or defaults)
|
|
97
104
|
keys = settings.get("keys", default_keys)
|
|
105
|
+
if custom_to_standard:
|
|
106
|
+
keys = [custom_to_standard.get(k, k) for k in keys]
|
|
98
107
|
|
|
99
108
|
# 3. Pivot
|
|
100
109
|
try:
|
|
@@ -8,7 +8,7 @@ from .classes.findings import FindingsProcessor
|
|
|
8
8
|
from .classes.special_purpose import SpecialPurposeProcessor
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def process_domain(domain_name, sources, df_long, default_keys, output_dir):
|
|
11
|
+
def process_domain(domain_name, sources, df_long, default_keys, output_dir, custom_to_standard=None):
|
|
12
12
|
# Normalize to list
|
|
13
13
|
if isinstance(sources, dict):
|
|
14
14
|
sources = [sources]
|
|
@@ -31,7 +31,7 @@ def process_domain(domain_name, sources, df_long, default_keys, output_dir):
|
|
|
31
31
|
else:
|
|
32
32
|
processor = GeneralProcessor()
|
|
33
33
|
|
|
34
|
-
domain_dfs = processor.process(domain_name, sources, df_long, default_keys)
|
|
34
|
+
domain_dfs = processor.process(domain_name, sources, df_long, default_keys, custom_to_standard=custom_to_standard)
|
|
35
35
|
|
|
36
36
|
if not domain_dfs:
|
|
37
37
|
print(f"Warning: No data found for domain {domain_name}")
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import pytest
|
|
5
|
+
import yaml
|
|
6
|
+
from cdiscbuilder.sdtm.odm_parser import parse_odm_to_long_df
|
|
7
|
+
from cdiscbuilder.sdtm.sdtm import create_sdtm_datasets
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_parse_odm_to_long_df_boundary_standardization():
|
|
11
|
+
# Simple XML string with Medidata style attributes
|
|
12
|
+
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
13
|
+
<ODM>
|
|
14
|
+
<ClinicalData StudyOID="STUDY_XYZ">
|
|
15
|
+
<SubjectData SubjectKey="SUBJ123">
|
|
16
|
+
<StudyEventData StudyEventOID="SE_VISIT" StartDate="2026-05-29">
|
|
17
|
+
<FormData FormOID="F_AE">
|
|
18
|
+
<ItemGroupData ItemGroupOID="IG_AE" RecordPosition="2">
|
|
19
|
+
<ItemData ItemOID="AE_TERM" Value="Headache"/>
|
|
20
|
+
</ItemGroupData>
|
|
21
|
+
</FormData>
|
|
22
|
+
</StudyEventData>
|
|
23
|
+
</SubjectData>
|
|
24
|
+
</ClinicalData>
|
|
25
|
+
</ODM>"""
|
|
26
|
+
|
|
27
|
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as f:
|
|
28
|
+
f.write(xml_content)
|
|
29
|
+
temp_xml_path = f.name
|
|
30
|
+
|
|
31
|
+
try:
|
|
32
|
+
# Test parsing with custom xml_mapping.
|
|
33
|
+
# Should map the RecordPosition attribute to standard ItemGroupRepeatKey column in df
|
|
34
|
+
xml_mapping = {"item_group_repeat_key": "RecordPosition"}
|
|
35
|
+
|
|
36
|
+
df = parse_odm_to_long_df(temp_xml_path, xml_mapping=xml_mapping)
|
|
37
|
+
|
|
38
|
+
# Verify output columns are standard (no RecordPosition column is outputted;
|
|
39
|
+
# it is standardized to ItemGroupRepeatKey)
|
|
40
|
+
assert "ItemGroupRepeatKey" in df.columns
|
|
41
|
+
assert "RecordPosition" not in df.columns
|
|
42
|
+
assert df.iloc[0]["ItemGroupRepeatKey"] == "2"
|
|
43
|
+
finally:
|
|
44
|
+
try:
|
|
45
|
+
os.remove(temp_xml_path)
|
|
46
|
+
except Exception:
|
|
47
|
+
pass
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_create_sdtm_datasets_boundary_standardization(tmp_path):
|
|
51
|
+
# df_long with custom columns: RecordPosition, SubjectID, and no FormOID
|
|
52
|
+
df_long = pd.DataFrame(
|
|
53
|
+
[
|
|
54
|
+
{
|
|
55
|
+
"StudyOID": "STUDY01",
|
|
56
|
+
"SubjectID": "001",
|
|
57
|
+
"RecordPosition": 1,
|
|
58
|
+
"ItemOID": "AETERM",
|
|
59
|
+
"Value": "Headache",
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"StudyOID": "STUDY01",
|
|
63
|
+
"SubjectID": "001",
|
|
64
|
+
"RecordPosition": 1,
|
|
65
|
+
"ItemOID": "AESTDTC",
|
|
66
|
+
"Value": "2026-05-29",
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"StudyOID": "STUDY01",
|
|
70
|
+
"SubjectID": "001",
|
|
71
|
+
"RecordPosition": 2,
|
|
72
|
+
"ItemOID": "AETERM",
|
|
73
|
+
"Value": "Nausea",
|
|
74
|
+
},
|
|
75
|
+
{
|
|
76
|
+
"StudyOID": "STUDY01",
|
|
77
|
+
"SubjectID": "001",
|
|
78
|
+
"RecordPosition": 2,
|
|
79
|
+
"ItemOID": "AESTDTC",
|
|
80
|
+
"Value": "2026-05-30",
|
|
81
|
+
},
|
|
82
|
+
]
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
input_csv = tmp_path / "long.csv"
|
|
86
|
+
df_long.to_csv(input_csv, index=False)
|
|
87
|
+
|
|
88
|
+
# Create domain spec directory
|
|
89
|
+
spec_dir = tmp_path / "specs"
|
|
90
|
+
os.makedirs(spec_dir)
|
|
91
|
+
|
|
92
|
+
# Create defaults.yaml
|
|
93
|
+
defaults = {
|
|
94
|
+
"keys": ["StudyOID", "SubjectID", "RecordPosition"],
|
|
95
|
+
"csv_columns": {
|
|
96
|
+
"item_group_repeat_key": "RecordPosition",
|
|
97
|
+
"subject_key": "SubjectID",
|
|
98
|
+
},
|
|
99
|
+
}
|
|
100
|
+
with open(spec_dir / "defaults.yaml", "w") as f:
|
|
101
|
+
yaml.dump(defaults, f)
|
|
102
|
+
|
|
103
|
+
# Create AE.yaml
|
|
104
|
+
ae_config = {
|
|
105
|
+
"AE": [
|
|
106
|
+
{
|
|
107
|
+
# Note: no formoid specified! So we test proceeding without FormOID filtering.
|
|
108
|
+
"type": "events",
|
|
109
|
+
"columns": {
|
|
110
|
+
"STUDYID": {"source": "StudyOID"},
|
|
111
|
+
"USUBJID": {"source": "SubjectKey"},
|
|
112
|
+
"AETERM": {"source": "AETERM"},
|
|
113
|
+
"AESTDTC": {"source": "AESTDTC"},
|
|
114
|
+
"AESEQ": {"group": ["USUBJID"], "sort_by": ["AESTDTC"]},
|
|
115
|
+
},
|
|
116
|
+
}
|
|
117
|
+
]
|
|
118
|
+
}
|
|
119
|
+
with open(spec_dir / "AE.yaml", "w") as f:
|
|
120
|
+
yaml.dump(ae_config, f)
|
|
121
|
+
|
|
122
|
+
output_dir = tmp_path / "sdtm_out"
|
|
123
|
+
|
|
124
|
+
# Generate datasets
|
|
125
|
+
create_sdtm_datasets(str(spec_dir), str(input_csv), str(output_dir))
|
|
126
|
+
|
|
127
|
+
# Verify dataset exists
|
|
128
|
+
out_file = output_dir / "AE.parquet"
|
|
129
|
+
assert out_file.exists()
|
|
130
|
+
|
|
131
|
+
res_df = pd.read_parquet(out_file)
|
|
132
|
+
assert len(res_df) == 2
|
|
133
|
+
assert "AETERM" in res_df.columns
|
|
134
|
+
assert list(res_df["AETERM"]) == ["Headache", "Nausea"]
|
|
135
|
+
assert list(res_df["AESEQ"]) == [1, 2]
|
|
@@ -2,7 +2,26 @@ import xml.etree.ElementTree as ET
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
DEFAULT_XML_MAPPING = {
|
|
6
|
+
"study_oid": "StudyOID",
|
|
7
|
+
"subject_key": "SubjectKey",
|
|
8
|
+
"study_subject_id": "StudySubjectID",
|
|
9
|
+
"study_event_oid": "StudyEventOID",
|
|
10
|
+
"study_event_repeat_key": "StudyEventRepeatKey",
|
|
11
|
+
"study_event_start_date": "StartDate",
|
|
12
|
+
"form_oid": "FormOID",
|
|
13
|
+
"item_group_oid": "ItemGroupOID",
|
|
14
|
+
"item_group_repeat_key": "ItemGroupRepeatKey",
|
|
15
|
+
"item_oid": "ItemOID",
|
|
16
|
+
"value": "Value",
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def parse_odm_to_long_df(xml_file, xml_mapping=None):
|
|
21
|
+
merged_xml_mapping = DEFAULT_XML_MAPPING.copy()
|
|
22
|
+
if xml_mapping:
|
|
23
|
+
merged_xml_mapping.update(xml_mapping)
|
|
24
|
+
|
|
6
25
|
try:
|
|
7
26
|
tree = ET.parse(xml_file)
|
|
8
27
|
root = tree.getroot()
|
|
@@ -42,13 +61,15 @@ def parse_odm_to_long_df(xml_file):
|
|
|
42
61
|
|
|
43
62
|
for cd in root:
|
|
44
63
|
if get_local_name(cd.tag) == "ClinicalData":
|
|
45
|
-
study_oid = cd.get("StudyOID")
|
|
64
|
+
study_oid = cd.get(merged_xml_mapping["study_oid"])
|
|
46
65
|
for sd in cd:
|
|
47
66
|
if get_local_name(sd.tag) == "SubjectData":
|
|
48
|
-
subject_key = sd.get("SubjectKey")
|
|
67
|
+
subject_key = sd.get(merged_xml_mapping["subject_key"])
|
|
49
68
|
|
|
50
69
|
# Helper for attributes
|
|
51
70
|
def get_attrib(elem, partial_name):
|
|
71
|
+
if not partial_name:
|
|
72
|
+
return None
|
|
52
73
|
if partial_name in elem.attrib:
|
|
53
74
|
return elem.attrib[partial_name]
|
|
54
75
|
for k, v in elem.attrib.items():
|
|
@@ -56,41 +77,46 @@ def parse_odm_to_long_df(xml_file):
|
|
|
56
77
|
return v
|
|
57
78
|
return None
|
|
58
79
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
80
|
+
study_subject_id_attr = merged_xml_mapping["study_subject_id"]
|
|
81
|
+
study_subject_id = get_attrib(sd, study_subject_id_attr)
|
|
82
|
+
if not study_subject_id and study_subject_id_attr:
|
|
83
|
+
# try case-insensitive or fallback
|
|
84
|
+
study_subject_id = get_attrib(sd, study_subject_id_attr.lower())
|
|
85
|
+
if not study_subject_id:
|
|
86
|
+
study_subject_id = get_attrib(sd, "studysubjectid")
|
|
87
|
+
|
|
62
88
|
if not subject_key:
|
|
63
89
|
subject_key = study_subject_id
|
|
64
90
|
|
|
65
91
|
for child in sd:
|
|
66
92
|
tag = get_local_name(child.tag)
|
|
67
93
|
if tag == "StudyEventData":
|
|
68
|
-
study_event_oid = child.get("StudyEventOID")
|
|
94
|
+
study_event_oid = child.get(merged_xml_mapping["study_event_oid"])
|
|
69
95
|
study_event_repeat_key = (
|
|
70
|
-
child.get("
|
|
96
|
+
child.get(merged_xml_mapping["study_event_repeat_key"]) or "1"
|
|
71
97
|
)
|
|
72
98
|
|
|
73
99
|
# Extract Namespaced StartDate
|
|
74
|
-
start_date = get_attrib(child, "
|
|
100
|
+
start_date = get_attrib(child, merged_xml_mapping["study_event_start_date"]) or ""
|
|
75
101
|
|
|
76
102
|
for form in child:
|
|
77
103
|
f_tag = get_local_name(form.tag)
|
|
78
104
|
if f_tag == "FormData":
|
|
79
|
-
form_oid = form.get("FormOID")
|
|
105
|
+
form_oid = form.get(merged_xml_mapping["form_oid"])
|
|
80
106
|
|
|
81
107
|
for ig in form:
|
|
82
108
|
ig_tag = get_local_name(ig.tag)
|
|
83
109
|
if ig_tag == "ItemGroupData":
|
|
84
|
-
item_group_oid = ig.get("ItemGroupOID")
|
|
110
|
+
item_group_oid = ig.get(merged_xml_mapping["item_group_oid"])
|
|
85
111
|
item_group_repeat_key = ig.get(
|
|
86
|
-
"
|
|
112
|
+
merged_xml_mapping["item_group_repeat_key"]
|
|
87
113
|
)
|
|
88
114
|
|
|
89
115
|
for item in ig:
|
|
90
116
|
i_tag = get_local_name(item.tag)
|
|
91
117
|
if i_tag == "ItemData":
|
|
92
|
-
item_oid = item.get("ItemOID")
|
|
93
|
-
value = item.get("Value")
|
|
118
|
+
item_oid = item.get(merged_xml_mapping["item_oid"])
|
|
119
|
+
value = item.get(merged_xml_mapping["value"])
|
|
94
120
|
|
|
95
121
|
meta = item_metadata.get(
|
|
96
122
|
item_oid, {}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from .engine.config import load_config
|
|
3
|
+
from .engine.processor import process_domain
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def create_sdtm_datasets(config_input, input_csv, output_dir):
|
|
7
|
+
if isinstance(config_input, dict):
|
|
8
|
+
config = config_input
|
|
9
|
+
# We assume it's already structured correctly or validated
|
|
10
|
+
else:
|
|
11
|
+
config = load_config(config_input)
|
|
12
|
+
|
|
13
|
+
# Get global defaults
|
|
14
|
+
defaults = config.get("defaults", {})
|
|
15
|
+
default_keys = defaults.get(
|
|
16
|
+
"keys", ["StudyOID", "SubjectKey", "ItemGroupRepeatKey", "StudyEventOID"]
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
print(f"Loading data from {input_csv}...")
|
|
20
|
+
df_long = pd.read_csv(input_csv)
|
|
21
|
+
|
|
22
|
+
# Invert mapping to go from custom CSV column name -> standard logical column name
|
|
23
|
+
# e.g., "RecordPosition" -> "ItemGroupRepeatKey"
|
|
24
|
+
csv_columns = defaults.get("csv_columns") or {}
|
|
25
|
+
STANDARD_LOGICAL_COLUMNS = {
|
|
26
|
+
"study_oid": "StudyOID",
|
|
27
|
+
"subject_key": "SubjectKey",
|
|
28
|
+
"study_subject_id": "StudySubjectID",
|
|
29
|
+
"study_event_oid": "StudyEventOID",
|
|
30
|
+
"study_event_repeat_key": "StudyEventRepeatKey",
|
|
31
|
+
"study_event_start_date": "StudyEventStartDate",
|
|
32
|
+
"form_oid": "FormOID",
|
|
33
|
+
"item_group_oid": "ItemGroupOID",
|
|
34
|
+
"item_group_repeat_key": "ItemGroupRepeatKey",
|
|
35
|
+
"item_oid": "ItemOID",
|
|
36
|
+
"value": "Value",
|
|
37
|
+
"question": "Question",
|
|
38
|
+
"item_name": "ItemName",
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
custom_to_standard = {}
|
|
42
|
+
rename_map = {}
|
|
43
|
+
for logical_key, custom_col in csv_columns.items():
|
|
44
|
+
if logical_key in STANDARD_LOGICAL_COLUMNS:
|
|
45
|
+
standard_col = STANDARD_LOGICAL_COLUMNS[logical_key]
|
|
46
|
+
rename_map[custom_col] = standard_col
|
|
47
|
+
custom_to_standard[custom_col] = standard_col
|
|
48
|
+
|
|
49
|
+
# Perform rename if map is not empty
|
|
50
|
+
if rename_map:
|
|
51
|
+
df_long.rename(columns=rename_map, inplace=True)
|
|
52
|
+
# Translate default_keys to match the standardized DataFrame
|
|
53
|
+
default_keys = [custom_to_standard.get(k, k) for k in default_keys]
|
|
54
|
+
|
|
55
|
+
# Prioritize DM domain processing
|
|
56
|
+
domains = list(config["domains"].keys())
|
|
57
|
+
if "DM" in domains:
|
|
58
|
+
domains.remove("DM")
|
|
59
|
+
domains.insert(0, "DM")
|
|
60
|
+
|
|
61
|
+
for domain in domains:
|
|
62
|
+
settings_entry = config["domains"][domain]
|
|
63
|
+
print(f"Processing domain: {domain}")
|
|
64
|
+
|
|
65
|
+
# Normalize to list.
|
|
66
|
+
if isinstance(settings_entry, list):
|
|
67
|
+
sources = settings_entry
|
|
68
|
+
else:
|
|
69
|
+
sources = [settings_entry]
|
|
70
|
+
|
|
71
|
+
process_domain(domain, sources, df_long, default_keys, output_dir, custom_to_standard=custom_to_standard)
|
|
@@ -59,6 +59,7 @@ src/cdiscbuilder/sdtm/engine/classes/findings.py
|
|
|
59
59
|
src/cdiscbuilder/sdtm/engine/classes/general.py
|
|
60
60
|
src/cdiscbuilder/sdtm/engine/classes/interventions.py
|
|
61
61
|
src/cdiscbuilder/sdtm/engine/classes/special_purpose.py
|
|
62
|
+
src/cdiscbuilder/sdtm/engine/tests/test_boundary_standardization.py
|
|
62
63
|
src/cdiscbuilder/sdtm/engine/tests/test_config.py
|
|
63
64
|
src/cdiscbuilder/sdtm/engine/tests/test_findings.py
|
|
64
65
|
src/cdiscbuilder/sdtm/engine/tests/test_general.py
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from .engine.config import load_config
|
|
3
|
-
from .engine.processor import process_domain
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def create_sdtm_datasets(config_input, input_csv, output_dir):
|
|
7
|
-
if isinstance(config_input, dict):
|
|
8
|
-
config = config_input
|
|
9
|
-
# We assume it's already structured correctly or validated
|
|
10
|
-
else:
|
|
11
|
-
config = load_config(config_input)
|
|
12
|
-
|
|
13
|
-
# Get global defaults
|
|
14
|
-
default_keys = config.get("defaults", {}).get(
|
|
15
|
-
"keys", ["StudyOID", "SubjectKey", "ItemGroupRepeatKey", "StudyEventOID"]
|
|
16
|
-
)
|
|
17
|
-
|
|
18
|
-
print(f"Loading data from {input_csv}...")
|
|
19
|
-
df_long = pd.read_csv(input_csv)
|
|
20
|
-
|
|
21
|
-
# Prioritize DM domain processing
|
|
22
|
-
domains = list(config["domains"].keys())
|
|
23
|
-
if "DM" in domains:
|
|
24
|
-
domains.remove("DM")
|
|
25
|
-
domains.insert(0, "DM")
|
|
26
|
-
|
|
27
|
-
for domain in domains:
|
|
28
|
-
settings_entry = config["domains"][domain]
|
|
29
|
-
print(f"Processing domain: {domain}")
|
|
30
|
-
|
|
31
|
-
# Normalize to list.
|
|
32
|
-
if isinstance(settings_entry, list):
|
|
33
|
-
sources = settings_entry
|
|
34
|
-
else:
|
|
35
|
-
sources = [settings_entry]
|
|
36
|
-
|
|
37
|
-
process_domain(domain, sources, df_long, default_keys, output_dir)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/functions/get_bmi.py
RENAMED
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/loaders/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/tests/test_engine.py
RENAMED
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/utils/__init__.py
RENAMED
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/utils/logger.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/schema_validator.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_adam_spec.py
RENAMED
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_merge_yaml.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_validation/data_validator.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/interventions.py
RENAMED
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/special_purpose.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_findings.py
RENAMED
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_general.py
RENAMED
|
File without changes
|
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_processor.py
RENAMED
|
File without changes
|
{cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_validate.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|