cdiscbuilder 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. {cdiscbuilder-1.2.0/src/cdiscbuilder.egg-info → cdiscbuilder-1.2.1}/PKG-INFO +1 -1
  2. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/pyproject.toml +1 -1
  3. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/__init__.py +1 -1
  4. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/cli.py +15 -1
  5. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/findings.py +6 -7
  6. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/general.py +16 -7
  7. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/processor.py +2 -2
  8. cdiscbuilder-1.2.1/src/cdiscbuilder/sdtm/engine/tests/test_boundary_standardization.py +135 -0
  9. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/odm_parser.py +40 -14
  10. cdiscbuilder-1.2.1/src/cdiscbuilder/sdtm/sdtm.py +71 -0
  11. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1/src/cdiscbuilder.egg-info}/PKG-INFO +1 -1
  12. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/SOURCES.txt +1 -0
  13. cdiscbuilder-1.2.0/src/cdiscbuilder/sdtm/sdtm.py +0 -37
  14. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/LICENSE +0 -0
  15. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/MANIFEST.in +0 -0
  16. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/README.md +0 -0
  17. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/setup.cfg +0 -0
  18. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/__init__.py +0 -0
  19. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/__init__.py +0 -0
  20. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/__init__.py +0 -0
  21. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/base.py +0 -0
  22. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/function_derivation.py +0 -0
  23. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/derivations/sql_derivation.py +0 -0
  24. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/engine.py +0 -0
  25. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/functions/__init__.py +0 -0
  26. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/functions/get_bmi.py +0 -0
  27. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/loaders/__init__.py +0 -0
  28. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/loaders/sdtm_loader.py +0 -0
  29. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/tests/test_engine.py +0 -0
  30. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/utils/__init__.py +0 -0
  31. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_derivation/utils/logger.py +0 -0
  32. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/__init__.py +0 -0
  33. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/adam_spec.py +0 -0
  34. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/merge_yaml.py +0 -0
  35. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/schema_validator.py +0 -0
  36. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/__init__.py +0 -0
  37. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/organization/adsl_common.yaml +0 -0
  38. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/project/adsl_project.yaml +0 -0
  39. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/schema.yaml +0 -0
  40. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/adam_study1.yaml +0 -0
  41. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/adsl_study1.yaml +0 -0
  42. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/final_adsl_study1.yaml +0 -0
  43. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/adam_study2.yaml +0 -0
  44. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/adsl_study2.yaml +0 -0
  45. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/final_adsl_study2.yaml +0 -0
  46. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_adam_spec.py +0 -0
  47. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_merge_yaml.py +0 -0
  48. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_spec/tests/test_schema_validator.py +0 -0
  49. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_validation/__init__.py +0 -0
  50. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/adam_validation/data_validator.py +0 -0
  51. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/adam/schema.yaml +0 -0
  52. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/__init__.py +0 -0
  53. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/__init__.py +0 -0
  54. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/__init__.py +0 -0
  55. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/base.py +0 -0
  56. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/events.py +0 -0
  57. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/interventions.py +0 -0
  58. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/classes/special_purpose.py +0 -0
  59. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/config.py +0 -0
  60. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/functions.py +0 -0
  61. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_config.py +0 -0
  62. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_findings.py +0 -0
  63. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_general.py +0 -0
  64. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_metadata_extractor.py +0 -0
  65. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_processor.py +0 -0
  66. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/tests/test_validate.py +0 -0
  67. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/engine/validate.py +0 -0
  68. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/__init__.py +0 -0
  69. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/load.py +0 -0
  70. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/tests/__init__.py +0 -0
  71. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/sdtm/loader/tests/test_load.py +0 -0
  72. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder/tlf/__init__.py +0 -0
  73. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/dependency_links.txt +0 -0
  74. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/entry_points.txt +0 -0
  75. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/requires.txt +0 -0
  76. {cdiscbuilder-1.2.0 → cdiscbuilder-1.2.1}/src/cdiscbuilder.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cdiscbuilder
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: A package to convert ODM XML to SDTM/ADaM Datasets
5
5
  Author-email: Ming-Chun Chen <hellomingchun@gmail.com>
6
6
  Requires-Python: >=3.9
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "cdiscbuilder"
7
- version = "1.2.0"
7
+ version = "1.2.1"
8
8
  description = "A package to convert ODM XML to SDTM/ADaM Datasets"
9
9
  readme = "README.md"
10
10
  authors = [{name = "Ming-Chun Chen", email = "hellomingchun@gmail.com"}]
@@ -1,4 +1,4 @@
1
- __version__ = "1.2.0"
1
+ __version__ = "1.2.1"
2
2
 
3
3
  from . import adam as adam
4
4
  from . import tlf as tlf
@@ -30,10 +30,24 @@ def main():
30
30
 
31
31
  args = parser.parse_args()
32
32
 
33
+ # Load defaults from configs directory if present
34
+ defaults = {}
35
+ if args.configs and os.path.exists(args.configs):
36
+ defaults_path = os.path.join(args.configs, "defaults.yaml")
37
+ if os.path.exists(defaults_path):
38
+ import yaml
39
+ try:
40
+ with open(defaults_path, "r") as f:
41
+ defaults = yaml.safe_load(f) or {}
42
+ except Exception as e:
43
+ print(f"Warning: Failed to load defaults.yaml from {defaults_path}: {e}")
44
+
45
+ xml_mapping = defaults.get("xml_mapping")
46
+
33
47
  # Step 1: ODM XML -> Long CSV
34
48
  print(f"--- Step 1: Parsing ODM XML from {args.xml} ---")
35
49
  try:
36
- df = parse_odm_to_long_df(args.xml)
50
+ df = parse_odm_to_long_df(args.xml, xml_mapping=xml_mapping)
37
51
  print(f"Parsed {len(df)} rows.")
38
52
  df.to_csv(args.csv, index=False)
39
53
  print(f"Saved intermediate data to {args.csv}")
@@ -10,7 +10,7 @@ class FindingsProcessor:
10
10
  def __init__(self):
11
11
  self.class_name = "FINDINGS"
12
12
 
13
- def process(self, domain_name, sources, df_long, default_keys):
13
+ def process(self, domain_name, sources, df_long, default_keys, custom_to_standard=None):
14
14
  domain_dfs = []
15
15
 
16
16
  for settings in sources:
@@ -20,9 +20,8 @@ class FindingsProcessor:
20
20
  if form_oid:
21
21
  if "FormOID" not in source_df.columns:
22
22
  print(
23
- f"Warning: 'FormOID' column missing in source data for domain {domain_name}"
23
+ f"Warning: 'FormOID' column missing in source data. Skipping FormOID filtering."
24
24
  )
25
- source_df = pd.DataFrame(columns=source_df.columns)
26
25
  else:
27
26
  if isinstance(form_oid, list):
28
27
  source_df = source_df[source_df["FormOID"].isin(form_oid)]
@@ -34,9 +33,8 @@ class FindingsProcessor:
34
33
  if item_group_match:
35
34
  if "ItemGroupOID" not in source_df.columns:
36
35
  print(
37
- f"Warning: 'ItemGroupOID' column missing in source data for domain {domain_name}"
36
+ f"Warning: 'ItemGroupOID' column missing in source data. Skipping ItemGroupOID filtering."
38
37
  )
39
- source_df = pd.DataFrame(columns=source_df.columns)
40
38
  else:
41
39
  source_df = source_df[
42
40
  source_df["ItemGroupOID"].str.match(item_group_match, na=False)
@@ -47,9 +45,8 @@ class FindingsProcessor:
47
45
  if item_oid_match:
48
46
  if "ItemOID" not in source_df.columns:
49
47
  print(
50
- f"Warning: 'ItemOID' column missing in source data for domain {domain_name}"
48
+ f"Warning: 'ItemOID' column missing in source data. Skipping ItemOID filtering."
51
49
  )
52
- source_df = pd.DataFrame(columns=source_df.columns)
53
50
  else:
54
51
  source_df = source_df[
55
52
  source_df["ItemOID"].str.match(item_oid_match, na=False)
@@ -60,6 +57,8 @@ class FindingsProcessor:
60
57
 
61
58
  # 3. Create Base DataFrame (No Pivot)
62
59
  keys = settings.get("keys", default_keys)
60
+ if custom_to_standard:
61
+ keys = [custom_to_standard.get(k, k) for k in keys]
63
62
 
64
63
  base_cols = keys + ["ItemOID", "Value"]
65
64
  if "Question" in source_df.columns:
@@ -59,7 +59,7 @@ class GeneralProcessor:
59
59
 
60
60
  return expanded_list
61
61
 
62
- def process(self, domain_name, sources, df_long, default_keys):
62
+ def process(self, domain_name, sources, df_long, default_keys, custom_to_standard=None):
63
63
  domain_dfs = []
64
64
 
65
65
  # Pre-expand sources if they contain lists
@@ -76,25 +76,34 @@ class GeneralProcessor:
76
76
  form_oid = settings.get("formoid")
77
77
  if form_oid:
78
78
  try:
79
- # Filter for specific FormOID(s)
80
- if isinstance(form_oid, list):
81
- source_df = df_long[df_long["FormOID"].isin(form_oid)].copy()
79
+ if "FormOID" in df_long.columns:
80
+ # Filter for specific FormOID(s)
81
+ if isinstance(form_oid, list):
82
+ source_df = df_long[df_long["FormOID"].isin(form_oid)].copy()
83
+ else:
84
+ source_df = df_long[df_long["FormOID"] == form_oid].copy()
82
85
  else:
83
- source_df = df_long[df_long["FormOID"] == form_oid].copy()
86
+ print(f"Warning: 'FormOID' column missing in source data. Skipping FormOID filtering.")
87
+ source_df = df_long.copy()
84
88
  except Exception as e:
85
89
  print(
86
90
  f"Error filtering for {domain_name} (FormOID={form_oid}): {e}"
87
91
  )
88
92
  continue
89
93
  else:
90
- print(f"Warning: No formoid specified for a block in {domain_name}")
91
- continue
94
+ if "FormOID" in df_long.columns:
95
+ print(f"Warning: No formoid specified for a block in {domain_name}")
96
+ continue
97
+ else:
98
+ source_df = df_long.copy()
92
99
 
93
100
  if source_df.empty:
94
101
  continue
95
102
 
96
103
  # 2. Key columns for pivoting (use block keys or defaults)
97
104
  keys = settings.get("keys", default_keys)
105
+ if custom_to_standard:
106
+ keys = [custom_to_standard.get(k, k) for k in keys]
98
107
 
99
108
  # 3. Pivot
100
109
  try:
@@ -8,7 +8,7 @@ from .classes.findings import FindingsProcessor
8
8
  from .classes.special_purpose import SpecialPurposeProcessor
9
9
 
10
10
 
11
- def process_domain(domain_name, sources, df_long, default_keys, output_dir):
11
+ def process_domain(domain_name, sources, df_long, default_keys, output_dir, custom_to_standard=None):
12
12
  # Normalize to list
13
13
  if isinstance(sources, dict):
14
14
  sources = [sources]
@@ -31,7 +31,7 @@ def process_domain(domain_name, sources, df_long, default_keys, output_dir):
31
31
  else:
32
32
  processor = GeneralProcessor()
33
33
 
34
- domain_dfs = processor.process(domain_name, sources, df_long, default_keys)
34
+ domain_dfs = processor.process(domain_name, sources, df_long, default_keys, custom_to_standard=custom_to_standard)
35
35
 
36
36
  if not domain_dfs:
37
37
  print(f"Warning: No data found for domain {domain_name}")
@@ -0,0 +1,135 @@
1
+ import os
2
+ import tempfile
3
+ import pandas as pd
4
+ import pytest
5
+ import yaml
6
+ from cdiscbuilder.sdtm.odm_parser import parse_odm_to_long_df
7
+ from cdiscbuilder.sdtm.sdtm import create_sdtm_datasets
8
+
9
+
10
+ def test_parse_odm_to_long_df_boundary_standardization():
11
+ # Simple XML string with Medidata style attributes
12
+ xml_content = """<?xml version="1.0" encoding="UTF-8"?>
13
+ <ODM>
14
+ <ClinicalData StudyOID="STUDY_XYZ">
15
+ <SubjectData SubjectKey="SUBJ123">
16
+ <StudyEventData StudyEventOID="SE_VISIT" StartDate="2026-05-29">
17
+ <FormData FormOID="F_AE">
18
+ <ItemGroupData ItemGroupOID="IG_AE" RecordPosition="2">
19
+ <ItemData ItemOID="AE_TERM" Value="Headache"/>
20
+ </ItemGroupData>
21
+ </FormData>
22
+ </StudyEventData>
23
+ </SubjectData>
24
+ </ClinicalData>
25
+ </ODM>"""
26
+
27
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as f:
28
+ f.write(xml_content)
29
+ temp_xml_path = f.name
30
+
31
+ try:
32
+ # Test parsing with custom xml_mapping.
33
+ # Should map the RecordPosition attribute to standard ItemGroupRepeatKey column in df
34
+ xml_mapping = {"item_group_repeat_key": "RecordPosition"}
35
+
36
+ df = parse_odm_to_long_df(temp_xml_path, xml_mapping=xml_mapping)
37
+
38
+ # Verify output columns are standard (no RecordPosition column is outputted;
39
+ # it is standardized to ItemGroupRepeatKey)
40
+ assert "ItemGroupRepeatKey" in df.columns
41
+ assert "RecordPosition" not in df.columns
42
+ assert df.iloc[0]["ItemGroupRepeatKey"] == "2"
43
+ finally:
44
+ try:
45
+ os.remove(temp_xml_path)
46
+ except Exception:
47
+ pass
48
+
49
+
50
+ def test_create_sdtm_datasets_boundary_standardization(tmp_path):
51
+ # df_long with custom columns: RecordPosition, SubjectID, and no FormOID
52
+ df_long = pd.DataFrame(
53
+ [
54
+ {
55
+ "StudyOID": "STUDY01",
56
+ "SubjectID": "001",
57
+ "RecordPosition": 1,
58
+ "ItemOID": "AETERM",
59
+ "Value": "Headache",
60
+ },
61
+ {
62
+ "StudyOID": "STUDY01",
63
+ "SubjectID": "001",
64
+ "RecordPosition": 1,
65
+ "ItemOID": "AESTDTC",
66
+ "Value": "2026-05-29",
67
+ },
68
+ {
69
+ "StudyOID": "STUDY01",
70
+ "SubjectID": "001",
71
+ "RecordPosition": 2,
72
+ "ItemOID": "AETERM",
73
+ "Value": "Nausea",
74
+ },
75
+ {
76
+ "StudyOID": "STUDY01",
77
+ "SubjectID": "001",
78
+ "RecordPosition": 2,
79
+ "ItemOID": "AESTDTC",
80
+ "Value": "2026-05-30",
81
+ },
82
+ ]
83
+ )
84
+
85
+ input_csv = tmp_path / "long.csv"
86
+ df_long.to_csv(input_csv, index=False)
87
+
88
+ # Create domain spec directory
89
+ spec_dir = tmp_path / "specs"
90
+ os.makedirs(spec_dir)
91
+
92
+ # Create defaults.yaml
93
+ defaults = {
94
+ "keys": ["StudyOID", "SubjectID", "RecordPosition"],
95
+ "csv_columns": {
96
+ "item_group_repeat_key": "RecordPosition",
97
+ "subject_key": "SubjectID",
98
+ },
99
+ }
100
+ with open(spec_dir / "defaults.yaml", "w") as f:
101
+ yaml.dump(defaults, f)
102
+
103
+ # Create AE.yaml
104
+ ae_config = {
105
+ "AE": [
106
+ {
107
+ # Note: no formoid specified! So we test proceeding without FormOID filtering.
108
+ "type": "events",
109
+ "columns": {
110
+ "STUDYID": {"source": "StudyOID"},
111
+ "USUBJID": {"source": "SubjectKey"},
112
+ "AETERM": {"source": "AETERM"},
113
+ "AESTDTC": {"source": "AESTDTC"},
114
+ "AESEQ": {"group": ["USUBJID"], "sort_by": ["AESTDTC"]},
115
+ },
116
+ }
117
+ ]
118
+ }
119
+ with open(spec_dir / "AE.yaml", "w") as f:
120
+ yaml.dump(ae_config, f)
121
+
122
+ output_dir = tmp_path / "sdtm_out"
123
+
124
+ # Generate datasets
125
+ create_sdtm_datasets(str(spec_dir), str(input_csv), str(output_dir))
126
+
127
+ # Verify dataset exists
128
+ out_file = output_dir / "AE.parquet"
129
+ assert out_file.exists()
130
+
131
+ res_df = pd.read_parquet(out_file)
132
+ assert len(res_df) == 2
133
+ assert "AETERM" in res_df.columns
134
+ assert list(res_df["AETERM"]) == ["Headache", "Nausea"]
135
+ assert list(res_df["AESEQ"]) == [1, 2]
@@ -2,7 +2,26 @@ import xml.etree.ElementTree as ET
2
2
  import pandas as pd
3
3
 
4
4
 
5
- def parse_odm_to_long_df(xml_file):
5
+ DEFAULT_XML_MAPPING = {
6
+ "study_oid": "StudyOID",
7
+ "subject_key": "SubjectKey",
8
+ "study_subject_id": "StudySubjectID",
9
+ "study_event_oid": "StudyEventOID",
10
+ "study_event_repeat_key": "StudyEventRepeatKey",
11
+ "study_event_start_date": "StartDate",
12
+ "form_oid": "FormOID",
13
+ "item_group_oid": "ItemGroupOID",
14
+ "item_group_repeat_key": "ItemGroupRepeatKey",
15
+ "item_oid": "ItemOID",
16
+ "value": "Value",
17
+ }
18
+
19
+
20
+ def parse_odm_to_long_df(xml_file, xml_mapping=None):
21
+ merged_xml_mapping = DEFAULT_XML_MAPPING.copy()
22
+ if xml_mapping:
23
+ merged_xml_mapping.update(xml_mapping)
24
+
6
25
  try:
7
26
  tree = ET.parse(xml_file)
8
27
  root = tree.getroot()
@@ -42,13 +61,15 @@ def parse_odm_to_long_df(xml_file):
42
61
 
43
62
  for cd in root:
44
63
  if get_local_name(cd.tag) == "ClinicalData":
45
- study_oid = cd.get("StudyOID")
64
+ study_oid = cd.get(merged_xml_mapping["study_oid"])
46
65
  for sd in cd:
47
66
  if get_local_name(sd.tag) == "SubjectData":
48
- subject_key = sd.get("SubjectKey")
67
+ subject_key = sd.get(merged_xml_mapping["subject_key"])
49
68
 
50
69
  # Helper for attributes
51
70
  def get_attrib(elem, partial_name):
71
+ if not partial_name:
72
+ return None
52
73
  if partial_name in elem.attrib:
53
74
  return elem.attrib[partial_name]
54
75
  for k, v in elem.attrib.items():
@@ -56,41 +77,46 @@ def parse_odm_to_long_df(xml_file):
56
77
  return v
57
78
  return None
58
79
 
59
- study_subject_id = get_attrib(sd, "StudySubjectID") or get_attrib(
60
- sd, "studysubjectid"
61
- )
80
+ study_subject_id_attr = merged_xml_mapping["study_subject_id"]
81
+ study_subject_id = get_attrib(sd, study_subject_id_attr)
82
+ if not study_subject_id and study_subject_id_attr:
83
+ # try case-insensitive or fallback
84
+ study_subject_id = get_attrib(sd, study_subject_id_attr.lower())
85
+ if not study_subject_id:
86
+ study_subject_id = get_attrib(sd, "studysubjectid")
87
+
62
88
  if not subject_key:
63
89
  subject_key = study_subject_id
64
90
 
65
91
  for child in sd:
66
92
  tag = get_local_name(child.tag)
67
93
  if tag == "StudyEventData":
68
- study_event_oid = child.get("StudyEventOID")
94
+ study_event_oid = child.get(merged_xml_mapping["study_event_oid"])
69
95
  study_event_repeat_key = (
70
- child.get("StudyEventRepeatKey") or "1"
96
+ child.get(merged_xml_mapping["study_event_repeat_key"]) or "1"
71
97
  )
72
98
 
73
99
  # Extract Namespaced StartDate
74
- start_date = get_attrib(child, "StartDate") or ""
100
+ start_date = get_attrib(child, merged_xml_mapping["study_event_start_date"]) or ""
75
101
 
76
102
  for form in child:
77
103
  f_tag = get_local_name(form.tag)
78
104
  if f_tag == "FormData":
79
- form_oid = form.get("FormOID")
105
+ form_oid = form.get(merged_xml_mapping["form_oid"])
80
106
 
81
107
  for ig in form:
82
108
  ig_tag = get_local_name(ig.tag)
83
109
  if ig_tag == "ItemGroupData":
84
- item_group_oid = ig.get("ItemGroupOID")
110
+ item_group_oid = ig.get(merged_xml_mapping["item_group_oid"])
85
111
  item_group_repeat_key = ig.get(
86
- "ItemGroupRepeatKey"
112
+ merged_xml_mapping["item_group_repeat_key"]
87
113
  )
88
114
 
89
115
  for item in ig:
90
116
  i_tag = get_local_name(item.tag)
91
117
  if i_tag == "ItemData":
92
- item_oid = item.get("ItemOID")
93
- value = item.get("Value")
118
+ item_oid = item.get(merged_xml_mapping["item_oid"])
119
+ value = item.get(merged_xml_mapping["value"])
94
120
 
95
121
  meta = item_metadata.get(
96
122
  item_oid, {}
@@ -0,0 +1,71 @@
1
+ import pandas as pd
2
+ from .engine.config import load_config
3
+ from .engine.processor import process_domain
4
+
5
+
6
+ def create_sdtm_datasets(config_input, input_csv, output_dir):
7
+ if isinstance(config_input, dict):
8
+ config = config_input
9
+ # We assume it's already structured correctly or validated
10
+ else:
11
+ config = load_config(config_input)
12
+
13
+ # Get global defaults
14
+ defaults = config.get("defaults", {})
15
+ default_keys = defaults.get(
16
+ "keys", ["StudyOID", "SubjectKey", "ItemGroupRepeatKey", "StudyEventOID"]
17
+ )
18
+
19
+ print(f"Loading data from {input_csv}...")
20
+ df_long = pd.read_csv(input_csv)
21
+
22
+ # Invert mapping to go from custom CSV column name -> standard logical column name
23
+ # e.g., "RecordPosition" -> "ItemGroupRepeatKey"
24
+ csv_columns = defaults.get("csv_columns") or {}
25
+ STANDARD_LOGICAL_COLUMNS = {
26
+ "study_oid": "StudyOID",
27
+ "subject_key": "SubjectKey",
28
+ "study_subject_id": "StudySubjectID",
29
+ "study_event_oid": "StudyEventOID",
30
+ "study_event_repeat_key": "StudyEventRepeatKey",
31
+ "study_event_start_date": "StudyEventStartDate",
32
+ "form_oid": "FormOID",
33
+ "item_group_oid": "ItemGroupOID",
34
+ "item_group_repeat_key": "ItemGroupRepeatKey",
35
+ "item_oid": "ItemOID",
36
+ "value": "Value",
37
+ "question": "Question",
38
+ "item_name": "ItemName",
39
+ }
40
+
41
+ custom_to_standard = {}
42
+ rename_map = {}
43
+ for logical_key, custom_col in csv_columns.items():
44
+ if logical_key in STANDARD_LOGICAL_COLUMNS:
45
+ standard_col = STANDARD_LOGICAL_COLUMNS[logical_key]
46
+ rename_map[custom_col] = standard_col
47
+ custom_to_standard[custom_col] = standard_col
48
+
49
+ # Perform rename if map is not empty
50
+ if rename_map:
51
+ df_long.rename(columns=rename_map, inplace=True)
52
+ # Translate default_keys to match the standardized DataFrame
53
+ default_keys = [custom_to_standard.get(k, k) for k in default_keys]
54
+
55
+ # Prioritize DM domain processing
56
+ domains = list(config["domains"].keys())
57
+ if "DM" in domains:
58
+ domains.remove("DM")
59
+ domains.insert(0, "DM")
60
+
61
+ for domain in domains:
62
+ settings_entry = config["domains"][domain]
63
+ print(f"Processing domain: {domain}")
64
+
65
+ # Normalize to list.
66
+ if isinstance(settings_entry, list):
67
+ sources = settings_entry
68
+ else:
69
+ sources = [settings_entry]
70
+
71
+ process_domain(domain, sources, df_long, default_keys, output_dir, custom_to_standard=custom_to_standard)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cdiscbuilder
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: A package to convert ODM XML to SDTM/ADaM Datasets
5
5
  Author-email: Ming-Chun Chen <hellomingchun@gmail.com>
6
6
  Requires-Python: >=3.9
@@ -59,6 +59,7 @@ src/cdiscbuilder/sdtm/engine/classes/findings.py
59
59
  src/cdiscbuilder/sdtm/engine/classes/general.py
60
60
  src/cdiscbuilder/sdtm/engine/classes/interventions.py
61
61
  src/cdiscbuilder/sdtm/engine/classes/special_purpose.py
62
+ src/cdiscbuilder/sdtm/engine/tests/test_boundary_standardization.py
62
63
  src/cdiscbuilder/sdtm/engine/tests/test_config.py
63
64
  src/cdiscbuilder/sdtm/engine/tests/test_findings.py
64
65
  src/cdiscbuilder/sdtm/engine/tests/test_general.py
@@ -1,37 +0,0 @@
1
- import pandas as pd
2
- from .engine.config import load_config
3
- from .engine.processor import process_domain
4
-
5
-
6
- def create_sdtm_datasets(config_input, input_csv, output_dir):
7
- if isinstance(config_input, dict):
8
- config = config_input
9
- # We assume it's already structured correctly or validated
10
- else:
11
- config = load_config(config_input)
12
-
13
- # Get global defaults
14
- default_keys = config.get("defaults", {}).get(
15
- "keys", ["StudyOID", "SubjectKey", "ItemGroupRepeatKey", "StudyEventOID"]
16
- )
17
-
18
- print(f"Loading data from {input_csv}...")
19
- df_long = pd.read_csv(input_csv)
20
-
21
- # Prioritize DM domain processing
22
- domains = list(config["domains"].keys())
23
- if "DM" in domains:
24
- domains.remove("DM")
25
- domains.insert(0, "DM")
26
-
27
- for domain in domains:
28
- settings_entry = config["domains"][domain]
29
- print(f"Processing domain: {domain}")
30
-
31
- # Normalize to list.
32
- if isinstance(settings_entry, list):
33
- sources = settings_entry
34
- else:
35
- sources = [settings_entry]
36
-
37
- process_domain(domain, sources, df_long, default_keys, output_dir)
File without changes
File without changes
File without changes
File without changes