cdiscbuilder 1.2.3__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cdiscbuilder-1.2.3/src/cdiscbuilder.egg-info → cdiscbuilder-1.3.1}/PKG-INFO +1 -1
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/pyproject.toml +2 -2
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/__init__.py +1 -1
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/functions/__init__.py +14 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/functions/calculate_study_day.py +33 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/functions/coalesce.py +22 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/functions/get_dose_dates.py +86 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/functions/get_earliest_informed_consent_date.py +93 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/functions/get_last_participation_date.py +55 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/classes/base.py +4 -1
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/classes/findings.py +1 -1
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/classes/general.py +106 -19
- cdiscbuilder-1.3.1/src/cdiscbuilder/sdtm/engine/processor.py +285 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/tests/test_general.py +47 -1
- cdiscbuilder-1.3.1/src/cdiscbuilder/sdtm/engine/tests/test_iso8601.py +39 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/sdtm/engine/tests/test_processor.py +615 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/sdtm/engine/utils/iso8601.py +97 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/sdtm/sdtm.py +153 -0
- cdiscbuilder-1.3.1/src/cdiscbuilder/sdtm/specs/schema.yaml +92 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1/src/cdiscbuilder.egg-info}/PKG-INFO +1 -1
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder.egg-info/SOURCES.txt +7 -0
- cdiscbuilder-1.2.3/src/cdiscbuilder/functions/calculate_study_day.py +0 -23
- cdiscbuilder-1.2.3/src/cdiscbuilder/sdtm/engine/processor.py +0 -144
- cdiscbuilder-1.2.3/src/cdiscbuilder/sdtm/engine/tests/test_processor.py +0 -179
- cdiscbuilder-1.2.3/src/cdiscbuilder/sdtm/sdtm.py +0 -71
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/LICENSE +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/MANIFEST.in +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/README.md +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/setup.cfg +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/derivations/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/derivations/base.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/derivations/function_derivation.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/derivations/sql_derivation.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/engine.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/functions/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/functions/get_bmi.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/loaders/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/loaders/sdtm_loader.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/tests/test_engine.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/utils/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_derivation/utils/logger.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/adam_spec.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/merge_yaml.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/schema_validator.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/organization/adsl_common.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/project/adsl_project.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/schema.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/adam_study1.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/adsl_study1.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study1/final_adsl_study1.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/adam_study2.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/adsl_study2.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/data/scenarios/study2/final_adsl_study2.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/test_adam_spec.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/test_merge_yaml.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_spec/tests/test_schema_validator.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_validation/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/adam_validation/data_validator.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/adam/schema.yaml +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/cli.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/functions/extract_value.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/functions/get_bmi.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/classes/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/classes/events.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/classes/interventions.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/classes/special_purpose.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/config.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/tests/test_boundary_standardization.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/tests/test_config.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/tests/test_findings.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/tests/test_metadata_extractor.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/tests/test_validate.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/engine/validate.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/loader/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/loader/load.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/loader/tests/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/loader/tests/test_load.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/sdtm/odm_parser.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder/tlf/__init__.py +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder.egg-info/dependency_links.txt +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder.egg-info/entry_points.txt +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder.egg-info/requires.txt +0 -0
- {cdiscbuilder-1.2.3 → cdiscbuilder-1.3.1}/src/cdiscbuilder.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "cdiscbuilder"
|
|
7
|
-
version = "1.2.3"
|
|
7
|
+
version = "1.3.1"
|
|
8
8
|
description = "A package to convert ODM XML to SDTM/ADaM Datasets"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{name = "Ming-Chun Chen", email = "hellomingchun@gmail.com"}]
|
|
@@ -20,7 +20,7 @@ requires-python = ">=3.9"
|
|
|
20
20
|
cdisc-sdtm = "cdiscbuilder.cli:main"
|
|
21
21
|
|
|
22
22
|
[tool.setuptools.package-data]
|
|
23
|
-
cdiscbuilder = ["adam/schema.yaml", "adam/adam_spec/tests/data/scenarios/**/*.yaml"]
|
|
23
|
+
cdiscbuilder = ["adam/schema.yaml", "adam/adam_spec/tests/data/scenarios/**/*.yaml", "sdtm/specs/schema.yaml"]
|
|
24
24
|
|
|
25
25
|
[tool.setuptools.packages.find]
|
|
26
26
|
where = ["src"]
|
|
@@ -6,6 +6,10 @@ Maps short function names to full module paths for cleaner specifications.
|
|
|
6
6
|
from .get_bmi import get_bmi
|
|
7
7
|
from .calculate_study_day import calculate_study_day
|
|
8
8
|
from .extract_value import extract_value
|
|
9
|
+
from .get_dose_dates import get_first_dose_date, get_last_dose_date
|
|
10
|
+
from .coalesce import coalesce
|
|
11
|
+
from .get_last_participation_date import get_last_participation_date
|
|
12
|
+
from .get_earliest_informed_consent_date import get_earliest_informed_consent_date
|
|
9
13
|
|
|
10
14
|
# Function registry mapping short names to full paths
|
|
11
15
|
FUNCTION_REGISTRY = {
|
|
@@ -15,6 +19,11 @@ FUNCTION_REGISTRY = {
|
|
|
15
19
|
# SDTM functions
|
|
16
20
|
"calculate_study_day": "cdiscbuilder.functions.calculate_study_day.calculate_study_day",
|
|
17
21
|
"extract_value": "cdiscbuilder.functions.extract_value.extract_value",
|
|
22
|
+
"get_first_dose_date": "cdiscbuilder.functions.get_dose_dates.get_first_dose_date",
|
|
23
|
+
"get_last_dose_date": "cdiscbuilder.functions.get_dose_dates.get_last_dose_date",
|
|
24
|
+
"coalesce": "cdiscbuilder.functions.coalesce.coalesce",
|
|
25
|
+
"get_last_participation_date": "cdiscbuilder.functions.get_last_participation_date.get_last_participation_date",
|
|
26
|
+
"get_earliest_informed_consent_date": "cdiscbuilder.functions.get_earliest_informed_consent_date.get_earliest_informed_consent_date",
|
|
18
27
|
}
|
|
19
28
|
|
|
20
29
|
|
|
@@ -61,6 +70,11 @@ __all__ = [
|
|
|
61
70
|
"get_bmi",
|
|
62
71
|
"calculate_study_day",
|
|
63
72
|
"extract_value",
|
|
73
|
+
"get_first_dose_date",
|
|
74
|
+
"get_last_dose_date",
|
|
75
|
+
"coalesce",
|
|
76
|
+
"get_last_participation_date",
|
|
77
|
+
"get_earliest_informed_consent_date",
|
|
64
78
|
"get_function_path",
|
|
65
79
|
"list_available_functions",
|
|
66
80
|
"register_function",
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def calculate_study_day(date_series, rfstdtc_series):
    """
    Calculate the SDTM Study Day (--DY) of each date relative to RFSTDTC.

    SDTM rule:
    - If date is on or after RFSTDTC: (date - RFSTDTC) + 1
    - If date is before RFSTDTC: (date - RFSTDTC)
    - There is no Day 0.
    - Partial dates (missing day or month) cannot be used to calculate study day.

    Only the calendar-date portion (the leading ``YYYY-MM-DD``) of each value
    is used. Any time of day or UTC offset is ignored, so a value such as
    ``2026-05-01T23:30-05:00`` counts as 2026-05-01 rather than being shifted
    onto the following UTC day.

    Args:
        date_series: Series of ISO 8601 date/datetime values (--DTC).
        rfstdtc_series: Series of ISO 8601 reference start dates, sharing the
            index of ``date_series``.

    Returns:
        Series of nullable ``Int64`` study days. The value is ``<NA>`` where
        either input is missing, partial, or not a valid calendar date.
    """

    def _calendar_date(series):
        # A complete ISO 8601 date needs at least the 10 characters of
        # YYYY-MM-DD; shorter values ('YYYY', 'YYYY-MM', 'None', 'nan') are
        # partial or missing and must not be padded to the 1st of the month.
        text = series.astype(str)
        date_part = text.str.slice(0, 10).where(text.str.len() >= 10)
        # An explicit format keeps parsing independent of the order and mix of
        # date-only and date-time strings in the column.
        return pd.to_datetime(date_part, format="%Y-%m-%d", errors="coerce")

    diff = (_calendar_date(date_series) - _calendar_date(rfstdtc_series)).dt.days

    # No Day 0: non-negative offsets are shifted up by one. Missing offsets
    # compare as False and therefore stay missing.
    dy = diff.mask(diff >= 0, diff + 1)

    # Nullable integer so missing study days survive as <NA>, not floats.
    return dy.astype("Int64")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def coalesce(*series_list, **kwargs):
    """
    SQL-style COALESCE over Pandas Series.

    Walks the arguments left to right and, for every position, keeps the
    first value that is not null. Arguments after the first that are not a
    Series are broadcast to a Series on the running result's index.

    Raises:
        ValueError: if called without any positional argument.
    """
    if len(series_list) == 0:
        raise ValueError("coalesce requires at least one argument")

    head, *fallbacks = series_list

    # Work on a copy so the caller's first Series is never modified
    merged = head.copy()

    for candidate in fallbacks:
        if isinstance(candidate, pd.Series):
            filler = candidate
        else:
            # A literal slipped through: repeat it for every row of the result
            filler = pd.Series([candidate] * len(merged), index=merged.index)

        # Gaps in the running result are filled from the next candidate
        merged = merged.combine_first(filler)

    return merged
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def _get_dose_date(usubjid_series, built_domains, mode="first", **kwargs):
    """
    Core logic to find the first or last dose date from EX or EC domains
    where the dose > 0.

    Args:
        usubjid_series: Series of USUBJID values to look dates up for.
        built_domains: dict mapping domain names to already built DataFrames.
        mode: "first" uses the --STDTC columns and the earliest date per
            subject; any other value uses --ENDTC and the latest date.

    kwargs:
        ex_dose_col: dose column in EX (default: 'EXDOSE')
        ec_dose_col: dose column in EC (default: 'ECDOSE')

    Returns:
        Series on the index of ``usubjid_series`` holding the original date
        string of the selected record, or a missing value when the subject
        has no usable dose record.
    """

    def _no_dates():
        return pd.Series([None] * len(usubjid_series), index=usubjid_series.index)

    if not built_domains:
        return _no_dates()

    suffix = "STDTC" if mode == "first" else "ENDTC"
    frames = []

    for domain in ("EX", "EC"):
        df = built_domains.get(domain)
        if df is None or df.empty or "USUBJID" not in df.columns:
            continue

        date_col = f"{domain}{suffix}"
        if date_col not in df.columns:
            continue

        # The dose filter only applies when the dose column exists; without
        # it every record of the domain is considered.
        dose_col = kwargs.get(f"{domain.lower()}_dose_col", f"{domain}DOSE")
        dosed = df
        if dose_col in df.columns:
            dose = pd.to_numeric(df[dose_col], errors="coerce")
            dosed = df[dose > 0]

        frames.append(dosed[["USUBJID", date_col]].rename(columns={date_col: "DATE"}))

    if not frames:
        return _no_dates()

    combined = pd.concat(frames, ignore_index=True).dropna(subset=["DATE"])
    if combined.empty:
        return _no_dates()

    # Parse to real datetimes so "2026-05-01T08:00" and "2026-05-01" compare
    # chronologically. format="ISO8601" is required for that: without it
    # pandas 2.x infers one format from the first value and coerces every
    # value written at a different precision to NaT, making the result
    # depend on row order.
    combined["DATETIME"] = pd.to_datetime(
        combined["DATE"], format="ISO8601", errors="coerce", utc=True
    )

    # Drop rows that are not ISO 8601 dates at all
    combined = combined.dropna(subset=["DATETIME"])
    if combined.empty:
        return _no_dates()

    per_subject = combined.groupby("USUBJID")["DATETIME"]
    idx = per_subject.idxmin() if mode == "first" else per_subject.idxmax()

    # Return the original string (keeping any 'T' time part), not the parsed value
    res = combined.loc[idx].set_index("USUBJID")["DATE"]

    # Map back to the exact input series sequence
    return usubjid_series.map(res)


def get_first_dose_date(usubjid_series, built_domains=None, **kwargs):
    """
    Calculates RFXSTDTC (First Study Treatment Date).
    Extracts minimum start date from EX or EC domains where dose > 0.
    """
    return _get_dose_date(usubjid_series, built_domains, mode="first", **kwargs)


def get_last_dose_date(usubjid_series, built_domains=None, **kwargs):
    """
    Calculates RFXENDTC (Last Study Treatment Date).
    Extracts maximum end date from EX or EC domains where dose > 0.
    """
    return _get_dose_date(usubjid_series, built_domains, mode="last", **kwargs)
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def get_earliest_informed_consent_date(usubjid_series, built_domains=None, df_long=None, **kwargs):
    """
    Calculates RFICDTC (Date/Time of Informed Consent).
    Can scan the built DS domain OR raw df_long (to avoid circular dependencies).

    Args:
        usubjid_series: Series of USUBJID values to look dates up for.
        built_domains: dict mapping domain names to built DataFrames; the
            'DS' entry is used when raw_mode is False.
        df_long: raw long-format data with the columns SubjectKey, ItemOID
            and Value (plus FormOID when raw_formoid is given); used when
            raw_mode is True.

    kwargs:
        raw_mode: bool. If True, searches df_long instead of built_domains (default: False)
        raw_formoid: If raw_mode=True, the FormOID to filter on (optional)
        term_col: The column (or ItemOID) to check for consent terms
            (default: 'DSDECOD', or 'DSTERM' in raw mode)
        consent_terms: Term or list of terms indicating consent, compared
            case-insensitively (default: ['INFORMED CONSENT OBTAINED'])
        date_col: The date column (or ItemOID) to extract
            (default: 'DSSTDTC', or 'DSSTDAT' in raw mode)

    Returns:
        Series on the index of ``usubjid_series`` holding the original date
        string of the earliest consent record, or a missing value when none
        is found.
    """

    def _no_dates():
        return pd.Series([None] * len(usubjid_series), index=usubjid_series.index)

    raw_mode = kwargs.get("raw_mode", False)
    consent_terms = kwargs.get("consent_terms", ["INFORMED CONSENT OBTAINED"])
    if isinstance(consent_terms, str):
        # A single term given as a plain string must not be split into characters
        consent_terms = [consent_terms]
    upper_terms = [str(t).upper() for t in consent_terms]

    if raw_mode:
        if df_long is None or df_long.empty:
            return _no_dates()

        term_col = kwargs.get("term_col", "DSTERM")
        date_col = kwargs.get("date_col", "DSSTDAT")
        formoid = kwargs.get("raw_formoid")

        subset = df_long[df_long["FormOID"] == formoid] if formoid else df_long

        # Subjects that have at least one consent term recorded
        is_consent = (subset["ItemOID"] == term_col) & (
            subset["Value"].astype(str).str.upper().isin(upper_terms)
        )
        consent_subjects = subset.loc[is_consent, "SubjectKey"].unique()

        # Term and date are paired per subject, not per ItemGroupRepeatKey:
        # every date_col value of a consenting subject is a candidate and
        # the earliest one wins.
        is_date = (subset["ItemOID"] == date_col) & (
            subset["SubjectKey"].isin(consent_subjects)
        )
        # A fresh unique index keeps the idxmin -> loc lookup below one row per subject
        valid = subset[is_date].reset_index(drop=True)
        if valid.empty:
            return _no_dates()

        # Raw values are not guaranteed to be ISO 8601, so each value is
        # parsed on its own instead of against the first value's format.
        valid["DATETIME"] = pd.to_datetime(
            valid["Value"], format="mixed", errors="coerce", utc=True
        )
        valid = valid.dropna(subset=["DATETIME"])
        if valid.empty:
            return _no_dates()

        idx = valid.groupby("SubjectKey")["DATETIME"].idxmin()
        res = valid.loc[idx].set_index("SubjectKey")["Value"]
        earliest_by_key = {str(key): value for key, value in res.items()}

        def _consent_date(usubjid):
            # USUBJID is assumed to be '<STUDYID>-<SubjectKey>'. The
            # SubjectKey may itself contain '-', so every '-'-delimited
            # suffix is tried, longest first, not just the last segment.
            parts = usubjid.split("-")
            for start in range(len(parts)):
                found = earliest_by_key.get("-".join(parts[start:]))
                if found is not None:
                    return found
            return None

        return usubjid_series.astype(str).map(_consent_date)

    # --- BUILT DOMAIN MODE ---
    if not built_domains:
        return _no_dates()

    ds = built_domains.get("DS")
    if ds is None or ds.empty or "USUBJID" not in ds.columns:
        return _no_dates()

    term_col = kwargs.get("term_col", "DSDECOD")
    date_col = kwargs.get("date_col", "DSSTDTC")

    if term_col not in ds.columns or date_col not in ds.columns:
        return _no_dates()

    is_consent = ds[term_col].astype(str).str.upper().isin(upper_terms)
    valid = ds[is_consent].reset_index(drop=True)
    if valid.empty:
        return _no_dates()

    # format="ISO8601" accepts date-only and date-time values in the same
    # column; with an inferred format pandas 2.x coerces values written at a
    # different precision than the first one to NaT.
    valid["DATETIME"] = pd.to_datetime(
        valid[date_col], format="ISO8601", errors="coerce", utc=True
    )
    valid = valid.dropna(subset=["DATETIME"])
    if valid.empty:
        return _no_dates()

    idx = valid.groupby("USUBJID")["DATETIME"].idxmin()
    res = valid.loc[idx].set_index("USUBJID")[date_col]

    return usubjid_series.map(res)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def get_last_participation_date(usubjid_series, built_domains=None, **kwargs):
    """
    Calculates RFENDTC (Reference End Date).
    Finds the absolute maximum date across specified domains and date columns
    for each subject.

    Args:
        usubjid_series: Series of USUBJID values to look dates up for.
        built_domains: dict mapping domain names to already built DataFrames.

    kwargs:
        domain_dates: dictionary mapping domain names to a date column or a
                      list of date columns to check.
                      Defaults to scanning DS, EX, EC, AE, and SV.

    Returns:
        Series on the index of ``usubjid_series`` holding the original date
        string of the latest record, or a missing value when the subject has
        no usable date.
    """

    def _no_dates():
        return pd.Series([None] * len(usubjid_series), index=usubjid_series.index)

    if not built_domains:
        return _no_dates()

    # Default domains and columns to scan if not explicitly provided
    domain_dates = kwargs.get("domain_dates")
    if domain_dates is None:
        domain_dates = {
            "DS": ["DSSTDTC"],
            "EX": ["EXSTDTC", "EXENDTC"],
            "EC": ["ECSTDTC", "ECENDTC"],
            "AE": ["AESTDTC", "AEENDTC"],
            "SV": ["SVSTDTC", "SVENDTC"],
        }

    frames = []
    for domain, cols in domain_dates.items():
        df = built_domains.get(domain)
        if df is None or df.empty or "USUBJID" not in df.columns:
            continue
        if isinstance(cols, str):
            # A single column name must not be iterated character by character
            cols = [cols]
        # Stack every requested date column into one long USUBJID/DATE frame
        # so the maximum can be taken across all of them at once.
        frames.extend(
            df[["USUBJID", col]].dropna(subset=[col]).rename(columns={col: "DATE"})
            for col in cols
            if col in df.columns
        )

    if not frames:
        return _no_dates()

    combined = pd.concat(frames, ignore_index=True)

    # Parse to real datetimes for a chronological maximum. format="ISO8601"
    # lets date-only and date-time values coexist; with an inferred format
    # pandas 2.x coerces values written at a different precision than the
    # first one to NaT, so the result would depend on row order.
    combined["DATETIME"] = pd.to_datetime(
        combined["DATE"], format="ISO8601", errors="coerce", utc=True
    )
    combined = combined.dropna(subset=["DATETIME"])
    if combined.empty:
        return _no_dates()

    # Find the index of the max datetime per subject
    idx = combined.groupby("USUBJID")["DATETIME"].idxmax()

    # Extract the exact string (preserving any 'T' time component)
    res = combined.loc[idx].set_index("USUBJID")["DATE"]

    # Map back to the input series sequence
    return usubjid_series.map(res)
|
|
@@ -7,7 +7,7 @@ class BaseProcessor(ABC):
|
|
|
7
7
|
self.class_name = "GENERAL"
|
|
8
8
|
|
|
9
9
|
@abstractmethod
|
|
10
|
-
def process(self, domain_name, sources, df_long, default_keys):
|
|
10
|
+
def process(self, domain_name, sources, df_long, default_keys, built_domains=None):
|
|
11
11
|
"""Main entry point for processing a domain."""
|
|
12
12
|
pass
|
|
13
13
|
|
|
@@ -55,6 +55,9 @@ class BaseProcessor(ABC):
|
|
|
55
55
|
series = pd.to_datetime(
|
|
56
56
|
series, errors="coerce", format="mixed"
|
|
57
57
|
).dt.strftime("%Y-%m-%d")
|
|
58
|
+
elif target_type == "iso8601":
|
|
59
|
+
from cdiscbuilder.sdtm.engine.utils.iso8601 import parse_iso8601
|
|
60
|
+
series = series.apply(parse_iso8601)
|
|
58
61
|
elif target_type == "str":
|
|
59
62
|
series = series.astype(str).replace("nan", None)
|
|
60
63
|
except Exception as e:
|
|
@@ -10,7 +10,7 @@ class FindingsProcessor:
|
|
|
10
10
|
def __init__(self):
|
|
11
11
|
self.class_name = "FINDINGS"
|
|
12
12
|
|
|
13
|
-
def process(self, domain_name, sources, df_long, default_keys, custom_to_standard=None):
|
|
13
|
+
def process(self, domain_name, sources, df_long, default_keys, custom_to_standard=None, built_domains=None):
|
|
14
14
|
domain_dfs = []
|
|
15
15
|
|
|
16
16
|
for settings in sources:
|
|
@@ -59,7 +59,54 @@ class GeneralProcessor:
|
|
|
59
59
|
|
|
60
60
|
return expanded_list
|
|
61
61
|
|
|
62
|
-
def
|
|
62
|
+
def _resolve_cross_domain(self, source_expr, col_config, final_df, pivoted, built_domains):
    """
    Look up a '<DOMAIN>.<COLUMN>' reference (e.g., 'DM.RFSTDTC') in built_domains.

    Returns a (series, resolved) pair. resolved is False, with series None,
    when source_expr is not a cross-domain reference at all. It is True once
    the expression has been recognised as one, in which case series is either
    the looked-up values on final_df's index or an all-None Series of
    len(pivoted) when the lookup could not be done.
    """
    # Anything that is not a dotted string is not a cross-domain reference
    if not isinstance(source_expr, str) or "." not in source_expr:
        return None, False

    ref_domain, _, ref_col = source_expr.partition(".")

    # The part before the dot must look like a domain code: upper case, 2-4 chars
    if not ref_domain.isupper() or len(ref_domain) < 2 or len(ref_domain) > 4:
        return None, False

    if not built_domains or ref_domain not in built_domains:
        print(f"Warning: Referenced domain '{ref_domain}' not available for cross-domain ref '{source_expr}'")
        return pd.Series([None] * len(pivoted)), True

    ref_df = built_domains[ref_domain]
    if ref_col not in ref_df.columns:
        print(f"Warning: Column '{ref_col}' not found in domain '{ref_domain}' for cross-domain ref '{source_expr}'")
        return pd.Series([None] * len(pivoted)), True

    # Merge key: 'merge_on' from the column config, USUBJID otherwise
    if isinstance(col_config, dict):
        merge_key = col_config.get("merge_on", ["USUBJID"])
    else:
        merge_key = ["USUBJID"]
    if isinstance(merge_key, str):
        merge_key = [merge_key]

    # Only keys present on both sides can take part in the merge
    valid_keys = []
    for key in merge_key:
        if key in final_df.columns and key in ref_df.columns:
            valid_keys.append(key)

    if not valid_keys:
        print(f"Warning: Merge keys {merge_key} missing for cross-domain ref '{source_expr}'")
        return pd.Series([None] * len(pivoted)), True

    # One reference row per key combination so the left merge keeps final_df's row count
    lookup = ref_df[valid_keys + [ref_col]].drop_duplicates(subset=valid_keys)

    joined = final_df[valid_keys].merge(lookup, on=valid_keys, how="left")
    series = joined[ref_col]
    # The merge hands back a fresh index; put final_df's index back
    series.index = final_df.index

    match_count = series.notna().sum()
    print(f" ↳ Resolved cross-domain ref: {source_expr} ({match_count} matches via {valid_keys})")

    return series, True
|
|
108
|
+
|
|
109
|
+
def process(self, domain_name, sources, df_long, default_keys, custom_to_standard=None, built_domains=None):
|
|
63
110
|
domain_dfs = []
|
|
64
111
|
|
|
65
112
|
# Pre-expand sources if they contain lists
|
|
@@ -214,35 +261,27 @@ class GeneralProcessor:
|
|
|
214
261
|
elif isinstance(col_config, dict) and col_config.get("function"):
|
|
215
262
|
func_name = col_config.get("function")
|
|
216
263
|
args = col_config.get("args", [])
|
|
264
|
+
kwargs = col_config.get("kwargs", {})
|
|
217
265
|
|
|
218
266
|
# Resolve Args
|
|
219
267
|
arg_series = []
|
|
220
268
|
for arg in args:
|
|
221
|
-
# Support cross-domain lookup
|
|
269
|
+
# Support cross-domain lookup
|
|
222
270
|
if arg in final_df.columns:
|
|
223
271
|
arg_series.append(final_df[arg])
|
|
224
272
|
elif arg in pivoted.columns:
|
|
225
273
|
arg_series.append(pivoted[arg])
|
|
226
274
|
else:
|
|
227
|
-
# Try
|
|
275
|
+
# Try cross-domain resolution
|
|
228
276
|
if isinstance(arg, str) and "." in arg:
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
# For now, let's assume RFSTDTC was merged into DM block or AE block already
|
|
235
|
-
# Or we pass it in.
|
|
236
|
-
# For the demo, let's assume RFSTDTC is in the dataset or handled as a string
|
|
237
|
-
print(
|
|
238
|
-
f"Warning: Cross-domain arg {arg} resolution not fully implemented in GeneralProcessor"
|
|
239
|
-
)
|
|
240
|
-
arg_series.append(pd.Series([None] * len(pivoted)))
|
|
277
|
+
cross_series, resolved = self._resolve_cross_domain(
|
|
278
|
+
arg, {}, final_df, pivoted, built_domains
|
|
279
|
+
)
|
|
280
|
+
if resolved and cross_series is not None:
|
|
281
|
+
arg_series.append(cross_series)
|
|
241
282
|
else:
|
|
242
283
|
arg_series.append(pd.Series([None] * len(pivoted)))
|
|
243
284
|
else:
|
|
244
|
-
# Not found locally, and not a string with a dot. Treat as a literal or unresolved?
|
|
245
|
-
# Wait, ADaM's args might be string constants. Let's just append None for now as before.
|
|
246
285
|
arg_series.append(pd.Series([None] * len(pivoted)))
|
|
247
286
|
|
|
248
287
|
import importlib
|
|
@@ -272,8 +311,18 @@ class GeneralProcessor:
|
|
|
272
311
|
raise ImportError(f"Function {fname} not found")
|
|
273
312
|
|
|
274
313
|
try:
|
|
314
|
+
import inspect
|
|
275
315
|
func = _load_function(func_name)
|
|
276
|
-
|
|
316
|
+
sig = inspect.signature(func)
|
|
317
|
+
|
|
318
|
+
func_kwargs = kwargs.copy()
|
|
319
|
+
if "built_domains" in sig.parameters:
|
|
320
|
+
func_kwargs["built_domains"] = built_domains
|
|
321
|
+
if "df_long" in sig.parameters:
|
|
322
|
+
func_kwargs["df_long"] = df_long
|
|
323
|
+
|
|
324
|
+
series = func(*arg_series, **func_kwargs)
|
|
325
|
+
|
|
277
326
|
if not isinstance(series, pd.Series):
|
|
278
327
|
series = pd.Series(series)
|
|
279
328
|
except Exception as e:
|
|
@@ -282,11 +331,49 @@ class GeneralProcessor:
|
|
|
282
331
|
)
|
|
283
332
|
series = pd.Series([None] * len(pivoted))
|
|
284
333
|
|
|
334
|
+
elif isinstance(col_config, dict) and col_config.get("conditions"):
|
|
335
|
+
import numpy as np
|
|
336
|
+
conditions_config = col_config.get("conditions")
|
|
337
|
+
|
|
338
|
+
# Create an evaluation context combining raw domain data and current final_df
|
|
339
|
+
eval_df = pivoted.copy()
|
|
340
|
+
for c in final_df.columns:
|
|
341
|
+
eval_df[c] = final_df[c]
|
|
342
|
+
|
|
343
|
+
cond_list = []
|
|
344
|
+
choice_list = []
|
|
345
|
+
|
|
346
|
+
for cond in conditions_config:
|
|
347
|
+
expr = cond.get("if")
|
|
348
|
+
then_val = cond.get("then")
|
|
349
|
+
try:
|
|
350
|
+
# Evaluate condition string
|
|
351
|
+
mask = eval_df.eval(expr)
|
|
352
|
+
cond_list.append(mask)
|
|
353
|
+
choice_list.append(then_val)
|
|
354
|
+
except Exception as e:
|
|
355
|
+
print(f"Warning: Failed to evaluate condition '{expr}': {e}")
|
|
356
|
+
cond_list.append(pd.Series(False, index=eval_df.index))
|
|
357
|
+
choice_list.append(then_val)
|
|
358
|
+
|
|
359
|
+
default_val = col_config.get("default", None)
|
|
360
|
+
if cond_list:
|
|
361
|
+
# np.select evaluates conditions in order
|
|
362
|
+
series = pd.Series(np.select(cond_list, choice_list, default=default_val), index=eval_df.index)
|
|
363
|
+
else:
|
|
364
|
+
series = pd.Series([default_val] * len(eval_df), index=eval_df.index)
|
|
365
|
+
|
|
285
366
|
elif literal_expr is not None:
|
|
286
367
|
# Explicit literal value
|
|
287
368
|
series = pd.Series([literal_expr] * len(pivoted))
|
|
288
369
|
elif source_expr:
|
|
289
|
-
|
|
370
|
+
# Check for cross-domain reference first
|
|
371
|
+
cross_series, resolved = self._resolve_cross_domain(
|
|
372
|
+
source_expr, col_config, final_df, pivoted, built_domains
|
|
373
|
+
)
|
|
374
|
+
if resolved:
|
|
375
|
+
series = cross_series
|
|
376
|
+
elif source_expr in pivoted.columns:
|
|
290
377
|
series = pivoted[source_expr].copy()
|
|
291
378
|
elif source_expr in final_df.columns:
|
|
292
379
|
series = final_df[source_expr].copy()
|