virola 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vir/__init__.py +1 -0
- vir/adapters/__init__.py +46 -0
- vir/adapters/aktin/__init__.py +184 -0
- vir/adapters/aktin/bin/.gitkeep +0 -0
- vir/adapters/aktin/cleaning.py +87 -0
- vir/adapters/aktin/notebooks/.gitkeep +0 -0
- vir/adapters/aktin/notebooks/eda_aktin.py +272 -0
- vir/adapters/aktin/references/README.md +18 -0
- vir/adapters/aktin/references/icd10gm2025syst_kodes.txt +16817 -0
- vir/adapters/aktin/terminology.py +44 -0
- vir/adapters/aktin/viz.py +24 -0
- vir/adapters/base.py +80 -0
- vir/adapters/datasus/__init__.py +91 -0
- vir/adapters/datasus/bin/cities.txt +10 -0
- vir/adapters/datasus/bin/enrich_latest_run.sh +145 -0
- vir/adapters/datasus/bin/run_pipeline.sh +70 -0
- vir/adapters/datasus/bin/slurm_ablation.sh +67 -0
- vir/adapters/datasus/bin/slurm_pipeline.sh +71 -0
- vir/adapters/datasus/cleaning.py +65 -0
- vir/adapters/datasus/features.py +144 -0
- vir/adapters/datasus/helpers.py +55 -0
- vir/adapters/datasus/notebooks/data_cleaning.py +350 -0
- vir/adapters/datasus/notebooks/eda_raw_data.py +648 -0
- vir/adapters/datasus/notebooks/non_indicative_codes.py +149 -0
- vir/adapters/datasus/notebooks/sanity_checks/sc_clinical.py +608 -0
- vir/adapters/datasus/notebooks/sanity_checks/sc_demographic.py +437 -0
- vir/adapters/datasus/notebooks/sanity_checks/sc_temporal.py +862 -0
- vir/adapters/datasus/notebooks/sanity_checks/temporal_distance_comparison.py +618 -0
- vir/adapters/datasus/notebooks/validation/cross_city.py +1651 -0
- vir/adapters/datasus/notebooks/validation/validation_ablation.py +826 -0
- vir/adapters/datasus/notebooks/validation/validation_reference_syndromes.py +1736 -0
- vir/adapters/datasus/notebooks/validation/validation_temporal_c.py +358 -0
- vir/adapters/datasus/notebooks/view_demographic.py +948 -0
- vir/adapters/datasus/notebooks/viz_snf_clusters.py +588 -0
- vir/adapters/datasus/references/RepositorioTerminologia_202506/tb_cid.csv +14240 -0
- vir/adapters/datasus/references/abp_ciap2.csv +24 -0
- vir/adapters/datasus/references/aesop/code_list_arbovirus_apr2024.csv +17 -0
- vir/adapters/datasus/references/aesop/code_list_uri_apr2024.csv +51 -0
- vir/adapters/datasus/references/ciap-2-wicc.csv +92 -0
- vir/adapters/datasus/references/ciap2-cid10.csv +687 -0
- vir/adapters/datasus/references/indicative_codes_yes_no_75perc.csv +112 -0
- vir/adapters/datasus/references/indicative_codes_yes_no__2022_2025__90perc.csv +356 -0
- vir/adapters/datasus/references/osi_template.json +25 -0
- vir/adapters/datasus/references/terminology_mapping.csv +24 -0
- vir/adapters/datasus/terminologies/__init__.py +0 -0
- vir/adapters/datasus/terminologies/terminology_mapping.py +70 -0
- vir/adapters/datasus/terminology.py +100 -0
- vir/adapters/datasus/viz.py +29 -0
- vir/cleaning/__init__.py +0 -0
- vir/cleaning/data_preparation.py +180 -0
- vir/cleaning/indicative_template.py +43 -0
- vir/cleaning/prepare.py +89 -0
- vir/cli.py +910 -0
- vir/clusters.py +355 -0
- vir/config.py +66 -0
- vir/data_filters.py +119 -0
- vir/db.py +1377 -0
- vir/helpers.py +88 -0
- vir/metrics.py +82 -0
- vir/notebook_widgets.py +179 -0
- vir/osi.py +98 -0
- vir/terminologies/__init__.py +0 -0
- vir/terminologies/embeddings_model.py +204 -0
- vir/views/__init__.py +20 -0
- vir/views/base.py +314 -0
- vir/views/clinical.py +163 -0
- vir/views/demographic.py +181 -0
- vir/views/temporal.py +221 -0
- vir/viz.py +58 -0
- virola-0.0.1.dist-info/METADATA +405 -0
- virola-0.0.1.dist-info/RECORD +74 -0
- virola-0.0.1.dist-info/WHEEL +4 -0
- virola-0.0.1.dist-info/entry_points.txt +3 -0
- virola-0.0.1.dist-info/licenses/LICENSE +9 -0
vir/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from vir import config # noqa: F401
|
vir/adapters/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from vir.adapters.aktin import AktinAdapter
|
|
2
|
+
from vir.adapters.base import Adapter
|
|
3
|
+
from vir.adapters.datasus import DatasusAdapter
|
|
4
|
+
|
|
5
|
+
ADAPTERS: dict[str, Adapter] = {
|
|
6
|
+
"datasus": DatasusAdapter(),
|
|
7
|
+
"aktin": AktinAdapter(),
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def known_strata() -> set[str]:
    """Collect every stratum name declared by any registered adapter.

    Returns:
        A set merging the ``strata`` of each adapter in ``ADAPTERS``; a name
        declared by several adapters appears once.
    """
    declared: set[str] = set()
    for registered in ADAPTERS.values():
        declared.update(registered.strata)
    return declared
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def adapter_for_stratum(stratum: str) -> Adapter:
    """Look up the single registered adapter whose ``strata`` contains ``stratum``.

    Args:
        stratum: Stratum name to resolve.

    Returns:
        The one adapter in ``ADAPTERS`` that declares the stratum.

    Raises:
        LookupError: when zero adapters, or more than one adapter, declare it.
    """
    claimants = list(filter(lambda candidate: stratum in candidate.strata, ADAPTERS.values()))
    if len(claimants) == 1:
        return claimants[0]
    if claimants:
        # Two or more adapters declare the same stratum: refuse to pick one.
        raise LookupError(f"ambiguous stratum: {stratum!r} is claimed by {len(claimants)} adapters")
    raise LookupError(f"unknown stratum: {stratum!r}")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def stratum_metadata(stratum: str) -> dict:
    """Resolve ``{city, region, population_range}`` for a stratum or dataset slug.

    Every adapter in ``ADAPTERS`` is asked, in registration order, to interpret
    the value; the first answer carrying a non-null ``city`` is returned and
    later adapters are not consulted. Display names (``"Belo Horizonte"``) and
    dataset slugs (``"belo_horizonte"``) are both accepted — normalisation is
    delegated to the adapter that owns the value.

    Args:
        stratum: Display-form stratum name or dataset-name slug.

    Returns:
        The recognising adapter's metadata dict, or an all-``None`` dict when
        no adapter recognises the value (the historical ``_city_metadata``
        fallback).
    """
    # Generator keeps the lookup lazy: adapters after the first hit are never called.
    answers = (registered.stratum_metadata(stratum) for registered in ADAPTERS.values())
    recognised = next((answer for answer in answers if answer.get("city") is not None), None)
    if recognised is not None:
        return recognised
    return {"city": None, "region": None, "population_range": None}
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import os
|
|
3
|
+
import zipfile
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Union
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
from loguru import logger
|
|
10
|
+
|
|
11
|
+
from vir.adapters.aktin.cleaning import clean_aktin_raw
|
|
12
|
+
from vir.adapters.aktin.terminology import build_icd10gm_labels
|
|
13
|
+
from vir.adapters.aktin.viz import AGE_GROUP_COLOR, CODE_TYPE_COLOR, SEX_COLOR
|
|
14
|
+
from vir.adapters.base import Adapter
|
|
15
|
+
|
|
16
|
+
_FileSource = Union[Path, tuple[Path, str]]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class _Site:
    """Immutable record pairing the two input tables of a single AKTIN site.

    Each source follows ``_FileSource``: either a plain filesystem path or a
    ``(zip_path, member_name)`` tuple pointing at a file inside an archive.
    """

    # Site label (directory name or archive stem chosen by the discovery step).
    name: str
    # Location of the site's ``case_data.txt`` table.
    case_source: _FileSource
    # Location of the site's ``diag_data.txt`` table.
    diag_source: _FileSource
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _read_aktin_tsv(source: _FileSource) -> pl.DataFrame:
    """Load one tab-separated AKTIN table from disk or from a zip member.

    Rows are parsed with ``truncate_ragged_lines=True``. Real AKTIN exports
    sometimes carry literal tabs inside free-text columns (for example
    ``isolation_reason`` or ``zuweisung``), which yields more fields than the
    header declares. Cutting off the surplus fields keeps such rows; what is
    lost sits in narrative columns that are not consumed downstream.

    Args:
        source: A filesystem path, or a ``(zip_path, member)`` tuple naming a
            file inside an archive.

    Returns:
        The parsed table (``infer_schema_length=0`` disables dtype inference).
    """
    tsv_options = {
        "separator": "\t",
        "infer_schema_length": 0,
        "truncate_ragged_lines": True,
    }
    if not isinstance(source, tuple):
        return pl.read_csv(source, **tsv_options)

    archive_path, member_name = source
    # Pull the member fully into memory so the archive can be closed before parsing.
    with zipfile.ZipFile(archive_path) as bundle:
        raw_bytes = bundle.read(member_name)
    return pl.read_csv(io.BytesIO(raw_bytes), **tsv_options)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _discover_sites(raw_path: Path) -> list[_Site]:
    """Find every AKTIN site under ``raw_path``.

    Recognises two layouts: an unpacked ``<site>/`` directory containing both
    ``case_data.txt`` and ``diag_data.txt``, and a zip archive (typically
    ``<site>_result.zip``) whose top-level members include both.

    Args:
        raw_path: Root directory searched recursively.

    Returns:
        Directory-backed sites (sorted by ``case_data.txt`` path) followed by
        zip-backed sites (sorted by archive path). Empty when nothing matches.

    Raises:
        FileNotFoundError: when a directory or archive holds exactly one of the
            two files. Silent orphan-site loss is the worst failure mode: every
            encounter from that site would disappear at the inner-join.
    """
    sites: list[_Site] = []

    for case_path in sorted(raw_path.glob("**/case_data.txt")):
        diag_path = case_path.parent / "diag_data.txt"
        if not diag_path.exists():
            raise FileNotFoundError(
                f"Site {case_path.parent.name!r} has case_data.txt but no diag_data.txt"
            )
        sites.append(
            _Site(
                name=case_path.parent.name or "root", case_source=case_path, diag_source=diag_path
            )
        )

    # Symmetric orphan check: a directory that only ships diag_data.txt is never
    # visited by the loop above, so without this it would be dropped silently.
    for diag_path in sorted(raw_path.glob("**/diag_data.txt")):
        if not (diag_path.parent / "case_data.txt").exists():
            raise FileNotFoundError(
                f"Site {diag_path.parent.name!r} has diag_data.txt but no case_data.txt"
            )

    for zip_path in sorted(raw_path.glob("**/*.zip")):
        with zipfile.ZipFile(zip_path) as archive:
            members = set(archive.namelist())
        has_case = "case_data.txt" in members
        has_diag = "diag_data.txt" in members
        if not has_case and not has_diag:
            # Unrelated archive: ignore it.
            continue
        if has_case != has_diag:
            raise FileNotFoundError(
                f"Zip {zip_path.name!r} has only one of case_data.txt / diag_data.txt"
            )
        sites.append(
            _Site(
                name=zip_path.stem,
                case_source=(zip_path, "case_data.txt"),
                diag_source=(zip_path, "diag_data.txt"),
            )
        )

    # NOTE(review): site names (directory name / archive stem) are not checked
    # for uniqueness here — confirm inputs never contain two sources that
    # resolve to the same name (e.g. an archive next to its unpacked copy).
    return sites
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _namespace_encounter_num(frame: pl.DataFrame, site_name: str) -> pl.DataFrame:
    """Rewrite ``a_encounter_num`` as ``"<site_name>:<original id>"``.

    Prefixing keeps identical encounter ids from different sites apart once
    the per-site frames are pooled.

    Args:
        frame: Table holding an ``a_encounter_num`` column.
        site_name: Site label used as the prefix.

    Returns:
        ``frame`` with ``a_encounter_num`` replaced by the prefixed string.
    """
    site_prefix = pl.lit(f"{site_name}:")
    encounter_id = pl.col("a_encounter_num").cast(pl.Utf8)
    namespaced = pl.concat_str([site_prefix, encounter_id]).alias("a_encounter_num")
    return frame.with_columns(namespaced)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class AktinAdapter(Adapter):
    """Adapter for the AKTIN emergency-department data from Germany.

    Exposes a single stratum (``"germany"`` — the whole country) and a single
    code type (``"icd10gm"``). Cleaning yields one row per encounter and
    confirmed diagnosis; terminology labels come from the vendored BfArM
    ICD-10-GM 2025 release.
    """

    name = "aktin"
    # Default model name; the VIROLA_AKTIN_MODEL environment variable overrides
    # it (read once, when the class body executes).
    default_model = os.getenv(
        "VIROLA_AKTIN_MODEL", "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"
    )
    terminology_csv_filename = "icd10gm_aktin.csv"

    def clean(
        self,
        raw_path: Path,
        output_path: Path,
        stratum: str | None = None,
        **options,
    ) -> Path:
        """Pool all AKTIN sites below ``raw_path`` into one interim parquet file.

        A site is an unpacked directory or a ``<site>_result.zip`` archive and
        has to provide both ``case_data.txt`` and ``diag_data.txt``; orphan
        sites raise during discovery. Encounter ids are prefixed with the site
        name so equal ids from different sites stay distinct.

        Args:
            raw_path: Root directory searched for sites.
            output_path: Destination of the cleaned parquet file.
            stratum: Accepted for interface compatibility; not used here.
            **options: Accepted for interface compatibility; not used here.

        Returns:
            ``output_path``, after the parquet file has been written.

        Raises:
            FileNotFoundError: when no site is found under ``raw_path``.
        """
        sites = _discover_sites(raw_path)
        if len(sites) == 0:
            raise FileNotFoundError(
                f"No AKTIN sites found under {raw_path}. Expected ``<site>/case_data.txt`` + "
                "``diag_data.txt`` pairs or ``<site>_result.zip`` archives."
            )

        logger.info(f"AKTIN clean: pooling {len(sites)} site(s) under {raw_path}")

        # Read every case table first, then every diagnosis table.
        case_frames = []
        for site in sites:
            case_table = _read_aktin_tsv(site.case_source)
            case_frames.append(_namespace_encounter_num(case_table, site.name))

        diag_frames = []
        for site in sites:
            diag_table = _read_aktin_tsv(site.diag_source)
            diag_frames.append(_namespace_encounter_num(diag_table, site.name))

        pooled_cases = pl.concat(case_frames, how="diagonal_relaxed")
        pooled_diagnoses = pl.concat(diag_frames, how="diagonal_relaxed")

        clean_aktin_raw(pooled_cases, pooled_diagnoses).write_parquet(output_path)
        return output_path

    def build_terminology_labels(self) -> pl.DataFrame:
        """Return the ICD-10-GM label table built by ``build_icd10gm_labels``."""
        labels = build_icd10gm_labels()
        return labels

    def stratum_metadata(self, stratum: str) -> dict:
        """Describe ``stratum``; ``city`` is set only when the name contains "germany".

        The match is a case-insensitive substring test. ``region`` and
        ``population_range`` are always ``None`` for this adapter.
        """
        city = "germany" if "germany" in stratum.lower() else None
        return {"city": city, "region": None, "population_range": None}

    @property
    def strata(self) -> list[str]:
        """Strata declared by this adapter: only ``"germany"``."""
        return ["germany"]

    @property
    def skip_patterns(self) -> list[tuple[str, str]]:
        """``(code_type, regex)`` pairs; here ``icd10gm`` codes starting with ``Z``."""
        return [("icd10gm", r"^Z")]

    @property
    def code_types(self) -> list[str]:
        """Code types produced by this adapter: only ``"icd10gm"``."""
        return ["icd10gm"]

    def cleaned_filename(self, stratum: str | None) -> str:
        """Build the cleaned-parquet file name for ``stratum`` (``"all"`` when falsy).

        The stratum is lower-cased and spaces become underscores.
        """
        label = stratum if stratum else "all"
        normalised = label.lower().replace(" ", "_")
        return f"cleaned_aktin_{normalised}.parquet"

    @property
    def viz_palettes(self) -> dict[str, dict[str, str]]:
        """Colour palettes keyed by the dimension they apply to."""
        return dict(sex=SEX_COLOR, age_group=AGE_GROUP_COLOR, code_type=CODE_TYPE_COLOR)
|
|
File without changes
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""AKTIN raw → canonical interim transformation.
|
|
2
|
+
|
|
3
|
+
Takes the per-encounter ``case_data`` and per-diagnosis ``diag_data`` tables
|
|
4
|
+
exported from AKTIN and produces one row per (encounter × confirmed diagnosis)
|
|
5
|
+
in the canonical interim shape that ``prepare_dataframe`` consumes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
AKTIN_AGE_BUCKETS: dict[str, str] = {
|
|
11
|
+
"0-4": "0-4",
|
|
12
|
+
"00-04": "0-4",
|
|
13
|
+
"5-9": "5-9",
|
|
14
|
+
"05-09": "5-9",
|
|
15
|
+
"10-14": "10-14",
|
|
16
|
+
"15-19": "15-19",
|
|
17
|
+
"20-24": "20-39",
|
|
18
|
+
"25-29": "20-39",
|
|
19
|
+
"30-34": "20-39",
|
|
20
|
+
"35-39": "20-39",
|
|
21
|
+
"40-44": "40-59",
|
|
22
|
+
"45-49": "40-59",
|
|
23
|
+
"50-54": "40-59",
|
|
24
|
+
"55-59": "40-59",
|
|
25
|
+
"60-64": "60-79",
|
|
26
|
+
"65-69": "60-79",
|
|
27
|
+
"70-74": "60-79",
|
|
28
|
+
"75-79": "60-79",
|
|
29
|
+
"80+": "80+",
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
_REQUIRED_CASE_FIELDS: tuple[str, ...] = ("geschlecht", "aufnahme_ts", "age_group")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _null_when_empty(column: str) -> pl.Expr:
    """Build an expression that turns ``""`` in ``column`` into null.

    All other values pass through unchanged and the result keeps the column's
    name.
    """
    value = pl.col(column)
    is_blank = value.eq("")
    return pl.when(is_blank).then(None).otherwise(value).alias(column)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def clean_aktin_raw(cases: pl.DataFrame, diagnoses: pl.DataFrame) -> pl.DataFrame:
    """Transform AKTIN ``case_data`` + ``diag_data`` into the canonical interim shape.

    Steps:

    1. Drop case rows with empty / null ``geschlecht``, ``aufnahme_ts`` or
       ``age_group``.
    2. Parse ``aufnahme_ts`` accepting both ``Z`` and ``+00:00`` offsets;
       unparseable timestamps drop the row rather than ride through as a null
       year/week.
    3. Collapse the native 5-year ``age_group`` to one of the 8 AKTIN buckets;
       any value outside the bucket map drops the row.
    4. Keep only diagnoses with ``diagnose_zusatz == 'G'`` (confirmed).
    5. Inner-join cases × confirmed diagnoses on ``a_encounter_num``.
    6. Add ``city = "germany"``, ``code_type = "icd10gm"``, ``quantity = 1``.
    7. Drop ``plz_kurz`` when present. Rename ``geschlecht`` → ``sex``. Retain
       ``a_encounter_num`` and the case_data extras (``triage``, vitals,
       ``verbleib``, ...) for later statistics — they are dropped later by
       ``prepare_dataframe`` when projecting onto the canonical row.

    Args:
        cases: Pooled ``case_data`` rows; must contain ``a_encounter_num`` and
            the fields listed in ``_REQUIRED_CASE_FIELDS``.
        diagnoses: Pooled ``diag_data`` rows; must contain ``a_encounter_num``,
            ``icd_code`` and ``diagnose_zusatz``.

    Returns:
        One row per (encounter × confirmed diagnosis) with ``year``, ``week``,
        ``sex``, ``age_group``, ``code``, ``city``, ``code_type`` and
        ``quantity`` alongside the retained case columns.
    """
    required = list(_REQUIRED_CASE_FIELDS)
    cleaned_cases = (
        cases.with_columns([_null_when_empty(column) for column in required])
        .drop_nulls(subset=required)
        .with_columns(
            # Normalise a trailing "Z" so one ``%z`` format covers both spellings;
            # strict=False turns unparseable values into nulls, dropped just below.
            _admission=pl.col("aufnahme_ts")
            .str.replace(r"Z$", "+00:00")
            .str.to_datetime("%Y-%m-%dT%H:%M:%S%z", strict=False),
            age_group=pl.col("age_group").replace_strict(AKTIN_AGE_BUCKETS, default=None),
        )
        .drop_nulls(subset=["_admission", "age_group"])
        .with_columns(
            year=pl.col("_admission").dt.iso_year(),
            week=pl.col("_admission").dt.week(),
        )
        .drop("_admission")
        # ``plz_kurz`` is not required input: tolerate exports that never
        # carried it instead of failing with ColumnNotFoundError.
        .drop("plz_kurz", strict=False)
        .rename({"geschlecht": "sex"})
    )

    confirmed_diagnoses = diagnoses.filter(pl.col("diagnose_zusatz").eq("G")).select(
        code=pl.col("icd_code"),
        a_encounter_num=pl.col("a_encounter_num"),
    )

    joined = cleaned_cases.join(confirmed_diagnoses, on="a_encounter_num", how="inner")
    return joined.with_columns(
        city=pl.lit("germany"),
        code_type=pl.lit("icd10gm"),
        quantity=pl.lit(1),
    )
|
|
File without changes
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
import marimo
|
|
2
|
+
|
|
3
|
+
__generated_with = "0.23.4"
|
|
4
|
+
app = marimo.App(width="medium")
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@app.cell
|
|
8
|
+
def _():
|
|
9
|
+
import marimo as mo
|
|
10
|
+
import plotly.express as px
|
|
11
|
+
import polars as pl
|
|
12
|
+
|
|
13
|
+
from vir.config import INTERIM_DATA_DIR
|
|
14
|
+
|
|
15
|
+
return INTERIM_DATA_DIR, mo, pl, px
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@app.cell
|
|
19
|
+
def _(mo):
|
|
20
|
+
mo.md(r"""
|
|
21
|
+
# AKTIN — Exploratory data analysis
|
|
22
|
+
|
|
23
|
+
Basic shape, demographic distribution, code coverage, and temporal distribution
|
|
24
|
+
of the cleaned AKTIN interim parquet (one row per encounter × confirmed
|
|
25
|
+
diagnosis).
|
|
26
|
+
""")
|
|
27
|
+
return
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@app.cell
def _(INTERIM_DATA_DIR, mo):
    # Offer every cleaned AKTIN parquet found in the interim directory,
    # pre-selecting the first one (sorted by path) when any exist.
    available_files = sorted(INTERIM_DATA_DIR.glob("cleaned_aktin_*.parquet"))
    _choices = {}
    for _parquet in available_files:
        _choices[_parquet.name] = str(_parquet)
    _preselected = available_files[0].name if available_files else None
    file_dropdown = mo.ui.dropdown(
        options=_choices,
        value=_preselected,
        label="Cleaned AKTIN parquet",
    )
    file_dropdown
    return (file_dropdown,)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@app.cell
def _(file_dropdown, mo, pl):
    # Halt this cell (and its dependants) with a warning when nothing is selected.
    _missing_notice = mo.callout(
        mo.md("No `cleaned_aktin_*.parquet` files found under `INTERIM_DATA_DIR`."),
        kind="warn",
    )
    mo.stop(not file_dropdown.value, _missing_notice)
    df = pl.read_parquet(file_dropdown.value)
    return (df,)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@app.cell
|
|
56
|
+
def _(mo):
|
|
57
|
+
mo.md(r"""
|
|
58
|
+
## Shape
|
|
59
|
+
""")
|
|
60
|
+
return
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@app.cell
def _(df, mo):
    # Headline numbers for the loaded parquet, rendered as a markdown table.
    _years = df["year"]
    _weeks = df["week"]
    n_rows = df.height
    n_encounters = df["a_encounter_num"].n_unique()
    n_codes = df["code"].n_unique()
    year_min, year_max = _years.min(), _years.max()
    week_min, week_max = _weeks.min(), _weeks.max()
    # Guard the ratio against an empty frame.
    if n_encounters:
        diagnoses_per_encounter = n_rows / n_encounters
    else:
        diagnoses_per_encounter = 0

    mo.md(f"""
    | metric | value |
    |---|---|
    | rows (encounter × diagnosis) | {n_rows:,} |
    | distinct encounters | {n_encounters:,} |
    | distinct ICD-10-GM codes | {n_codes:,} |
    | mean diagnoses per encounter | {diagnoses_per_encounter:.2f} |
    | ISO year range | {year_min} – {year_max} |
    | ISO week range | {week_min} – {week_max} |
    """)
    return
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@app.cell
|
|
88
|
+
def _(mo):
|
|
89
|
+
mo.md(r"""
|
|
90
|
+
## Demographics
|
|
91
|
+
""")
|
|
92
|
+
return
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@app.cell
|
|
96
|
+
def _():
|
|
97
|
+
age_order = ["0-4", "5-9", "10-14", "15-19", "20-39", "40-59", "60-79", "80+"]
|
|
98
|
+
return (age_order,)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@app.cell
def _(df, mo, px):
    # Count encounters (not diagnosis rows) per sex: keep one row per encounter first.
    _one_row_per_encounter = df.unique(subset=["a_encounter_num"])
    sex_per_encounter = _one_row_per_encounter.group_by("sex").len().sort("sex")
    sex_chart = px.bar(
        sex_per_encounter,
        x="sex",
        y="len",
        labels={"len": "encounters"},
        title="Encounters by sex",
    )
    mo.ui.plotly(sex_chart)
    return
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@app.cell
def _(age_order, df, mo, pl, px):
    # Encounters per age bucket, sorted by bucket order rather than alphabetically.
    _ordered_bucket = pl.col("age_group").cast(pl.Enum(age_order)).alias("age_group_ordered")
    _bucket_counts = df.unique(subset=["a_encounter_num"]).group_by("age_group").len()
    age_per_encounter = _bucket_counts.with_columns(_ordered_bucket).sort("age_group_ordered")
    age_chart = px.bar(
        age_per_encounter,
        x="age_group",
        y="len",
        labels={"len": "encounters"},
        category_orders={"age_group": age_order},
        title="Encounters by age group",
    )
    mo.ui.plotly(age_chart)
    return
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@app.cell
def _(age_order, df, mo, px):
    # Encounters per (age bucket, sex), shown as grouped bars.
    _one_row_per_encounter = df.unique(subset=["a_encounter_num"])
    sex_age = _one_row_per_encounter.group_by(["age_group", "sex"]).len()
    sex_age_chart = px.bar(
        sex_age,
        x="age_group",
        y="len",
        color="sex",
        barmode="group",
        labels={"len": "encounters"},
        category_orders={"age_group": age_order},
        title="Encounters by age group × sex",
    )
    mo.ui.plotly(sex_age_chart)
    return
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@app.cell
|
|
154
|
+
def _(mo):
|
|
155
|
+
mo.md(r"""
|
|
156
|
+
## ICD-10-GM coverage
|
|
157
|
+
""")
|
|
158
|
+
return
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@app.cell
def _(df, mo, pl, px):
    # Diagnosis rows grouped by the first character of the code.
    _chapter_letter = pl.col("code").str.slice(0, 1).alias("chapter_letter")
    _with_chapter = df.with_columns(_chapter_letter)
    chapter_distribution = _with_chapter.group_by("chapter_letter").len().sort("chapter_letter")
    chapter_chart = px.bar(
        chapter_distribution,
        x="chapter_letter",
        y="len",
        labels={"len": "diagnoses", "chapter_letter": "chapter (A–Z, U)"},
        title="Diagnoses by ICD-10-GM chapter letter",
    )
    mo.ui.plotly(chapter_chart)
    return
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@app.cell
def _(df, mo, px):
    # The 20 codes with the most diagnosis rows, most frequent first.
    _code_counts = df.group_by("code").len()
    top_codes = _code_counts.sort("len", descending=True).head(20)
    top_codes_chart = px.bar(
        top_codes,
        x="code",
        y="len",
        labels={"len": "diagnoses"},
        title="Top-20 ICD-10-GM codes by diagnosis count",
    )
    mo.ui.plotly(top_codes_chart)
    return (top_codes,)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@app.cell
|
|
195
|
+
def _(mo, top_codes):
|
|
196
|
+
mo.ui.table(top_codes, label="Top-20 codes")
|
|
197
|
+
return
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@app.cell
|
|
201
|
+
def _(mo):
|
|
202
|
+
mo.md(r"""
|
|
203
|
+
## Temporal distribution
|
|
204
|
+
""")
|
|
205
|
+
return
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@app.cell
def _(df, pl):
    # Parse ``aufnahme_ts`` into ``_admission`` when the column survived cleaning.
    # The cleaning step accepts both a trailing "Z" and a "+00:00" offset and keeps
    # the raw string, so normalise "Z" first and parse with ``%z``; a format with a
    # literal "Z" would raise on any "+00:00" value.
    has_aufnahme_ts = "aufnahme_ts" in df.columns
    timestamps_df = (
        df.with_columns(
            pl.col("aufnahme_ts")
            .str.replace(r"Z$", "+00:00")
            .str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
            .alias("_admission")
        )
        if has_aufnahme_ts
        else df
    )
    return has_aufnahme_ts, timestamps_df
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@app.cell
def _(has_aufnahme_ts, mo, pl, px, timestamps_df):
    # Without ``aufnahme_ts`` there is no ``_admission`` column to bucket by month.
    _unavailable_notice = mo.callout(
        mo.md(
            "``aufnahme_ts`` was dropped from the interim parquet — month-level chart unavailable."
        ),
        kind="warn",
    )
    mo.stop(not has_aufnahme_ts, _unavailable_notice)

    # One row per encounter, labelled with its admission year-month.
    _year_month = pl.col("_admission").dt.strftime("%Y-%m").alias("year_month")
    _per_encounter = timestamps_df.unique(subset=["a_encounter_num"]).with_columns(_year_month)
    monthly = _per_encounter.group_by("year_month").len().sort("year_month")
    monthly_chart = px.bar(
        monthly,
        x="year_month",
        y="len",
        labels={"len": "encounters", "year_month": "year-month"},
        title="Encounters per month",
    )
    mo.ui.plotly(monthly_chart)
    return
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
@app.cell
def _(df, mo, px):
    # Encounters per (year, week), one point per week coloured by year.
    _one_row_per_encounter = df.unique(subset=["a_encounter_num"])
    weekly = _one_row_per_encounter.group_by(["year", "week"]).len().sort(["year", "week"])
    weekly_chart = px.scatter(
        weekly,
        x="week",
        y="len",
        color="year",
        labels={"len": "encounters", "week": "ISO week"},
        title="Encounters per ISO week, by year",
    )
    # Tick every fourth week to keep the axis readable.
    weekly_chart.update_layout(xaxis={"tickmode": "linear", "dtick": 4})
    mo.ui.plotly(weekly_chart)
    return
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
if __name__ == "__main__":
|
|
272
|
+
app.run()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# AKTIN adapter references
|
|
2
|
+
|
|
3
|
+
## icd10gm2025syst_kodes.txt
|
|
4
|
+
|
|
5
|
+
Source: BfArM (Bundesinstitut für Arzneimittel und Medizinprodukte), ICD-10-GM
|
|
6
|
+
Version 2025 — Systematisches Verzeichnis (classification file
|
|
7
|
+
`icd10gm2025syst-meta.zip` → `Klassifikationsdateien/icd10gm2025syst_kodes.txt`).
|
|
8
|
+
|
|
9
|
+
Stand der Klassifikation: 2024-09-13. OID `1.2.276.0.76.5.548`.
|
|
10
|
+
|
|
11
|
+
Format: semicolon-separated, no header, one row per code. ~16,800 rows (4 MB).
|
|
12
|
+
|
|
13
|
+
Redistribution / attribution: subject to BfArM download conditions
|
|
14
|
+
(`downloadbedingungen-2024.pdf`). License terms must be confirmed before
|
|
15
|
+
making the public repo containing this file generally available — tracked
|
|
16
|
+
in `AKTIN-PLAN.md § Stakeholder check-ins`.
|
|
17
|
+
|
|
18
|
+
Parsing of this file lives in `vir/adapters/aktin/terminology.py`.
|