virola 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. vir/__init__.py +1 -0
  2. vir/adapters/__init__.py +46 -0
  3. vir/adapters/aktin/__init__.py +184 -0
  4. vir/adapters/aktin/bin/.gitkeep +0 -0
  5. vir/adapters/aktin/cleaning.py +87 -0
  6. vir/adapters/aktin/notebooks/.gitkeep +0 -0
  7. vir/adapters/aktin/notebooks/eda_aktin.py +272 -0
  8. vir/adapters/aktin/references/README.md +18 -0
  9. vir/adapters/aktin/references/icd10gm2025syst_kodes.txt +16817 -0
  10. vir/adapters/aktin/terminology.py +44 -0
  11. vir/adapters/aktin/viz.py +24 -0
  12. vir/adapters/base.py +80 -0
  13. vir/adapters/datasus/__init__.py +91 -0
  14. vir/adapters/datasus/bin/cities.txt +10 -0
  15. vir/adapters/datasus/bin/enrich_latest_run.sh +145 -0
  16. vir/adapters/datasus/bin/run_pipeline.sh +70 -0
  17. vir/adapters/datasus/bin/slurm_ablation.sh +67 -0
  18. vir/adapters/datasus/bin/slurm_pipeline.sh +71 -0
  19. vir/adapters/datasus/cleaning.py +65 -0
  20. vir/adapters/datasus/features.py +144 -0
  21. vir/adapters/datasus/helpers.py +55 -0
  22. vir/adapters/datasus/notebooks/data_cleaning.py +350 -0
  23. vir/adapters/datasus/notebooks/eda_raw_data.py +648 -0
  24. vir/adapters/datasus/notebooks/non_indicative_codes.py +149 -0
  25. vir/adapters/datasus/notebooks/sanity_checks/sc_clinical.py +608 -0
  26. vir/adapters/datasus/notebooks/sanity_checks/sc_demographic.py +437 -0
  27. vir/adapters/datasus/notebooks/sanity_checks/sc_temporal.py +862 -0
  28. vir/adapters/datasus/notebooks/sanity_checks/temporal_distance_comparison.py +618 -0
  29. vir/adapters/datasus/notebooks/validation/cross_city.py +1651 -0
  30. vir/adapters/datasus/notebooks/validation/validation_ablation.py +826 -0
  31. vir/adapters/datasus/notebooks/validation/validation_reference_syndromes.py +1736 -0
  32. vir/adapters/datasus/notebooks/validation/validation_temporal_c.py +358 -0
  33. vir/adapters/datasus/notebooks/view_demographic.py +948 -0
  34. vir/adapters/datasus/notebooks/viz_snf_clusters.py +588 -0
  35. vir/adapters/datasus/references/RepositorioTerminologia_202506/tb_cid.csv +14240 -0
  36. vir/adapters/datasus/references/abp_ciap2.csv +24 -0
  37. vir/adapters/datasus/references/aesop/code_list_arbovirus_apr2024.csv +17 -0
  38. vir/adapters/datasus/references/aesop/code_list_uri_apr2024.csv +51 -0
  39. vir/adapters/datasus/references/ciap-2-wicc.csv +92 -0
  40. vir/adapters/datasus/references/ciap2-cid10.csv +687 -0
  41. vir/adapters/datasus/references/indicative_codes_yes_no_75perc.csv +112 -0
  42. vir/adapters/datasus/references/indicative_codes_yes_no__2022_2025__90perc.csv +356 -0
  43. vir/adapters/datasus/references/osi_template.json +25 -0
  44. vir/adapters/datasus/references/terminology_mapping.csv +24 -0
  45. vir/adapters/datasus/terminologies/__init__.py +0 -0
  46. vir/adapters/datasus/terminologies/terminology_mapping.py +70 -0
  47. vir/adapters/datasus/terminology.py +100 -0
  48. vir/adapters/datasus/viz.py +29 -0
  49. vir/cleaning/__init__.py +0 -0
  50. vir/cleaning/data_preparation.py +180 -0
  51. vir/cleaning/indicative_template.py +43 -0
  52. vir/cleaning/prepare.py +89 -0
  53. vir/cli.py +910 -0
  54. vir/clusters.py +355 -0
  55. vir/config.py +66 -0
  56. vir/data_filters.py +119 -0
  57. vir/db.py +1377 -0
  58. vir/helpers.py +88 -0
  59. vir/metrics.py +82 -0
  60. vir/notebook_widgets.py +179 -0
  61. vir/osi.py +98 -0
  62. vir/terminologies/__init__.py +0 -0
  63. vir/terminologies/embeddings_model.py +204 -0
  64. vir/views/__init__.py +20 -0
  65. vir/views/base.py +314 -0
  66. vir/views/clinical.py +163 -0
  67. vir/views/demographic.py +181 -0
  68. vir/views/temporal.py +221 -0
  69. vir/viz.py +58 -0
  70. virola-0.0.1.dist-info/METADATA +405 -0
  71. virola-0.0.1.dist-info/RECORD +74 -0
  72. virola-0.0.1.dist-info/WHEEL +4 -0
  73. virola-0.0.1.dist-info/entry_points.txt +3 -0
  74. virola-0.0.1.dist-info/licenses/LICENSE +9 -0
vir/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from vir import config # noqa: F401
@@ -0,0 +1,46 @@
1
+ from vir.adapters.aktin import AktinAdapter
2
+ from vir.adapters.base import Adapter
3
+ from vir.adapters.datasus import DatasusAdapter
4
+
5
# Registry of the dataset adapters this package ships, keyed by a short
# dataset identifier. One instance of each adapter is created at import time;
# the lookup helpers in this module iterate the registry in insertion order.
ADAPTERS: dict[str, Adapter] = {
    "datasus": DatasusAdapter(),
    "aktin": AktinAdapter(),
}
9
+
10
+
11
def known_strata() -> set[str]:
    """Collect every stratum name declared by any registered adapter.

    Returns:
        The union of ``adapter.strata`` over all entries of ``ADAPTERS``;
        a stratum declared by several adapters appears once.
    """
    declared: set[str] = set()
    for registered in ADAPTERS.values():
        declared.update(registered.strata)
    return declared
14
+
15
+
16
def adapter_for_stratum(stratum: str) -> Adapter:
    """Look up the single adapter whose ``strata`` contains ``stratum``.

    Args:
        stratum: Stratum name, compared for exact membership against each
            registered adapter's ``strata``.

    Returns:
        The one adapter that declares the stratum.

    Raises:
        LookupError: if no adapter claims the stratum, or if more than one does.
    """
    claimants = [candidate for candidate in ADAPTERS.values() if stratum in candidate.strata]
    if len(claimants) == 1:
        return claimants[0]
    if not claimants:
        raise LookupError(f"unknown stratum: {stratum!r}")
    raise LookupError(f"ambiguous stratum: {stratum!r} is claimed by {len(claimants)} adapters")
28
+
29
+
30
def stratum_metadata(stratum: str) -> dict:
    """Resolve ``{city, region, population_range}`` for a stratum or dataset slug.

    Each registered adapter is asked, in registry order, to interpret the
    value. The first answer whose ``city`` is not ``None`` is returned as-is;
    adapters after that one are not consulted. When no adapter recognises the
    value, an all-``None`` record is returned — matching the historical
    behaviour of the framework's ``_city_metadata`` helper.

    Both display-form stratum names (e.g. ``"Belo Horizonte"``) and
    dataset-name slugs (e.g. ``"belo_horizonte"``) are accepted; normalising
    the value is the responsibility of the adapter that owns it.

    Args:
        stratum: Stratum display name or dataset slug.

    Returns:
        A dict with the keys ``city``, ``region`` and ``population_range``.
    """
    # Generator keeps the lookup lazy: adapters are queried one at a time and
    # the scan stops at the first recognised answer.
    answers = (registered.stratum_metadata(stratum) for registered in ADAPTERS.values())
    recognised = next((answer for answer in answers if answer.get("city") is not None), None)
    if recognised is not None:
        return recognised
    return {"city": None, "region": None, "population_range": None}
@@ -0,0 +1,184 @@
1
+ import io
2
+ import os
3
+ import zipfile
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Union
7
+
8
+ import polars as pl
9
+ from loguru import logger
10
+
11
+ from vir.adapters.aktin.cleaning import clean_aktin_raw
12
+ from vir.adapters.aktin.terminology import build_icd10gm_labels
13
+ from vir.adapters.aktin.viz import AGE_GROUP_COLOR, CODE_TYPE_COLOR, SEX_COLOR
14
+ from vir.adapters.base import Adapter
15
+
16
# Where one AKTIN table lives: either a plain file path, or a
# ``(zip archive path, member name)`` pair for a file inside a zip.
_FileSource = Union[Path, tuple[Path, str]]


@dataclass(frozen=True)
class _Site:
    """Immutable record pairing the two source tables of a single AKTIN site.

    Each source is a ``_FileSource``: a filesystem path or a zip member.
    """

    # Site label; used to namespace encounter ids when sites are pooled.
    name: str
    # Location of the site's ``case_data.txt`` table.
    case_source: _FileSource
    # Location of the site's ``diag_data.txt`` table.
    diag_source: _FileSource
26
+
27
+
28
def _read_aktin_tsv(source: _FileSource) -> pl.DataFrame:
    """Load one tab-separated AKTIN table from disk or from inside a zip archive.

    The file is read with ``infer_schema_length=0`` and
    ``truncate_ragged_lines=True``. The latter matters because AKTIN free-text
    columns (e.g. ``isolation_reason``, ``zuweisung``) occasionally contain
    embedded tabs in real exports, producing rows with more fields than the
    header declares. Truncating the trailing fields lets the row survive — the
    dropped data is in narrative columns that we don't consume downstream.

    Args:
        source: A file path, or a ``(zip_path, member)`` tuple naming a member
            of a zip archive.

    Returns:
        The parsed table.
    """
    csv_options = {
        "separator": "\t",
        "infer_schema_length": 0,
        "truncate_ragged_lines": True,
    }

    # Plain file: hand the path straight to polars.
    if not isinstance(source, tuple):
        return pl.read_csv(source, **csv_options)

    # Zip member: pull the whole member into memory, then parse the bytes.
    archive_path, member_name = source
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open(member_name) as member:
            raw_bytes = member.read()
    return pl.read_csv(io.BytesIO(raw_bytes), **csv_options)
44
+
45
+
46
def _discover_sites(raw_path: Path) -> list[_Site]:
    """Find every AKTIN site under ``raw_path``.

    Recognises two layouts, searched recursively:

    * an unpacked ``<site>/`` directory containing both ``case_data.txt`` and
      ``diag_data.txt``;
    * a ``*.zip`` archive (e.g. ``<site>_result.zip``) whose member list
      contains both ``case_data.txt`` and ``diag_data.txt`` at the archive
      root. Zips containing neither file are ignored.

    A site that has one of the two files but not the other is an error —
    silent orphan-site loss is the worst failure mode (every encounter from
    that site would disappear at the inner-join). This holds in both
    directions: a directory with only ``diag_data.txt`` is reported just like
    one with only ``case_data.txt``.

    Args:
        raw_path: Root directory to search.

    Returns:
        Directory sites first (ordered by the path of their ``case_data.txt``),
        followed by zip sites (ordered by archive path). Directory sites are
        named after their directory (``"root"`` when that name is empty), zip
        sites after the archive's stem.

    Raises:
        FileNotFoundError: if a directory or zip holds only one of the two
            required files.
    """
    sites: list[_Site] = []

    # Collect directories that hold *either* file, so an orphan diag_data.txt
    # is detected instead of being skipped because no case_data.txt matched.
    site_dirs = {
        found.parent
        for filename in ("case_data.txt", "diag_data.txt")
        for found in raw_path.glob(f"**/{filename}")
    }

    # Ordering by the case-file path keeps complete sites in the same order as
    # sorting the case_data.txt glob results directly.
    for site_dir in sorted(site_dirs, key=lambda directory: directory / "case_data.txt"):
        case_path = site_dir / "case_data.txt"
        diag_path = site_dir / "diag_data.txt"
        if not diag_path.exists():
            raise FileNotFoundError(
                f"Site {site_dir.name!r} has case_data.txt but no diag_data.txt"
            )
        if not case_path.exists():
            raise FileNotFoundError(
                f"Site {site_dir.name!r} has diag_data.txt but no case_data.txt"
            )
        sites.append(
            _Site(name=site_dir.name or "root", case_source=case_path, diag_source=diag_path)
        )

    for zip_path in sorted(raw_path.glob("**/*.zip")):
        with zipfile.ZipFile(zip_path) as archive:
            members = set(archive.namelist())
        has_case = "case_data.txt" in members
        has_diag = "diag_data.txt" in members
        if not has_case and not has_diag:
            # Unrelated archive: not an AKTIN export.
            continue
        if has_case != has_diag:
            raise FileNotFoundError(
                f"Zip {zip_path.name!r} has only one of case_data.txt / diag_data.txt"
            )
        sites.append(
            _Site(
                name=zip_path.stem,
                case_source=(zip_path, "case_data.txt"),
                diag_source=(zip_path, "diag_data.txt"),
            )
        )

    return sites
91
+
92
+
93
def _namespace_encounter_num(frame: pl.DataFrame, site_name: str) -> pl.DataFrame:
    """Rewrite ``a_encounter_num`` as ``"<site_name>:<original id>"``.

    Encounter ids are only unique within a site; the prefix keeps equal ids
    from different sites from merging once the sites are pooled.

    Args:
        frame: Table with an ``a_encounter_num`` column.
        site_name: Site label used as the prefix.

    Returns:
        ``frame`` with ``a_encounter_num`` replaced by the prefixed string.
    """
    site_prefix = pl.lit(f"{site_name}:")
    encounter_id = pl.col("a_encounter_num").cast(pl.Utf8)
    namespaced_id = pl.concat_str([site_prefix, encounter_id])
    return frame.with_columns(a_encounter_num=namespaced_id)
100
+
101
+
102
class AktinAdapter(Adapter):
    """Adapter for the German AKTIN emergency-department dataset.

    Declares a single stratum (``"germany"`` — the country as a whole) and a
    single code type (``"icd10gm"``). Cleaning yields per-encounter diagnoses
    (one row per encounter × confirmed diagnosis); labels are ICD-10-GM labels
    from the vendored BfArM 2025 release.
    """

    name = "aktin"
    # Resolved once, when the class body is executed; the environment variable
    # overrides the built-in default model identifier.
    default_model = os.getenv(
        "VIROLA_AKTIN_MODEL", "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"
    )
    terminology_csv_filename = "icd10gm_aktin.csv"

    def clean(
        self,
        raw_path: Path,
        output_path: Path,
        stratum: str | None = None,
        **options,
    ) -> Path:
        """Pool every AKTIN site under ``raw_path`` into a single canonical interim parquet.

        Each site (unpacked directory or ``<site>_result.zip``) must contribute
        both ``case_data.txt`` and ``diag_data.txt`` — orphan sites raise.
        ``a_encounter_num`` is namespaced per site so id collisions don't merge.

        Args:
            raw_path: Directory searched for AKTIN sites.
            output_path: Destination of the cleaned parquet file.
            stratum: Accepted but not used by this implementation.
            **options: Accepted but not used by this implementation.

        Returns:
            ``output_path``, after the parquet has been written.

        Raises:
            FileNotFoundError: if no site is found under ``raw_path``.
        """
        sites = _discover_sites(raw_path)
        if not sites:
            message = (
                f"No AKTIN sites found under {raw_path}. Expected ``<site>/case_data.txt`` + "
                "``diag_data.txt`` pairs or ``<site>_result.zip`` archives."
            )
            raise FileNotFoundError(message)

        logger.info(f"AKTIN clean: pooling {len(sites)} site(s) under {raw_path}")

        def load_namespaced(source: _FileSource, site_name: str) -> pl.DataFrame:
            # Read one table and prefix its encounter ids with the site name.
            return _namespace_encounter_num(_read_aktin_tsv(source), site_name)

        # All case tables are read before any diagnosis table.
        case_frames = [load_namespaced(site.case_source, site.name) for site in sites]
        diag_frames = [load_namespaced(site.diag_source, site.name) for site in sites]

        # "diagonal_relaxed" tolerates sites whose exports differ in columns.
        pooled_cases = pl.concat(case_frames, how="diagonal_relaxed")
        pooled_diagnoses = pl.concat(diag_frames, how="diagonal_relaxed")

        clean_aktin_raw(pooled_cases, pooled_diagnoses).write_parquet(output_path)
        return output_path

    def build_terminology_labels(self) -> pl.DataFrame:
        """Return the ICD-10-GM label table built by ``build_icd10gm_labels``."""
        return build_icd10gm_labels()

    def stratum_metadata(self, stratum: str) -> dict:
        """Describe ``stratum`` if it mentions Germany.

        Any value containing ``"germany"`` (case-insensitive substring) gets
        ``city="germany"``; everything else gets ``city=None``. ``region`` and
        ``population_range`` are always ``None``.
        """
        city = "germany" if "germany" in stratum.lower() else None
        return {"city": city, "region": None, "population_range": None}

    @property
    def strata(self) -> list[str]:
        """The only stratum of this dataset: ``"germany"``."""
        return ["germany"]

    @property
    def skip_patterns(self) -> list[tuple[str, str]]:
        """``("icd10gm", r"^Z")`` as a one-element list.

        NOTE(review): presumably ``(code_type, regex)`` pairs of codes to
        exclude — confirm against the ``Adapter`` base class.
        """
        return [("icd10gm", r"^Z")]

    @property
    def code_types(self) -> list[str]:
        """The only code type of this dataset: ``"icd10gm"``."""
        return ["icd10gm"]

    def cleaned_filename(self, stratum: str | None) -> str:
        """Build ``cleaned_aktin_<slug>.parquet`` for ``stratum``.

        The slug is the stratum lower-cased with spaces replaced by
        underscores; a missing or empty stratum becomes ``"all"``.
        """
        label = stratum or "all"
        slug = label.lower().replace(" ", "_")
        return f"cleaned_aktin_{slug}.parquet"

    @property
    def viz_palettes(self) -> dict[str, dict[str, str]]:
        """Colour palettes for plots, keyed by the column they apply to."""
        palettes = {
            "sex": SEX_COLOR,
            "age_group": AGE_GROUP_COLOR,
            "code_type": CODE_TYPE_COLOR,
        }
        return palettes
File without changes
@@ -0,0 +1,87 @@
1
+ """AKTIN raw → canonical interim transformation.
2
+
3
+ Takes the per-encounter ``case_data`` and per-diagnosis ``diag_data`` tables
4
+ exported from AKTIN and produces one row per (encounter × confirmed diagnosis)
5
+ in the canonical interim shape that ``prepare_dataframe`` consumes.
6
+ """
7
+
8
+ import polars as pl
9
+
10
# Maps the native 5-year ``age_group`` labels of an AKTIN export onto the
# coarser buckets used downstream. Labels missing from this map are treated
# as invalid by ``clean_aktin_raw`` and the row is dropped.
AKTIN_AGE_BUCKETS: dict[str, str] = {
    # Children and adolescents keep their 5-year resolution; the two youngest
    # groups are also accepted in zero-padded form.
    "0-4": "0-4",
    "00-04": "0-4",
    "5-9": "5-9",
    "05-09": "5-9",
    "10-14": "10-14",
    "15-19": "15-19",
    # Adults are merged into 20-year buckets.
    "20-24": "20-39",
    "25-29": "20-39",
    "30-34": "20-39",
    "35-39": "20-39",
    "40-44": "40-59",
    "45-49": "40-59",
    "50-54": "40-59",
    "55-59": "40-59",
    "60-64": "60-79",
    "65-69": "60-79",
    "70-74": "60-79",
    "75-79": "60-79",
    # Open-ended top bucket.
    "80+": "80+",
}

# Case columns that must be present and non-empty for a case row to be kept.
_REQUIRED_CASE_FIELDS: tuple[str, ...] = ("geschlecht", "aufnahme_ts", "age_group")
33
+
34
+
35
def _null_when_empty(column: str) -> pl.Expr:
    """Build an expression that turns empty strings in ``column`` into nulls.

    Only the exact empty string ``""`` is replaced; all other values pass
    through. The result keeps the column's name.
    """
    value = pl.col(column)
    is_empty = value.eq("")
    return pl.when(is_empty).then(None).otherwise(value).alias(column)
37
+
38
+
39
def clean_aktin_raw(cases: pl.DataFrame, diagnoses: pl.DataFrame) -> pl.DataFrame:
    """Turn AKTIN ``case_data`` + ``diag_data`` into the canonical interim table.

    The result has one row per (encounter × confirmed diagnosis). Processing:

    1. Case rows whose ``geschlecht``, ``aufnahme_ts`` or ``age_group`` is
       empty or null are dropped.
    2. ``aufnahme_ts`` is parsed accepting both ``Z`` and ``+00:00`` offsets;
       a timestamp that cannot be parsed drops the row rather than riding
       through as a null year/week.
    3. The native 5-year ``age_group`` is collapsed via ``AKTIN_AGE_BUCKETS``;
       any value outside that map drops the row.
    4. ISO ``year`` and ISO ``week`` are derived from the parsed timestamp.
    5. ``plz_kurz`` is dropped and ``geschlecht`` is renamed to ``sex``.
    6. Only diagnoses with ``diagnose_zusatz == 'G'`` (confirmed) are kept,
       reduced to ``code`` (from ``icd_code``) and ``a_encounter_num``.
    7. Cases and confirmed diagnoses are inner-joined on ``a_encounter_num``.
    8. ``city = "germany"``, ``code_type = "icd10gm"`` and ``quantity = 1``
       are added.

    ``a_encounter_num`` and the remaining case_data extras (``triage``,
    vitals, ``verbleib``, ...) are retained for later statistics — they are
    dropped later by ``prepare_dataframe`` when projecting onto the canonical
    row.

    Args:
        cases: Pooled ``case_data`` table.
        diagnoses: Pooled ``diag_data`` table.

    Returns:
        The joined, annotated interim table.
    """
    required_fields = list(_REQUIRED_CASE_FIELDS)

    # Empty strings count as missing for the required case fields.
    complete_cases = cases.with_columns(
        [_null_when_empty(field) for field in required_fields]
    ).drop_nulls(subset=required_fields)

    # Normalise a trailing "Z" to an explicit offset so a single format string
    # covers both spellings; strict=False yields null instead of raising.
    admission = (
        pl.col("aufnahme_ts")
        .str.replace(r"Z$", "+00:00")
        .str.to_datetime("%Y-%m-%dT%H:%M:%S%z", strict=False)
    )
    # Labels outside the bucket map become null and are dropped just below.
    bucketed_age = pl.col("age_group").replace_strict(AKTIN_AGE_BUCKETS, default=None)
    valid_cases = complete_cases.with_columns(
        _admission=admission,
        age_group=bucketed_age,
    ).drop_nulls(subset=["_admission", "age_group"])

    dated_cases = valid_cases.with_columns(
        year=pl.col("_admission").dt.iso_year(),
        week=pl.col("_admission").dt.week(),
    )
    cleaned_cases = dated_cases.drop(["_admission", "plz_kurz"]).rename({"geschlecht": "sex"})

    is_confirmed = pl.col("diagnose_zusatz").eq("G")
    confirmed_diagnoses = diagnoses.filter(is_confirmed).select(
        code=pl.col("icd_code"),
        a_encounter_num=pl.col("a_encounter_num"),
    )

    encounter_diagnoses = cleaned_cases.join(
        confirmed_diagnoses, on="a_encounter_num", how="inner"
    )
    return encounter_diagnoses.with_columns(
        city=pl.lit("germany"),
        code_type=pl.lit("icd10gm"),
        quantity=pl.lit(1),
    )
File without changes
@@ -0,0 +1,272 @@
1
+ import marimo
2
+
3
+ __generated_with = "0.23.4"
4
+ app = marimo.App(width="medium")
5
+
6
+
7
+ @app.cell
8
+ def _():
9
+ import marimo as mo
10
+ import plotly.express as px
11
+ import polars as pl
12
+
13
+ from vir.config import INTERIM_DATA_DIR
14
+
15
+ return INTERIM_DATA_DIR, mo, pl, px
16
+
17
+
18
+ @app.cell
19
+ def _(mo):
20
+ mo.md(r"""
21
+ # AKTIN — Exploratory data analysis
22
+
23
+ Basic shape, demographic distribution, code coverage, and temporal distribution
24
+ of the cleaned AKTIN interim parquet (one row per encounter × confirmed
25
+ diagnosis).
26
+ """)
27
+ return
28
+
29
+
30
@app.cell
def _(INTERIM_DATA_DIR, mo):
    # Offer every cleaned AKTIN parquet found in the interim data directory.
    available_files = sorted(INTERIM_DATA_DIR.glob("cleaned_aktin_*.parquet"))

    # The dropdown shows the file name and yields the full path as its value;
    # the first file is preselected when there is one.
    _path_by_name = {path.name: str(path) for path in available_files}
    _preselected = available_files[0].name if available_files else None

    file_dropdown = mo.ui.dropdown(
        label="Cleaned AKTIN parquet",
        options=_path_by_name,
        value=_preselected,
    )
    file_dropdown
    return (file_dropdown,)
40
+
41
+
42
@app.cell
def _(file_dropdown, mo, pl):
    # Halt this cell with a warning callout when the dropdown has no selection
    # (i.e. no parquet was found); otherwise load the chosen file.
    _missing_files_notice = mo.callout(
        mo.md("No `cleaned_aktin_*.parquet` files found under `INTERIM_DATA_DIR`."),
        kind="warn",
    )
    mo.stop(not file_dropdown.value, _missing_files_notice)

    df = pl.read_parquet(file_dropdown.value)
    return (df,)
53
+
54
+
55
+ @app.cell
56
+ def _(mo):
57
+ mo.md(r"""
58
+ ## Shape
59
+ """)
60
+ return
61
+
62
+
63
@app.cell
def _(df, mo):
    # Headline counts: one row is one (encounter, diagnosis) pair.
    n_rows = df.height
    n_encounters = df["a_encounter_num"].n_unique()
    n_codes = df["code"].n_unique()

    # Observed ranges of the ISO calendar columns.
    _years = df["year"]
    _weeks = df["week"]
    year_min, year_max = _years.min(), _years.max()
    week_min, week_max = _weeks.min(), _weeks.max()

    # Avoid dividing by zero on an empty table.
    if n_encounters:
        diagnoses_per_encounter = n_rows / n_encounters
    else:
        diagnoses_per_encounter = 0

    mo.md(f"""
    | metric | value |
    |---|---|
    | rows (encounter × diagnosis) | {n_rows:,} |
    | distinct encounters | {n_encounters:,} |
    | distinct ICD-10-GM codes | {n_codes:,} |
    | mean diagnoses per encounter | {diagnoses_per_encounter:.2f} |
    | ISO year range | {year_min} – {year_max} |
    | ISO week range | {week_min} – {week_max} |
    """)
    return
85
+
86
+
87
+ @app.cell
88
+ def _(mo):
89
+ mo.md(r"""
90
+ ## Demographics
91
+ """)
92
+ return
93
+
94
+
95
@app.cell
def _():
    # Display order of the age buckets, youngest first; shared by the age
    # charts so their x-axes are ordered by age rather than alphabetically.
    age_order = [
        "0-4",
        "5-9",
        "10-14",
        "15-19",
        "20-39",
        "40-59",
        "60-79",
        "80+",
    ]
    return (age_order,)
99
+
100
+
101
@app.cell
def _(df, mo, px):
    # Count encounters, not diagnoses: keep one row per encounter first.
    _encounters = df.unique(subset=["a_encounter_num"])
    sex_per_encounter = _encounters.group_by("sex").len().sort("sex")

    sex_chart = px.bar(
        sex_per_encounter,
        x="sex",
        y="len",
        labels={"len": "encounters"},
        title="Encounters by sex",
    )
    mo.ui.plotly(sex_chart)
    return
113
+
114
+
115
@app.cell
def _(age_order, df, mo, pl, px):
    # An Enum-typed copy of age_group sorts by bucket order instead of by
    # string order.
    _ordered_age_group = (
        pl.col("age_group").cast(pl.Enum(age_order)).alias("age_group_ordered")
    )

    # Count encounters, not diagnoses: keep one row per encounter first.
    _encounters = df.unique(subset=["a_encounter_num"])
    _counts = _encounters.group_by("age_group").len()
    age_per_encounter = _counts.with_columns(_ordered_age_group).sort("age_group_ordered")

    age_chart = px.bar(
        age_per_encounter,
        x="age_group",
        y="len",
        labels={"len": "encounters"},
        category_orders={"age_group": age_order},
        title="Encounters by age group",
    )
    mo.ui.plotly(age_chart)
    return
134
+
135
+
136
@app.cell
def _(age_order, df, mo, px):
    # Count encounters, not diagnoses: keep one row per encounter first.
    _encounters = df.unique(subset=["a_encounter_num"])
    sex_age = _encounters.group_by(["age_group", "sex"]).len()

    # Side-by-side bars per sex within each age bucket.
    sex_age_chart = px.bar(
        sex_age,
        x="age_group",
        y="len",
        color="sex",
        barmode="group",
        labels={"len": "encounters"},
        category_orders={"age_group": age_order},
        title="Encounters by age group × sex",
    )
    mo.ui.plotly(sex_age_chart)
    return
151
+
152
+
153
+ @app.cell
154
+ def _(mo):
155
+ mo.md(r"""
156
+ ## ICD-10-GM coverage
157
+ """)
158
+ return
159
+
160
+
161
@app.cell
def _(df, mo, pl, px):
    # The first character of a code is used as its chapter letter; counts here
    # are diagnoses (rows), not encounters.
    _chapter_letter = pl.col("code").str.slice(0, 1).alias("chapter_letter")
    _with_chapter = df.with_columns(_chapter_letter)
    chapter_distribution = _with_chapter.group_by("chapter_letter").len().sort("chapter_letter")

    chapter_chart = px.bar(
        chapter_distribution,
        x="chapter_letter",
        y="len",
        labels={"len": "diagnoses", "chapter_letter": "chapter (A–Z, U)"},
        title="Diagnoses by ICD-10-GM chapter letter",
    )
    mo.ui.plotly(chapter_chart)
    return
178
+
179
+
180
@app.cell
def _(df, mo, px):
    # Rank codes by number of diagnosis rows and keep the 20 most frequent.
    _code_counts = df.group_by("code").len()
    top_codes = _code_counts.sort("len", descending=True).head(20)

    top_codes_chart = px.bar(
        top_codes,
        x="code",
        y="len",
        labels={"len": "diagnoses"},
        title="Top-20 ICD-10-GM codes by diagnosis count",
    )
    mo.ui.plotly(top_codes_chart)
    return (top_codes,)
192
+
193
+
194
@app.cell
def _(mo, top_codes):
    # Tabular view of the same top-20 ranking shown in the bar chart.
    _top_codes_table = mo.ui.table(top_codes, label="Top-20 codes")
    _top_codes_table
    return
198
+
199
+
200
+ @app.cell
201
+ def _(mo):
202
+ mo.md(r"""
203
+ ## Temporal distribution
204
+ """)
205
+ return
206
+
207
+
208
@app.cell
def _(df, pl):
    # The month-level chart needs the raw admission timestamp, which is only
    # available when the interim parquet still carries ``aufnahme_ts``.
    has_aufnahme_ts = "aufnahme_ts" in df.columns

    # Parse exactly like the cleaning step does: exports spell the UTC offset
    # either as a trailing "Z" or as "+00:00". Normalising "Z" first lets one
    # format string cover both; a strict parse against a literal "Z" would
    # raise on every "+00:00" row. strict=False turns anything unparseable
    # into null instead of aborting the notebook.
    timestamps_df = (
        df.with_columns(
            pl.col("aufnahme_ts")
            .str.replace(r"Z$", "+00:00")
            .str.to_datetime("%Y-%m-%dT%H:%M:%S%z", strict=False)
            .alias("_admission")
        )
        if has_aufnahme_ts
        else df
    )
    return has_aufnahme_ts, timestamps_df
219
+
220
+
221
@app.cell
def _(has_aufnahme_ts, mo, pl, px, timestamps_df):
    # Without the raw timestamp column there is nothing to bucket by month:
    # halt this cell with a warning instead.
    _unavailable_notice = mo.callout(
        mo.md(
            "``aufnahme_ts`` was dropped from the interim parquet — month-level chart unavailable."
        ),
        kind="warn",
    )
    mo.stop(not has_aufnahme_ts, _unavailable_notice)

    # Count encounters, not diagnoses: keep one row per encounter first, then
    # bucket by calendar month of admission.
    _year_month = pl.col("_admission").dt.strftime("%Y-%m").alias("year_month")
    _encounters = timestamps_df.unique(subset=["a_encounter_num"])
    monthly = _encounters.with_columns(_year_month).group_by("year_month").len().sort("year_month")

    monthly_chart = px.bar(
        monthly,
        x="year_month",
        y="len",
        labels={"len": "encounters", "year_month": "year-month"},
        title="Encounters per month",
    )
    mo.ui.plotly(monthly_chart)
    return
248
+
249
+
250
@app.cell
def _(df, mo, px):
    # Count encounters, not diagnoses: keep one row per encounter first, then
    # bucket by the (year, week) columns written by the cleaning step.
    _time_keys = ["year", "week"]
    _encounters = df.unique(subset=["a_encounter_num"])
    weekly = _encounters.group_by(_time_keys).len().sort(_time_keys)

    weekly_chart = px.scatter(
        weekly,
        x="week",
        y="len",
        color="year",
        labels={"len": "encounters", "week": "ISO week"},
        title="Encounters per ISO week, by year",
    )
    # One x-axis tick every four weeks.
    weekly_chart.update_layout(xaxis={"tickmode": "linear", "dtick": 4})
    mo.ui.plotly(weekly_chart)
    return
269
+
270
+
271
+ if __name__ == "__main__":
272
+ app.run()
@@ -0,0 +1,18 @@
1
+ # AKTIN adapter references
2
+
3
+ ## icd10gm2025syst_kodes.txt
4
+
5
+ Source: BfArM (Bundesinstitut für Arzneimittel und Medizinprodukte), ICD-10-GM
6
+ Version 2025 — Systematisches Verzeichnis (classification file
7
+ `icd10gm2025syst-meta.zip` → `Klassifikationsdateien/icd10gm2025syst_kodes.txt`).
8
+
9
+ Classification as of ("Stand der Klassifikation"): 2024-09-13. OID `1.2.276.0.76.5.548`.
10
+
11
+ Format: semicolon-separated, no header, one row per code. ~16,800 rows (4 MB).
12
+
13
+ Redistribution / attribution: subject to BfArM download conditions
14
+ (`downloadbedingungen-2024.pdf`). License terms must be confirmed before
15
+ making the public repo containing this file generally available — tracked
16
+ in `AKTIN-PLAN.md § Stakeholder check-ins`.
17
+
18
+ Parsing of this file lives in `vir/adapters/aktin/terminology.py`.