virola 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virola-0.0.1/LICENSE +9 -0
- virola-0.0.1/PKG-INFO +405 -0
- virola-0.0.1/README.md +350 -0
- virola-0.0.1/pyproject.toml +83 -0
- virola-0.0.1/vir/__init__.py +1 -0
- virola-0.0.1/vir/adapters/__init__.py +46 -0
- virola-0.0.1/vir/adapters/aktin/__init__.py +184 -0
- virola-0.0.1/vir/adapters/aktin/bin/.gitkeep +0 -0
- virola-0.0.1/vir/adapters/aktin/cleaning.py +87 -0
- virola-0.0.1/vir/adapters/aktin/notebooks/.gitkeep +0 -0
- virola-0.0.1/vir/adapters/aktin/notebooks/eda_aktin.py +272 -0
- virola-0.0.1/vir/adapters/aktin/references/README.md +18 -0
- virola-0.0.1/vir/adapters/aktin/references/icd10gm2025syst_kodes.txt +16817 -0
- virola-0.0.1/vir/adapters/aktin/terminology.py +44 -0
- virola-0.0.1/vir/adapters/aktin/viz.py +24 -0
- virola-0.0.1/vir/adapters/base.py +80 -0
- virola-0.0.1/vir/adapters/datasus/__init__.py +91 -0
- virola-0.0.1/vir/adapters/datasus/bin/cities.txt +10 -0
- virola-0.0.1/vir/adapters/datasus/bin/enrich_latest_run.sh +145 -0
- virola-0.0.1/vir/adapters/datasus/bin/run_pipeline.sh +70 -0
- virola-0.0.1/vir/adapters/datasus/bin/slurm_ablation.sh +67 -0
- virola-0.0.1/vir/adapters/datasus/bin/slurm_pipeline.sh +71 -0
- virola-0.0.1/vir/adapters/datasus/cleaning.py +65 -0
- virola-0.0.1/vir/adapters/datasus/features.py +144 -0
- virola-0.0.1/vir/adapters/datasus/helpers.py +55 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/data_cleaning.py +350 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/eda_raw_data.py +648 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/non_indicative_codes.py +149 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/sc_clinical.py +608 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/sc_demographic.py +437 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/sc_temporal.py +862 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/temporal_distance_comparison.py +618 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/validation/cross_city.py +1651 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/validation/validation_ablation.py +826 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/validation/validation_reference_syndromes.py +1736 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/validation/validation_temporal_c.py +358 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/view_demographic.py +948 -0
- virola-0.0.1/vir/adapters/datasus/notebooks/viz_snf_clusters.py +588 -0
- virola-0.0.1/vir/adapters/datasus/references/RepositorioTerminologia_202506/tb_cid.csv +14240 -0
- virola-0.0.1/vir/adapters/datasus/references/abp_ciap2.csv +24 -0
- virola-0.0.1/vir/adapters/datasus/references/aesop/code_list_arbovirus_apr2024.csv +17 -0
- virola-0.0.1/vir/adapters/datasus/references/aesop/code_list_uri_apr2024.csv +51 -0
- virola-0.0.1/vir/adapters/datasus/references/ciap-2-wicc.csv +92 -0
- virola-0.0.1/vir/adapters/datasus/references/ciap2-cid10.csv +687 -0
- virola-0.0.1/vir/adapters/datasus/references/indicative_codes_yes_no_75perc.csv +112 -0
- virola-0.0.1/vir/adapters/datasus/references/indicative_codes_yes_no__2022_2025__90perc.csv +356 -0
- virola-0.0.1/vir/adapters/datasus/references/osi_template.json +25 -0
- virola-0.0.1/vir/adapters/datasus/references/terminology_mapping.csv +24 -0
- virola-0.0.1/vir/adapters/datasus/terminologies/__init__.py +0 -0
- virola-0.0.1/vir/adapters/datasus/terminologies/terminology_mapping.py +70 -0
- virola-0.0.1/vir/adapters/datasus/terminology.py +100 -0
- virola-0.0.1/vir/adapters/datasus/viz.py +29 -0
- virola-0.0.1/vir/cleaning/__init__.py +0 -0
- virola-0.0.1/vir/cleaning/data_preparation.py +180 -0
- virola-0.0.1/vir/cleaning/indicative_template.py +43 -0
- virola-0.0.1/vir/cleaning/prepare.py +89 -0
- virola-0.0.1/vir/cli.py +910 -0
- virola-0.0.1/vir/clusters.py +355 -0
- virola-0.0.1/vir/config.py +66 -0
- virola-0.0.1/vir/data_filters.py +119 -0
- virola-0.0.1/vir/db.py +1377 -0
- virola-0.0.1/vir/helpers.py +88 -0
- virola-0.0.1/vir/metrics.py +82 -0
- virola-0.0.1/vir/notebook_widgets.py +179 -0
- virola-0.0.1/vir/osi.py +98 -0
- virola-0.0.1/vir/terminologies/__init__.py +0 -0
- virola-0.0.1/vir/terminologies/embeddings_model.py +204 -0
- virola-0.0.1/vir/views/__init__.py +20 -0
- virola-0.0.1/vir/views/base.py +314 -0
- virola-0.0.1/vir/views/clinical.py +163 -0
- virola-0.0.1/vir/views/demographic.py +181 -0
- virola-0.0.1/vir/views/temporal.py +221 -0
- virola-0.0.1/vir/viz.py +58 -0
virola-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
|
|
2
|
+
The MIT License (MIT)
|
|
3
|
+
Copyright (c) 2025, Ana Paula Gomes Ferreira
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
virola-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: virola
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A framework for data-driven syndrome discovery from aggregated health records.
|
|
5
|
+
Author: Ana Paula Gomes Ferreira
|
|
6
|
+
Requires-Python: ~=3.12.0
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: aeon>=1.3.0
|
|
12
|
+
Requires-Dist: anywidget>=0.9.21
|
|
13
|
+
Requires-Dist: datasets>=4.4.2
|
|
14
|
+
Requires-Dist: diskcache>=5.6.3
|
|
15
|
+
Requires-Dist: duckdb>=1.3.2
|
|
16
|
+
Requires-Dist: einops>=0.8.1
|
|
17
|
+
Requires-Dist: embedding-atlas>=0.9.0
|
|
18
|
+
Requires-Dist: h5py>=3.14.0
|
|
19
|
+
Requires-Dist: hdbscan>=0.8.40
|
|
20
|
+
Requires-Dist: igraph>=1.0.0
|
|
21
|
+
Requires-Dist: ipython>=9.5.0
|
|
22
|
+
Requires-Dist: kaleido>=1.2.0
|
|
23
|
+
Requires-Dist: leidenalg>=0.11.0
|
|
24
|
+
Requires-Dist: llm>=0.27.1
|
|
25
|
+
Requires-Dist: loguru
|
|
26
|
+
Requires-Dist: marimo>=0.23.9
|
|
27
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
28
|
+
Requires-Dist: nbconvert>=7.17.1
|
|
29
|
+
Requires-Dist: nbformat>=5.10.4
|
|
30
|
+
Requires-Dist: numpy==2.2
|
|
31
|
+
Requires-Dist: openai>=1.101.0
|
|
32
|
+
Requires-Dist: pip
|
|
33
|
+
Requires-Dist: playwright>=1.59.0
|
|
34
|
+
Requires-Dist: plotly[express]>=6.2.0
|
|
35
|
+
Requires-Dist: polars>=1.32.3
|
|
36
|
+
Requires-Dist: protobuf>=5.0.0
|
|
37
|
+
Requires-Dist: polars-u64-idx>=1.32.3
|
|
38
|
+
Requires-Dist: pyarrow>=23.0.1
|
|
39
|
+
Requires-Dist: pytest
|
|
40
|
+
Requires-Dist: python-dotenv
|
|
41
|
+
Requires-Dist: ruff
|
|
42
|
+
Requires-Dist: scikit-learn>=1.7.2
|
|
43
|
+
Requires-Dist: scipy>=1.15.3
|
|
44
|
+
Requires-Dist: sentence-transformers>=5.1.0
|
|
45
|
+
Requires-Dist: sentencepiece>=0.2.0
|
|
46
|
+
Requires-Dist: setfit>=1.1.3
|
|
47
|
+
Requires-Dist: snfpy>=0.2.2
|
|
48
|
+
Requires-Dist: sqlglot>=27.10.0
|
|
49
|
+
Requires-Dist: torch>=2.8.0
|
|
50
|
+
Requires-Dist: tqdm
|
|
51
|
+
Requires-Dist: transformers>=5.3.0
|
|
52
|
+
Requires-Dist: typer
|
|
53
|
+
Requires-Dist: umap-learn>=0.5.9.post2
|
|
54
|
+
|
|
55
|
+
# virola
|
|
56
|
+
|
|
57
|
+
Find public health threats to be monitored from your data.
|
|
58
|
+
|
|
59
|
+
`virola` is dataset-agnostic: a framework plus pluggable **adapters**. Two adapters ship today:
|
|
60
|
+
|
|
61
|
+
- **`datasus`** — Brazilian AESOP/DATASUS data (CID-10 + CIAP-2 + AB terminologies, 10 IBGE cities).
|
|
62
|
+
- **`aktin`** — German AKTIN emergency-department data (ICD-10-GM, country-level stratum).
|
|
63
|
+
|
|
64
|
+
Each adapter declares its own raw-format reader, terminology source, default embedding model,
|
|
65
|
+
skip patterns, and stratum metadata. Everything downstream of the canonical row
|
|
66
|
+
`(year, week, sex, age_group, code_type, code, quantity, terminology_id, terminology_label, [city])`
|
|
67
|
+
is shared.
|
|
68
|
+
|
|
69
|
+
## Licensing
|
|
70
|
+
|
|
71
|
+
The MIT license (see `LICENSE`) covers the **source code of this project only**.
|
|
72
|
+
|
|
73
|
+
The reference and auxiliary files bundled with each adapter — terminology tables,
|
|
74
|
+
classification files, code lists, and any other third-party data (e.g. CID-10, CIAP-2,
|
|
75
|
+
AB, ICD-10-GM, AESOP code lists) — are **not** covered by this license. They remain
|
|
76
|
+
subject to the terms and licenses of their respective data sources, and their use and
|
|
77
|
+
redistribution are the responsibility of those sources. Refer to each source for the
|
|
78
|
+
applicable terms before reusing these files.
|
|
79
|
+
|
|
80
|
+
## Setup (shared)
|
|
81
|
+
|
|
82
|
+
Install [uv](https://docs.astral.sh/uv/getting-started/installation/) (the project pins its Python
|
|
83
|
+
version in `.python-version`), then from the project root:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
uv sync
|
|
87
|
+
uv run plotly_get_chrome # required by kaleido for plotly figure export
|
|
88
|
+
cp .env.example .env # set DATA_DIR / RAW_DATA_DIR / VIROLA_ADAPTER as needed
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
To place the virtualenv outside the project: `UV_PROJECT_ENVIRONMENT=/path/to/.venv uv sync`.
|
|
92
|
+
|
|
93
|
+
### Environment variables
|
|
94
|
+
|
|
95
|
+
The project reads `.env` at the project root if present. Recognised variables:
|
|
96
|
+
|
|
97
|
+
- `DATA_DIR` — overrides the default `./data` location (raw, interim, processed, runs, `virola.db`).
|
|
98
|
+
- `RAW_DATA_DIR` — overrides `data/raw/` only. Point it at your adapter's raw export.
|
|
99
|
+
- `VIROLA_ADAPTER` — default adapter for commands that take `--adapter`. CLI flag wins.
|
|
100
|
+
- `LLM_MODEL` — Ollama model used by `vir results llm-descriptions` (default `llama3.2:1b`).
|
|
101
|
+
- `HF_HOME` — cache directory for the SapBERT/XLMR downloads.
|
|
102
|
+
- `HF_FROM_LIMITED_ENV` — set to a non-empty value to force a manual HF snapshot download
|
|
103
|
+
(SLURM / air-gapped). Empty value = regular environment.
|
|
104
|
+
|
|
105
|
+
### CLI shape
|
|
106
|
+
|
|
107
|
+
```text
|
|
108
|
+
# Adapter-level commands — require --adapter (or VIROLA_ADAPTER env var)
|
|
109
|
+
vir setup terminologies --adapter <name>
|
|
110
|
+
vir setup embeddings --adapter <name> [--model MODEL]
|
|
111
|
+
|
|
112
|
+
# Stratum-level commands — adapter is derived from the stratum registry
|
|
113
|
+
vir prepare clean --stratum <name> [--input PATH] [--output DIR]
|
|
114
|
+
vir pipeline run <stratum> [--year-min ...] [--year-max ...] ...
|
|
115
|
+
vir validate ablation --stratum <name>
|
|
116
|
+
vir results delete --stratum <name>
|
|
117
|
+
|
|
118
|
+
# Canonical-row commands — adapter via env or explicit flag
|
|
119
|
+
vir prepare process --dataset PATH --adapter <name>
|
|
120
|
+
vir prepare indicative-template --dataset PATH --adapter <name>
|
|
121
|
+
vir view clinical --dataset NAME --adapter <name>
|
|
122
|
+
vir view demographic --dataset NAME
|
|
123
|
+
vir view temporal --dataset NAME
|
|
124
|
+
vir model snf --dataset NAME --adapter <name>
|
|
125
|
+
vir results build-profiles RUN_ID --dataset NAME
|
|
126
|
+
vir results explain RUN_ID --dataset NAME --adapter <name>
|
|
127
|
+
vir results list
|
|
128
|
+
vir results llm-descriptions RUN_ID
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
`--city` is accepted as a deprecated alias for `--stratum` (emits a warning).
|
|
132
|
+
|
|
133
|
+
Use `uv run vir COMMAND --help` for detailed flag descriptions.
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## DATASUS (Brazil)
|
|
138
|
+
|
|
139
|
+
### Raw data
|
|
140
|
+
|
|
141
|
+
The pipeline expects an AESOP extraction at `data/raw/base_aesop_raw_extracao_01042026.parquet/`
|
|
142
|
+
(multi-file parquet). Schema, period, and the IBGE municipality filter are documented in
|
|
143
|
+
`data/raw/readme-base_aesop_raw_extracao_01042026.md`. Override the location with `RAW_DATA_DIR`
|
|
144
|
+
in `.env`.
|
|
145
|
+
|
|
146
|
+
The 10 cities targeted by the pipeline are listed in `vir/adapters/datasus/bin/cities.txt`;
|
|
147
|
+
the same names are accepted by `vir pipeline run <stratum>`.
|
|
148
|
+
|
|
149
|
+
### One-time setup
|
|
150
|
+
|
|
151
|
+
Reference files (`vir/adapters/datasus/references/RepositorioTerminologia_202506/tb_cid.csv`,
|
|
152
|
+
`ciap2-cid10.csv`, `abp_ciap2.csv`) are committed. Build the joined terminology table and the
|
|
153
|
+
SapBERT embeddings database once:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
uv run vir setup terminologies --adapter datasus
|
|
157
|
+
uv run vir setup embeddings --adapter datasus
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
`setup embeddings` downloads `cambridgeltl/SapBERT-from-PubMedBERT-fulltext` to `HF_HOME`.
|
|
161
|
+
|
|
162
|
+
### Run all cities
|
|
163
|
+
|
|
164
|
+
`vir/adapters/datasus/bin/cities.txt` drives both helpers:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
vir/adapters/datasus/bin/run_pipeline.sh all # clean → process → views → SNF per city
|
|
168
|
+
vir/adapters/datasus/bin/enrich_latest_run.sh all # build-profiles + explain + llm-descriptions
|
|
169
|
+
uv run vir results list
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Extra `vir pipeline run` flags pass through, e.g.
|
|
173
|
+
`vir/adapters/datasus/bin/run_pipeline.sh all --skip-clean --n-clusters 150`.
|
|
174
|
+
|
|
175
|
+
### Run a single city
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
uv run vir pipeline run "Belo Horizonte" --year-min 2022 --year-max 2024 \
|
|
179
|
+
--snf-k 10 --n-clusters 200 --notes "run for paper"
|
|
180
|
+
|
|
181
|
+
uv run vir results list
|
|
182
|
+
vir/adapters/datasus/bin/enrich_latest_run.sh
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
`vir pipeline run` chains `clean → process → views (clinical, demographic, temporal) → model snf`
|
|
186
|
+
and resolves the adapter (`datasus`) from the stratum name via the registry.
|
|
187
|
+
|
|
188
|
+
### Step-by-step alternative
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
uv run vir prepare clean --stratum "Belo Horizonte"
|
|
192
|
+
uv run vir prepare process --dataset data/interim/cleaned_*belo_horizonte*.parquet \
|
|
193
|
+
--adapter datasus
|
|
194
|
+
|
|
195
|
+
uv run vir view clinical --dataset belo_horizonte --adapter datasus \
|
|
196
|
+
--year-min 2022 --year-max 2024
|
|
197
|
+
uv run vir view demographic --dataset belo_horizonte --year-min 2022 --year-max 2024
|
|
198
|
+
uv run vir view temporal --dataset belo_horizonte --year-min 2022 --year-max 2024 --min-weeks 12
|
|
199
|
+
|
|
200
|
+
uv run vir model snf --dataset belo_horizonte --adapter datasus \
|
|
201
|
+
--year-min 2022 --year-max 2024 --snf-k 10 --n-clusters 200 --notes "run for paper"
|
|
202
|
+
|
|
203
|
+
uv run vir results build-profiles 1 --dataset belo_horizonte
|
|
204
|
+
uv run vir results explain 1 --dataset belo_horizonte --adapter datasus
|
|
205
|
+
uv run vir results llm-descriptions 1 # requires Ollama (see Optional below)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### SLURM
|
|
209
|
+
|
|
210
|
+
Two SLURM scripts mirror the helpers above; edit the `--mail-user` line at the top of each before
|
|
211
|
+
submitting and set `DATA_DIR` (in `.env` or via `--export`) when the shared filesystem is not the
|
|
212
|
+
project root.
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
sbatch vir/adapters/datasus/bin/slurm_pipeline.sh # full pipeline, every city
|
|
216
|
+
sbatch --export=ALL,EXTRA_ARGS="--skip-clean" vir/adapters/datasus/bin/slurm_pipeline.sh
|
|
217
|
+
|
|
218
|
+
sbatch vir/adapters/datasus/bin/slurm_ablation.sh # ablation array, one task per city
|
|
219
|
+
sbatch --export=ALL,EXTRA_ARGS="--permutations 5000" vir/adapters/datasus/bin/slurm_ablation.sh
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
The ablation array index range (`#SBATCH --array=0-9` in `slurm_ablation.sh`) must match the
|
|
223
|
+
number of lines in `vir/adapters/datasus/bin/cities.txt`.
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
## AKTIN (Germany)
|
|
228
|
+
|
|
229
|
+
The AKTIN adapter treats Germany as a single stratum (`"germany"`). Per-encounter raw data lives
|
|
230
|
+
in `*_result/case_data.txt` + `diag_data.txt` pairs (tab-separated). The adapter pools every site
|
|
231
|
+
into one cleaned interim parquet.
|
|
232
|
+
|
|
233
|
+
### Raw data
|
|
234
|
+
|
|
235
|
+
Each AKTIN export ships per-hospital folders/zips. Provide either:
|
|
236
|
+
|
|
237
|
+
- a directory of unzipped `<N>_result/` subdirectories under `RAW_DATA_DIR`, **or**
|
|
238
|
+
- a directory containing a single site's `case_data.txt` and `diag_data.txt`.
|
|
239
|
+
|
|
240
|
+
Set `RAW_DATA_DIR` in `.env` (absolute path, or leave blank to use `data/raw/`).
|
|
241
|
+
|
|
242
|
+
### One-time setup
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
uv run vir setup terminologies --adapter aktin # parses BfArM ICD-10-GM 2025 (vendored)
|
|
246
|
+
uv run vir setup embeddings --adapter aktin # downloads SapBERT-UMLS-XLMR (~2.2 GB)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
The BfArM ICD-10-GM 2025 classification file is vendored at
|
|
250
|
+
`vir/adapters/aktin/references/icd10gm2025syst_kodes.txt`. AKTIN's default embedding model is
|
|
251
|
+
`cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR` (multilingual; embeds German labels
|
|
252
|
+
reasonably well).
|
|
253
|
+
|
|
254
|
+
### Run the pipeline
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
uv run vir pipeline run germany --year-min 2018 --year-max 2023 \
|
|
258
|
+
--snf-k 10 --n-clusters 200 --notes "first aktin run"
|
|
259
|
+
|
|
260
|
+
uv run vir results list
|
|
261
|
+
uv run vir results build-profiles 1 --dataset germany
|
|
262
|
+
uv run vir results explain 1 --dataset germany --adapter aktin
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
`vir pipeline run` derives the adapter (`aktin`) from the stratum `germany` via the registry and
|
|
266
|
+
threads it through every step.
|
|
267
|
+
|
|
268
|
+
### Step-by-step alternative
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
uv run vir prepare clean --stratum germany # adapter derived
|
|
272
|
+
uv run vir prepare process --dataset data/interim/cleaned_aktin_germany.parquet \
|
|
273
|
+
--adapter aktin
|
|
274
|
+
|
|
275
|
+
uv run vir view clinical --dataset germany --adapter aktin \
|
|
276
|
+
--year-min 2018 --year-max 2023
|
|
277
|
+
uv run vir view demographic --dataset germany --year-min 2018 --year-max 2023
|
|
278
|
+
uv run vir view temporal --dataset germany --year-min 2018 --year-max 2023 --min-weeks 12
|
|
279
|
+
|
|
280
|
+
uv run vir model snf --dataset germany --adapter aktin \
|
|
281
|
+
--year-min 2018 --year-max 2023 --snf-k 10 --n-clusters 200
|
|
282
|
+
|
|
283
|
+
uv run vir results build-profiles 1 --dataset germany
|
|
284
|
+
uv run vir results explain 1 --dataset germany --adapter aktin
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Indicative-codes template
|
|
288
|
+
|
|
289
|
+
AKTIN ships **without** an epidemiologist-labeled CSV. To generate the top-volume template for
|
|
290
|
+
labeling:
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
uv run vir prepare indicative-template \
|
|
294
|
+
--dataset data/interim/cleaned_aktin_germany.parquet \
|
|
295
|
+
--adapter aktin \
|
|
296
|
+
--coverage 0.9
|
|
297
|
+
# writes data/processed/indicative_codes_template_aktin.csv
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
Until labels are supplied, AKTIN effectively runs with `--no-indicative` (only Z-chapter codes
|
|
301
|
+
are filtered via `adapter.skip_patterns`).
|
|
302
|
+
|
|
303
|
+
### Exploratory data analysis
|
|
304
|
+
|
|
305
|
+
A starter notebook lives at `vir/adapters/aktin/notebooks/eda_aktin.py` (shape, demographics,
|
|
306
|
+
ICD-10-GM coverage, monthly/weekly temporal distribution):
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
uv run marimo edit vir/adapters/aktin/notebooks/eda_aktin.py
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
## Adding a new adapter
|
|
315
|
+
|
|
316
|
+
The contract is the abstract base class `vir/adapters/base.py::Adapter`. Concrete adapters live
|
|
317
|
+
under `vir/adapters/<name>/` and own their cleaning, terminology, references, notebooks, bin
|
|
318
|
+
scripts, and tests. Each adapter declares:
|
|
319
|
+
|
|
320
|
+
- `name`, `default_model`, `terminology_csv_filename` — class attributes.
|
|
321
|
+
- `clean(raw_path, output_path, stratum, **opts)` — read raw → write canonical interim parquet.
|
|
322
|
+
- `build_terminology_labels()` — return `(code, code_type, description)` DataFrame.
|
|
323
|
+
- `stratum_metadata(stratum)` — `{city, region, population_range}`.
|
|
324
|
+
- `cleaned_filename(stratum)` — interim filename convention.
|
|
325
|
+
- `strata`, `skip_patterns`, `code_types` — properties.
|
|
326
|
+
- `indicative_codes_file` — optional `Path`, defaults to `None`.
|
|
327
|
+
|
|
328
|
+
Register the instance in `vir/adapters/__init__.py::ADAPTERS`. The registry detects stratum
|
|
329
|
+
collisions across adapters at import time. See `vir/adapters/datasus/` and `vir/adapters/aktin/`
|
|
330
|
+
for the two reference implementations.
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## Optional
|
|
335
|
+
|
|
336
|
+
### LLM descriptions
|
|
337
|
+
|
|
338
|
+
`vir results llm-descriptions` is optional. To enable it, install
|
|
339
|
+
[Ollama](https://ollama.com/download/), pull the model named in `LLM_MODEL` (default
|
|
340
|
+
`llama3.2:1b`), and add the `llm-ollama` plugin:
|
|
341
|
+
|
|
342
|
+
```bash
|
|
343
|
+
ollama pull llama3.2:1b
|
|
344
|
+
uv run llm install llm-ollama
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
### Development
|
|
348
|
+
|
|
349
|
+
```bash
|
|
350
|
+
make test # or: uv run pytest
|
|
351
|
+
make lint # ruff format --check && ruff check
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
Interactive marimo edit mode for exploration:
|
|
355
|
+
|
|
356
|
+
```bash
|
|
357
|
+
uv run marimo edit notebooks/view_clinical.py # framework
|
|
358
|
+
uv run marimo edit vir/adapters/datasus/notebooks/view_demographic.py # DATASUS
|
|
359
|
+
uv run marimo edit vir/adapters/aktin/notebooks/eda_aktin.py # AKTIN
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
Two DATASUS notebook subfolders are organised by purpose:
|
|
363
|
+
|
|
364
|
+
- `vir/adapters/datasus/notebooks/validation/` — paper-grade validation (ablation, cross-city,
|
|
365
|
+
reference syndromes, temporal C). The ablation notebook is also exposed as `vir validate
|
|
366
|
+
ablation`.
|
|
367
|
+
- `vir/adapters/datasus/notebooks/sanity_checks/` — internal exploratory checks per view
|
|
368
|
+
(`sc_clinical`, `sc_demographic`, `sc_temporal`, feature transformation comparison,
|
|
369
|
+
temporal distance comparison). Not part of the paper's evidence base.
|
|
370
|
+
|
|
371
|
+
## Notes
|
|
372
|
+
|
|
373
|
+
### Terminology (DATASUS)
|
|
374
|
+
|
|
375
|
+
ABP008 is duplicated in the official AB terminology source
|
|
376
|
+
(<https://integracao.esusab.ufsc.br/ledi/documentacao/estrutura_arquivos/dicionario-fai.html#listaciapcondicaoavaliada>),
|
|
377
|
+
mapping to two distinct conditions: Beribéri (ABP008-1) and Desnutrição (ABP008-2).
|
|
378
|
+
The suffixes `-1` / `-2` are added in the reference files (`abp_ciap2.csv`,
|
|
379
|
+
`terminology_mapping.csv`) for disambiguation only — the raw data always carries the base code
|
|
380
|
+
`ABP008` without any suffix.
|
|
381
|
+
|
|
382
|
+
### AESOP validation files (DATASUS)
|
|
383
|
+
|
|
384
|
+
The anchor validation used AESOP syndrome indicators, downloaded from the project's
|
|
385
|
+
[repo](https://github.com/cidacslab/AESOP-Data-Documentation/):
|
|
386
|
+
|
|
387
|
+
- [code_list_arbovirus_apr2024.csv](https://raw.githubusercontent.com/cidacslab/AESOP-Data-Documentation/refs/heads/main/DataPipeline/documentation/code_list_arbovirus_apr2024.csv)
|
|
388
|
+
- [code_list_uri_apr2024.csv](https://raw.githubusercontent.com/cidacslab/AESOP-Data-Documentation/refs/heads/main/DataPipeline/documentation/code_list_uri_apr2024.csv)
|
|
389
|
+
|
|
390
|
+
### Indicative codes (DATASUS)
|
|
391
|
+
|
|
392
|
+
An expert reviewed the codes covering the top 90% of record volume and labeled each as indicative
|
|
393
|
+
(carries an epidemiological signal) or non-indicative. Codes labeled non-indicative are excluded
|
|
394
|
+
from the analysis; codes without a label (outside the top 90%, or reviewed but left unlabeled) are
|
|
395
|
+
retained.
|
|
396
|
+
|
|
397
|
+
To generate (or regenerate) the labeling template for a different time window:
|
|
398
|
+
|
|
399
|
+
```bash
|
|
400
|
+
uv run vir prepare indicative-template \
|
|
401
|
+
--dataset data/interim/cleaned_*_all_cities.parquet \
|
|
402
|
+
--adapter datasus \
|
|
403
|
+
--coverage 0.9
|
|
404
|
+
```
|
|
405
|
+
|