virola 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. virola-0.0.1/LICENSE +9 -0
  2. virola-0.0.1/PKG-INFO +405 -0
  3. virola-0.0.1/README.md +350 -0
  4. virola-0.0.1/pyproject.toml +83 -0
  5. virola-0.0.1/vir/__init__.py +1 -0
  6. virola-0.0.1/vir/adapters/__init__.py +46 -0
  7. virola-0.0.1/vir/adapters/aktin/__init__.py +184 -0
  8. virola-0.0.1/vir/adapters/aktin/bin/.gitkeep +0 -0
  9. virola-0.0.1/vir/adapters/aktin/cleaning.py +87 -0
  10. virola-0.0.1/vir/adapters/aktin/notebooks/.gitkeep +0 -0
  11. virola-0.0.1/vir/adapters/aktin/notebooks/eda_aktin.py +272 -0
  12. virola-0.0.1/vir/adapters/aktin/references/README.md +18 -0
  13. virola-0.0.1/vir/adapters/aktin/references/icd10gm2025syst_kodes.txt +16817 -0
  14. virola-0.0.1/vir/adapters/aktin/terminology.py +44 -0
  15. virola-0.0.1/vir/adapters/aktin/viz.py +24 -0
  16. virola-0.0.1/vir/adapters/base.py +80 -0
  17. virola-0.0.1/vir/adapters/datasus/__init__.py +91 -0
  18. virola-0.0.1/vir/adapters/datasus/bin/cities.txt +10 -0
  19. virola-0.0.1/vir/adapters/datasus/bin/enrich_latest_run.sh +145 -0
  20. virola-0.0.1/vir/adapters/datasus/bin/run_pipeline.sh +70 -0
  21. virola-0.0.1/vir/adapters/datasus/bin/slurm_ablation.sh +67 -0
  22. virola-0.0.1/vir/adapters/datasus/bin/slurm_pipeline.sh +71 -0
  23. virola-0.0.1/vir/adapters/datasus/cleaning.py +65 -0
  24. virola-0.0.1/vir/adapters/datasus/features.py +144 -0
  25. virola-0.0.1/vir/adapters/datasus/helpers.py +55 -0
  26. virola-0.0.1/vir/adapters/datasus/notebooks/data_cleaning.py +350 -0
  27. virola-0.0.1/vir/adapters/datasus/notebooks/eda_raw_data.py +648 -0
  28. virola-0.0.1/vir/adapters/datasus/notebooks/non_indicative_codes.py +149 -0
  29. virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/sc_clinical.py +608 -0
  30. virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/sc_demographic.py +437 -0
  31. virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/sc_temporal.py +862 -0
  32. virola-0.0.1/vir/adapters/datasus/notebooks/sanity_checks/temporal_distance_comparison.py +618 -0
  33. virola-0.0.1/vir/adapters/datasus/notebooks/validation/cross_city.py +1651 -0
  34. virola-0.0.1/vir/adapters/datasus/notebooks/validation/validation_ablation.py +826 -0
  35. virola-0.0.1/vir/adapters/datasus/notebooks/validation/validation_reference_syndromes.py +1736 -0
  36. virola-0.0.1/vir/adapters/datasus/notebooks/validation/validation_temporal_c.py +358 -0
  37. virola-0.0.1/vir/adapters/datasus/notebooks/view_demographic.py +948 -0
  38. virola-0.0.1/vir/adapters/datasus/notebooks/viz_snf_clusters.py +588 -0
  39. virola-0.0.1/vir/adapters/datasus/references/RepositorioTerminologia_202506/tb_cid.csv +14240 -0
  40. virola-0.0.1/vir/adapters/datasus/references/abp_ciap2.csv +24 -0
  41. virola-0.0.1/vir/adapters/datasus/references/aesop/code_list_arbovirus_apr2024.csv +17 -0
  42. virola-0.0.1/vir/adapters/datasus/references/aesop/code_list_uri_apr2024.csv +51 -0
  43. virola-0.0.1/vir/adapters/datasus/references/ciap-2-wicc.csv +92 -0
  44. virola-0.0.1/vir/adapters/datasus/references/ciap2-cid10.csv +687 -0
  45. virola-0.0.1/vir/adapters/datasus/references/indicative_codes_yes_no_75perc.csv +112 -0
  46. virola-0.0.1/vir/adapters/datasus/references/indicative_codes_yes_no__2022_2025__90perc.csv +356 -0
  47. virola-0.0.1/vir/adapters/datasus/references/osi_template.json +25 -0
  48. virola-0.0.1/vir/adapters/datasus/references/terminology_mapping.csv +24 -0
  49. virola-0.0.1/vir/adapters/datasus/terminologies/__init__.py +0 -0
  50. virola-0.0.1/vir/adapters/datasus/terminologies/terminology_mapping.py +70 -0
  51. virola-0.0.1/vir/adapters/datasus/terminology.py +100 -0
  52. virola-0.0.1/vir/adapters/datasus/viz.py +29 -0
  53. virola-0.0.1/vir/cleaning/__init__.py +0 -0
  54. virola-0.0.1/vir/cleaning/data_preparation.py +180 -0
  55. virola-0.0.1/vir/cleaning/indicative_template.py +43 -0
  56. virola-0.0.1/vir/cleaning/prepare.py +89 -0
  57. virola-0.0.1/vir/cli.py +910 -0
  58. virola-0.0.1/vir/clusters.py +355 -0
  59. virola-0.0.1/vir/config.py +66 -0
  60. virola-0.0.1/vir/data_filters.py +119 -0
  61. virola-0.0.1/vir/db.py +1377 -0
  62. virola-0.0.1/vir/helpers.py +88 -0
  63. virola-0.0.1/vir/metrics.py +82 -0
  64. virola-0.0.1/vir/notebook_widgets.py +179 -0
  65. virola-0.0.1/vir/osi.py +98 -0
  66. virola-0.0.1/vir/terminologies/__init__.py +0 -0
  67. virola-0.0.1/vir/terminologies/embeddings_model.py +204 -0
  68. virola-0.0.1/vir/views/__init__.py +20 -0
  69. virola-0.0.1/vir/views/base.py +314 -0
  70. virola-0.0.1/vir/views/clinical.py +163 -0
  71. virola-0.0.1/vir/views/demographic.py +181 -0
  72. virola-0.0.1/vir/views/temporal.py +221 -0
  73. virola-0.0.1/vir/viz.py +58 -0
virola-0.0.1/LICENSE ADDED
@@ -0,0 +1,9 @@
1
+
2
+ The MIT License (MIT)
3
+ Copyright (c) 2025, Ana Paula Gomes Ferreira
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
virola-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,405 @@
1
+ Metadata-Version: 2.4
2
+ Name: virola
3
+ Version: 0.0.1
4
+ Summary: A framework for data-driven syndrome discovery from aggregated health records.
5
+ Author: Ana Paula Gomes Ferreira
6
+ Requires-Python: ~=3.12.0
7
+ Description-Content-Type: text/markdown
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ License-File: LICENSE
11
+ Requires-Dist: aeon>=1.3.0
12
+ Requires-Dist: anywidget>=0.9.21
13
+ Requires-Dist: datasets>=4.4.2
14
+ Requires-Dist: diskcache>=5.6.3
15
+ Requires-Dist: duckdb>=1.3.2
16
+ Requires-Dist: einops>=0.8.1
17
+ Requires-Dist: embedding-atlas>=0.9.0
18
+ Requires-Dist: h5py>=3.14.0
19
+ Requires-Dist: hdbscan>=0.8.40
20
+ Requires-Dist: igraph>=1.0.0
21
+ Requires-Dist: ipython>=9.5.0
22
+ Requires-Dist: kaleido>=1.2.0
23
+ Requires-Dist: leidenalg>=0.11.0
24
+ Requires-Dist: llm>=0.27.1
25
+ Requires-Dist: loguru
26
+ Requires-Dist: marimo>=0.23.9
27
+ Requires-Dist: matplotlib>=3.10.8
28
+ Requires-Dist: nbconvert>=7.17.1
29
+ Requires-Dist: nbformat>=5.10.4
30
+ Requires-Dist: numpy==2.2
31
+ Requires-Dist: openai>=1.101.0
32
+ Requires-Dist: pip
33
+ Requires-Dist: playwright>=1.59.0
34
+ Requires-Dist: plotly[express]>=6.2.0
35
+ Requires-Dist: polars>=1.32.3
36
+ Requires-Dist: protobuf>=5.0.0
37
+ Requires-Dist: polars-u64-idx>=1.32.3
38
+ Requires-Dist: pyarrow>=23.0.1
39
+ Requires-Dist: pytest
40
+ Requires-Dist: python-dotenv
41
+ Requires-Dist: ruff
42
+ Requires-Dist: scikit-learn>=1.7.2
43
+ Requires-Dist: scipy>=1.15.3
44
+ Requires-Dist: sentence-transformers>=5.1.0
45
+ Requires-Dist: sentencepiece>=0.2.0
46
+ Requires-Dist: setfit>=1.1.3
47
+ Requires-Dist: snfpy>=0.2.2
48
+ Requires-Dist: sqlglot>=27.10.0
49
+ Requires-Dist: torch>=2.8.0
50
+ Requires-Dist: tqdm
51
+ Requires-Dist: transformers>=5.3.0
52
+ Requires-Dist: typer
53
+ Requires-Dist: umap-learn>=0.5.9.post2
54
+
55
+ # virola
56
+
57
+ Find, from your own data, the public health threats that should be monitored.
58
+
59
+ `virola` is dataset-agnostic: a framework plus pluggable **adapters**. Two adapters ship today:
60
+
61
+ - **`datasus`** — Brazilian AESOP/DATASUS data (CID-10 + CIAP-2 + AB terminologies, 10 IBGE cities).
62
+ - **`aktin`** — German AKTIN emergency-department data (ICD-10-GM, country-level stratum).
63
+
64
+ Each adapter declares its own raw-format reader, terminology source, default embedding model,
65
+ skip patterns, and stratum metadata. Everything downstream of the canonical row
66
+ `(year, week, sex, age_group, code_type, code, quantity, terminology_id, terminology_label, [city])`
67
+ is shared.
68
+
69
+ ## Licensing
70
+
71
+ The MIT license (see `LICENSE`) covers the **source code of this project only**.
72
+
73
+ The reference and auxiliary files bundled with each adapter — terminology tables,
74
+ classification files, code lists, and any other third-party data (e.g. CID-10, CIAP-2,
75
+ AB, ICD-10-GM, AESOP code lists) — are **not** covered by this license. They remain
76
+ subject to the terms and licenses of their respective data sources, and their use and
77
+ redistribution are the responsibility of those sources. Refer to each source for the
78
+ applicable terms before reusing these files.
79
+
80
+ ## Setup (shared)
81
+
82
+ Install [uv](https://docs.astral.sh/uv/getting-started/installation/) (the project pins its Python
83
+ version in `.python-version`), then from the project root:
84
+
85
+ ```bash
86
+ uv sync
87
+ uv run plotly_get_chrome # required by kaleido for plotly figure export
88
+ cp .env.example .env # set DATA_DIR / RAW_DATA_DIR / VIROLA_ADAPTER as needed
89
+ ```
90
+
91
+ To place the virtualenv outside the project: `UV_PROJECT_ENVIRONMENT=/path/to/.venv uv sync`.
92
+
93
+ ### Environment variables
94
+
95
+ The project reads `.env` at the project root if present. Recognised variables:
96
+
97
+ - `DATA_DIR` — overrides the default `./data` location (raw, interim, processed, runs, `virola.db`).
98
+ - `RAW_DATA_DIR` — overrides `data/raw/` only. Point at your adapter's raw export.
99
+ - `VIROLA_ADAPTER` — default adapter for commands that take `--adapter`. CLI flag wins.
100
+ - `LLM_MODEL` — Ollama model used by `vir results llm-descriptions` (default `llama3.2:1b`).
101
+ - `HF_HOME` — cache directory for the SapBERT/XLMR downloads.
102
+ - `HF_FROM_LIMITED_ENV` — set to a non-empty value to force a manual HF snapshot download
103
+ (SLURM / air-gapped). Empty value = regular environment.
104
+
105
+ ### CLI shape
106
+
107
+ ```text
108
+ # Adapter-level commands — require --adapter (or VIROLA_ADAPTER env var)
109
+ vir setup terminologies --adapter <name>
110
+ vir setup embeddings --adapter <name> [--model MODEL]
111
+
112
+ # Stratum-level commands — adapter is derived from the stratum registry
113
+ vir prepare clean --stratum <name> [--input PATH] [--output DIR]
114
+ vir pipeline run <stratum> [--year-min ...] [--year-max ...] ...
115
+ vir validate ablation --stratum <name>
116
+ vir results delete --stratum <name>
117
+
118
+ # Canonical-row commands — adapter via env or explicit flag
119
+ vir prepare process --dataset PATH --adapter <name>
120
+ vir prepare indicative-template --dataset PATH --adapter <name>
121
+ vir view clinical --dataset NAME --adapter <name>
122
+ vir view demographic --dataset NAME
123
+ vir view temporal --dataset NAME
124
+ vir model snf --dataset NAME --adapter <name>
125
+ vir results build-profiles RUN_ID --dataset NAME
126
+ vir results explain RUN_ID --dataset NAME --adapter <name>
127
+ vir results list
128
+ vir results llm-descriptions RUN_ID
129
+ ```
130
+
131
+ `--city` is accepted as a deprecated alias for `--stratum` (emits a warning).
132
+
133
+ Use `uv run vir COMMAND --help` for detailed flag descriptions.
134
+
135
+ ---
136
+
137
+ ## DATASUS (Brazil)
138
+
139
+ ### Raw data
140
+
141
+ The pipeline expects an AESOP extraction at `data/raw/base_aesop_raw_extracao_01042026.parquet/`
142
+ (multi-file parquet). Schema, period, and the IBGE municipality filter are documented in
143
+ `data/raw/readme-base_aesop_raw_extracao_01042026.md`. Override the location with `RAW_DATA_DIR`
144
+ in `.env`.
145
+
146
+ The 10 cities targeted by the pipeline are listed in `vir/adapters/datasus/bin/cities.txt`;
147
+ the same names are accepted by `vir pipeline run <stratum>`.
148
+
149
+ ### One-time setup
150
+
151
+ Reference files (`vir/adapters/datasus/references/RepositorioTerminologia_202506/tb_cid.csv`,
152
+ `ciap2-cid10.csv`, `abp_ciap2.csv`) are committed. Build the joined terminology table and the
153
+ SapBERT embeddings database once:
154
+
155
+ ```bash
156
+ uv run vir setup terminologies --adapter datasus
157
+ uv run vir setup embeddings --adapter datasus
158
+ ```
159
+
160
+ `setup embeddings` downloads `cambridgeltl/SapBERT-from-PubMedBERT-fulltext` to `HF_HOME`.
161
+
162
+ ### Run all cities
163
+
164
+ `vir/adapters/datasus/bin/cities.txt` drives both helpers:
165
+
166
+ ```bash
167
+ vir/adapters/datasus/bin/run_pipeline.sh all # clean → process → views → SNF per city
168
+ vir/adapters/datasus/bin/enrich_latest_run.sh all # build-profiles + explain + llm-descriptions
169
+ uv run vir results list
170
+ ```
171
+
172
+ Extra `vir pipeline run` flags pass through, e.g.
173
+ `vir/adapters/datasus/bin/run_pipeline.sh all --skip-clean --n-clusters 150`.
174
+
175
+ ### Run a single city
176
+
177
+ ```bash
178
+ uv run vir pipeline run "Belo Horizonte" --year-min 2022 --year-max 2024 \
179
+ --snf-k 10 --n-clusters 200 --notes "run for paper"
180
+
181
+ uv run vir results list
182
+ vir/adapters/datasus/bin/enrich_latest_run.sh
183
+ ```
184
+
185
+ `vir pipeline run` chains `clean → process → views (clinical, demographic, temporal) → model snf`
186
+ and resolves the adapter (`datasus`) from the stratum name via the registry.
187
+
188
+ ### Step-by-step alternative
189
+
190
+ ```bash
191
+ uv run vir prepare clean --stratum "Belo Horizonte"
192
+ uv run vir prepare process --dataset data/interim/cleaned_*belo_horizonte*.parquet \
193
+ --adapter datasus
194
+
195
+ uv run vir view clinical --dataset belo_horizonte --adapter datasus \
196
+ --year-min 2022 --year-max 2024
197
+ uv run vir view demographic --dataset belo_horizonte --year-min 2022 --year-max 2024
198
+ uv run vir view temporal --dataset belo_horizonte --year-min 2022 --year-max 2024 --min-weeks 12
199
+
200
+ uv run vir model snf --dataset belo_horizonte --adapter datasus \
201
+ --year-min 2022 --year-max 2024 --snf-k 10 --n-clusters 200 --notes "run for paper"
202
+
203
+ uv run vir results build-profiles 1 --dataset belo_horizonte
204
+ uv run vir results explain 1 --dataset belo_horizonte --adapter datasus
205
+ uv run vir results llm-descriptions 1 # requires Ollama (see Optional below)
206
+ ```
207
+
208
+ ### SLURM
209
+
210
+ Two SLURM scripts mirror the helpers above; edit the `--mail-user` line at the top of each before
211
+ submitting and set `DATA_DIR` (in `.env` or via `--export`) when the shared filesystem is not the
212
+ project root.
213
+
214
+ ```bash
215
+ sbatch vir/adapters/datasus/bin/slurm_pipeline.sh # full pipeline, every city
216
+ sbatch --export=ALL,EXTRA_ARGS="--skip-clean" vir/adapters/datasus/bin/slurm_pipeline.sh
217
+
218
+ sbatch vir/adapters/datasus/bin/slurm_ablation.sh # ablation array, one task per city
219
+ sbatch --export=ALL,EXTRA_ARGS="--permutations 5000" vir/adapters/datasus/bin/slurm_ablation.sh
220
+ ```
221
+
222
+ The ablation array index range (`#SBATCH --array=0-9` in `slurm_ablation.sh`) must match the
223
+ number of lines in `vir/adapters/datasus/bin/cities.txt`.
224
+
225
+ ---
226
+
227
+ ## AKTIN (Germany)
228
+
229
+ The AKTIN adapter treats Germany as a single stratum (`"germany"`). Per-encounter raw data lives
230
+ in `*_result/case_data.txt` + `diag_data.txt` pairs (tab-separated). The adapter pools every site
231
+ into one cleaned interim parquet.
232
+
233
+ ### Raw data
234
+
235
+ Each AKTIN export ships per-hospital folders/zips. Provide either:
236
+
237
+ - a directory of unzipped `<N>_result/` subdirectories under `RAW_DATA_DIR`, **or**
238
+ - a directory containing a single site's `case_data.txt` and `diag_data.txt`.
239
+
240
+ Set `RAW_DATA_DIR` in `.env` (absolute path, or leave blank to use `data/raw/`).
241
+
242
+ ### One-time setup
243
+
244
+ ```bash
245
+ uv run vir setup terminologies --adapter aktin # parses BfArM ICD-10-GM 2025 (vendored)
246
+ uv run vir setup embeddings --adapter aktin # downloads SapBERT-UMLS-XLMR (~2.2 GB)
247
+ ```
248
+
249
+ The BfArM ICD-10-GM 2025 classification file is vendored at
250
+ `vir/adapters/aktin/references/icd10gm2025syst_kodes.txt`. AKTIN's default embedding model is
251
+ `cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR` (multilingual; embeds German labels
252
+ reasonably well).
253
+
254
+ ### Run the pipeline
255
+
256
+ ```bash
257
+ uv run vir pipeline run germany --year-min 2018 --year-max 2023 \
258
+ --snf-k 10 --n-clusters 200 --notes "first aktin run"
259
+
260
+ uv run vir results list
261
+ uv run vir results build-profiles 1 --dataset germany
262
+ uv run vir results explain 1 --dataset germany --adapter aktin
263
+ ```
264
+
265
+ `vir pipeline run` derives the adapter (`aktin`) from the stratum `germany` via the registry and
266
+ threads it through every step.
267
+
268
+ ### Step-by-step alternative
269
+
270
+ ```bash
271
+ uv run vir prepare clean --stratum germany # adapter derived
272
+ uv run vir prepare process --dataset data/interim/cleaned_aktin_germany.parquet \
273
+ --adapter aktin
274
+
275
+ uv run vir view clinical --dataset germany --adapter aktin \
276
+ --year-min 2018 --year-max 2023
277
+ uv run vir view demographic --dataset germany --year-min 2018 --year-max 2023
278
+ uv run vir view temporal --dataset germany --year-min 2018 --year-max 2023 --min-weeks 12
279
+
280
+ uv run vir model snf --dataset germany --adapter aktin \
281
+ --year-min 2018 --year-max 2023 --snf-k 10 --n-clusters 200
282
+
283
+ uv run vir results build-profiles 1 --dataset germany
284
+ uv run vir results explain 1 --dataset germany --adapter aktin
285
+ ```
286
+
287
+ ### Indicative-codes template
288
+
289
+ AKTIN ships **without** an epidemiologist-labeled CSV. To generate the top-volume template for
290
+ labeling:
291
+
292
+ ```bash
293
+ uv run vir prepare indicative-template \
294
+ --dataset data/interim/cleaned_aktin_germany.parquet \
295
+ --adapter aktin \
296
+ --coverage 0.9
297
+ # writes data/processed/indicative_codes_template_aktin.csv
298
+ ```
299
+
300
+ Until labels are supplied, AKTIN effectively runs as if `--no-indicative` were set (only Z-chapter codes
301
+ are filtered via `adapter.skip_patterns`).
302
+
303
+ ### Exploratory data analysis
304
+
305
+ A starter notebook lives at `vir/adapters/aktin/notebooks/eda_aktin.py` (shape, demographics,
306
+ ICD-10-GM coverage, monthly/weekly temporal distribution):
307
+
308
+ ```bash
309
+ uv run marimo edit vir/adapters/aktin/notebooks/eda_aktin.py
310
+ ```
311
+
312
+ ---
313
+
314
+ ## Adding a new adapter
315
+
316
+ The contract is the abstract base class `vir/adapters/base.py::Adapter`. Concrete adapters live
317
+ under `vir/adapters/<name>/` and own their cleaning, terminology, references, notebooks, bin
318
+ scripts, and tests. Each adapter declares:
319
+
320
+ - `name`, `default_model`, `terminology_csv_filename` — class attributes.
321
+ - `clean(raw_path, output_path, stratum, **opts)` — read raw → write canonical interim parquet.
322
+ - `build_terminology_labels()` — return `(code, code_type, description)` DataFrame.
323
+ - `stratum_metadata(stratum)` — `{city, region, population_range}`.
324
+ - `cleaned_filename(stratum)` — interim filename convention.
325
+ - `strata`, `skip_patterns`, `code_types` — properties.
326
+ - `indicative_codes_file` — optional `Path`, defaults to `None`.
327
+
328
+ Register the instance in `vir/adapters/__init__.py::ADAPTERS`. The registry detects stratum
329
+ collisions across adapters at import time. See `vir/adapters/datasus/` and `vir/adapters/aktin/`
330
+ for the two reference implementations.
331
+
332
+ ---
333
+
334
+ ## Optional
335
+
336
+ ### LLM descriptions
337
+
338
+ `vir results llm-descriptions` is optional. To enable it, install
339
+ [Ollama](https://ollama.com/download/), pull the model named in `LLM_MODEL` (default
340
+ `llama3.2:1b`), and add the `llm-ollama` plugin:
341
+
342
+ ```bash
343
+ ollama pull llama3.2:1b
344
+ uv run llm install llm-ollama
345
+ ```
346
+
347
+ ### Development
348
+
349
+ ```bash
350
+ make test # or: uv run pytest
351
+ make lint # ruff format --check && ruff check
352
+ ```
353
+
354
+ Interactive marimo edit mode for exploration:
355
+
356
+ ```bash
357
+ uv run marimo edit notebooks/view_clinical.py # framework
358
+ uv run marimo edit vir/adapters/datasus/notebooks/view_demographic.py # DATASUS
359
+ uv run marimo edit vir/adapters/aktin/notebooks/eda_aktin.py # AKTIN
360
+ ```
361
+
362
+ Two DATASUS notebook subfolders are organised by purpose:
363
+
364
+ - `vir/adapters/datasus/notebooks/validation/` — paper-grade validation (ablation, cross-city,
365
+ reference syndromes, temporal C). The ablation notebook is also exposed as `vir validate
366
+ ablation`.
367
+ - `vir/adapters/datasus/notebooks/sanity_checks/` — internal exploratory checks per view
368
+ (`sc_clinical`, `sc_demographic`, `sc_temporal`, feature transformation comparison,
369
+ temporal distance comparison). Not part of the paper's evidence base.
370
+
371
+ ## Notes
372
+
373
+ ### Terminology (DATASUS)
374
+
375
+ ABP008 is duplicated in the official AB terminology source
376
+ (<https://integracao.esusab.ufsc.br/ledi/documentacao/estrutura_arquivos/dicionario-fai.html#listaciapcondicaoavaliada>),
377
+ mapping to two distinct conditions: Beribéri (ABP008-1) and Desnutrição (ABP008-2).
378
+ The suffixes `-1` / `-2` are added in the reference files (`abp_ciap2.csv`,
379
+ `terminology_mapping.csv`) for disambiguation only — the raw data always carries the base code
380
+ `ABP008` without any suffix.
381
+
382
+ ### AESOP validation files (DATASUS)
383
+
384
+ The anchor validation used AESOP syndrome indicators, downloaded from the AESOP project's
385
+ [repo](https://github.com/cidacslab/AESOP-Data-Documentation/):
386
+
387
+ - [code_list_arbovirus_apr2024.csv](https://raw.githubusercontent.com/cidacslab/AESOP-Data-Documentation/refs/heads/main/DataPipeline/documentation/code_list_arbovirus_apr2024.csv)
388
+ - [code_list_uri_apr2024.csv](https://raw.githubusercontent.com/cidacslab/AESOP-Data-Documentation/refs/heads/main/DataPipeline/documentation/code_list_uri_apr2024.csv)
389
+
390
+ ### Indicative codes (DATASUS)
391
+
392
+ An expert reviewed the codes covering the top 90% of record volume and labeled each as indicative
393
+ (carries an epidemiological signal) or non-indicative. Codes labeled non-indicative are excluded
394
+ from the analysis; codes without a label (outside the top 90%, or reviewed but left unlabeled) are
395
+ retained.
396
+
397
+ To generate (or regenerate) the labeling template for a different time window:
398
+
399
+ ```bash
400
+ uv run vir prepare indicative-template \
401
+ --dataset data/interim/cleaned_*_all_cities.parquet \
402
+ --adapter datasus \
403
+ --coverage 0.9
404
+ ```
405
+