dacaf-mlc 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dacaf_mlc-1.1.1/LICENSE +21 -0
- dacaf_mlc-1.1.1/MANIFEST.in +3 -0
- dacaf_mlc-1.1.1/PKG-INFO +299 -0
- dacaf_mlc-1.1.1/README.md +245 -0
- dacaf_mlc-1.1.1/dacaf_mlc/__init__.py +17 -0
- dacaf_mlc-1.1.1/dacaf_mlc/arff_dataset.py +76 -0
- dacaf_mlc-1.1.1/dacaf_mlc/chest_xray_dataset/__init__.py +0 -0
- dacaf_mlc-1.1.1/dacaf_mlc/chest_xray_dataset/chest_xray_dataset.py +284 -0
- dacaf_mlc-1.1.1/dacaf_mlc/chest_xray_dataset/chest_xray_utils.py +51 -0
- dacaf_mlc-1.1.1/dacaf_mlc/config.py +14 -0
- dacaf_mlc-1.1.1/dacaf_mlc/datasets.py +60 -0
- dacaf_mlc-1.1.1/dacaf_mlc/evaluate.py +50 -0
- dacaf_mlc-1.1.1/dacaf_mlc/evaluation_metrics.py +219 -0
- dacaf_mlc-1.1.1/dacaf_mlc/metrics_registry.py +28 -0
- dacaf_mlc-1.1.1/dacaf_mlc/pipeline.py +192 -0
- dacaf_mlc-1.1.1/dacaf_mlc/probability_classifier_chains.py +340 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/LICENSE +35 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/__init__.py +0 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/_version.py +25 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/core/__init__.py +0 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/core/base.py +705 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/meta/__init__.py +0 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/meta/classifier_chains.py +256 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/utils/__init__.py +0 -0
- dacaf_mlc-1.1.1/dacaf_mlc/skmultiflow/utils/validation.py +31 -0
- dacaf_mlc-1.1.1/dacaf_mlc/utils.py +57 -0
- dacaf_mlc-1.1.1/dacaf_mlc.egg-info/PKG-INFO +299 -0
- dacaf_mlc-1.1.1/dacaf_mlc.egg-info/SOURCES.txt +36 -0
- dacaf_mlc-1.1.1/dacaf_mlc.egg-info/dependency_links.txt +1 -0
- dacaf_mlc-1.1.1/dacaf_mlc.egg-info/entry_points.txt +2 -0
- dacaf_mlc-1.1.1/dacaf_mlc.egg-info/requires.txt +16 -0
- dacaf_mlc-1.1.1/dacaf_mlc.egg-info/top_level.txt +1 -0
- dacaf_mlc-1.1.1/pyproject.toml +71 -0
- dacaf_mlc-1.1.1/setup.cfg +4 -0
- dacaf_mlc-1.1.1/tests/test_evaluation_metrics.py +204 -0
- dacaf_mlc-1.1.1/tests/test_inference_optimality.py +131 -0
- dacaf_mlc-1.1.1/tests/test_pipeline_e2e.py +48 -0
- dacaf_mlc-1.1.1/tests/test_predict_equivalence.py +57 -0
dacaf_mlc-1.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024–2026 Hoàng Xuân Trường
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dacaf_mlc-1.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dacaf-mlc
|
|
3
|
+
Version: 1.1.1
|
|
4
|
+
Summary: Bayes-optimal inference for probabilistic multi-label classification (DaCaF)
|
|
5
|
+
Author: Vu-Linh Nguyen, Anh Hoang, Van-Nam Huynh
|
|
6
|
+
Author-email: Xuan-Truong Hoang <hxtruong6@gmail.com>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2024–2026 Hoàng Xuân Trường
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
|
|
29
|
+
Project-URL: Paper, https://doi.org/10.1016/j.inffus.2026.104517
|
|
30
|
+
Project-URL: Repository, https://github.com/hxtruong6/inference_probabilistic_mlc
|
|
31
|
+
Keywords: multi-label classification,probabilistic classifier chains,bayes-optimal inference
|
|
32
|
+
Classifier: Programming Language :: Python :: 3
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
36
|
+
Requires-Python: >=3.10
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
License-File: LICENSE
|
|
39
|
+
Requires-Dist: numpy<2,>=1.26
|
|
40
|
+
Requires-Dist: scipy>=1.12
|
|
41
|
+
Requires-Dist: scikit-learn>=1.2
|
|
42
|
+
Requires-Dist: pandas>=2.0
|
|
43
|
+
Requires-Dist: joblib>=1.3
|
|
44
|
+
Provides-Extra: image
|
|
45
|
+
Requires-Dist: torch>=2.2; extra == "image"
|
|
46
|
+
Requires-Dist: torchvision>=0.17; extra == "image"
|
|
47
|
+
Requires-Dist: torchxrayvision>=1.2; extra == "image"
|
|
48
|
+
Requires-Dist: scikit-image>=0.22; extra == "image"
|
|
49
|
+
Requires-Dist: Pillow>=10.2; extra == "image"
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
52
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
53
|
+
Dynamic: license-file
|
|
54
|
+
|
|
55
|
+
# Probabilistic Multi-Label Classification via Divide-and-Conquer and Fusion (DaCaF)
|
|
56
|
+
|
|
57
|
+
[Paper](https://doi.org/10.1016/j.inffus.2026.104517)
|
|
58
|
+
[DOI: 10.1016/j.inffus.2026.104517](https://doi.org/10.1016/j.inffus.2026.104517)
|
|
59
|
+
[Zenodo DOI: 10.5281/zenodo.20572638](https://doi.org/10.5281/zenodo.20572638)
|
|
60
|
+
[Code Ocean capsule](https://codeocean.com/capsule/1580907/tree)
|
|
61
|
+
[License: MIT](LICENSE)
|
|
62
|
+
|
|
63
|
+
Official code for the paper **published in _Information Fusion_ (2026)**:
|
|
64
|
+
|
|
65
|
+
> **Probabilistic multi-label classification via a divide-and-conquer and fusion approach**
|
|
66
|
+
> Vu-Linh Nguyen, Xuan-Truong Hoang, Anh Hoang, Van-Nam Huynh.
|
|
67
|
+
> *Information Fusion*, 2026, Article 104517. <https://doi.org/10.1016/j.inffus.2026.104517>
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## What is this about? (in one picture)
|
|
72
|
+
|
|
73
|
+
In multi-label classification, each instance can carry *any subset* of the labels, and **different evaluation metrics want different predictions**. A model that is great for one metric (e.g. F₁) can be poor for another (e.g. subset accuracy).
|
|
74
|
+
|
|
75
|
+
**DaCaF** is a generic recipe that, given a probabilistic model `P(y | x)`, finds the **Bayes-optimal prediction (BOP)** for a chosen metric: the prediction `ŷ` that maximises the *expected* score of that metric.
|
|
76
|
+
|
|
77
|
+
```mermaid
|
|
78
|
+
flowchart TD
|
|
79
|
+
A[Training data] -->|learn| B[Probabilistic classifier PCC<br/>estimates P of y given x]
|
|
80
|
+
B -->|inference, per instance x| C[Probabilistic prediction<br/>P of y given x]
|
|
81
|
+
C --> D{DaCaF}
|
|
82
|
+
D --> E[Divide and Conquer<br/>split predictions by number of relevant labels,<br/>solve each group by sorting]
|
|
83
|
+
E --> F[Fusion<br/>produce the prediction by fusing the chain binary classifiers<br/>to supply the needed marginal/pairwise probabilities]
|
|
84
|
+
F --> G[Bayes-optimal prediction y-hat<br/>for the chosen metric]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Two building blocks:**
|
|
88
|
+
|
|
89
|
+
1. **Divide & Conquer**: partition the `2^L` possible predictions into `L+1` groups (by how many labels are predicted relevant). Within each group the best prediction is found just by **sorting labels by a score**; the global best is the best across groups.
|
|
90
|
+
2. **Fusion**: the final step that produces the prediction. The sorting scores need certain marginal/pairwise probabilities, which are supplied by **fusing the predictions of the dependent binary classifiers** that make up the chain (via ancestral sampling).
|
|
91
|
+
|
|
92
|
+
The paper proves this works for **two whole families of metrics** (so it covers many metrics at once, not one at a time) and shows when a metric's optimal prediction is *trivial*, a useful warning sign when choosing a metric.
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Results at a glance
|
|
97
|
+
|
|
98
|
+
**The headline finding: mismatch hurts.** When you evaluate with metric *E* but optimise for a different metric *T* during prediction, performance usually drops. Optimising the metric you actually care about is (almost always) best. This is verified on 5 tabular datasets plus a chest-X-ray image dataset, using the *exact* computation paradigm (no approximation blurring the picture).
|
|
99
|
+
|
|
100
|
+
You read the table **column by column**: each column is one evaluation metric, each row is the metric you optimised for. The **bold diagonal** (optimise the metric you evaluate) should be the largest value in its column.
|
|
101
|
+
|
|
102
|
+
**Example: CHD-49 (PCC + logistic regression, mean over 5 seeds × 10-fold, the fastest dataset).** Values are percentages, higher is better. Bold = the maximum of its column = the rule that targets that metric.
|
|
103
|
+
|
|
104
|
+
| Target ↓ \ Eval → | F₁ | Hamming | Markedness | Precision | NPV | Recall | Subset |
|
|
105
|
+
|---|---:|---:|---:|---:|---:|---:|---:|
|
|
106
|
+
| **F₁** | **67.1** | 69.3 | 71.9 | 62.7 | 81.1 | 79.2 | 15.1 |
|
|
107
|
+
| **Hamming** | 63.9 | **70.8** | 71.3 | 66.6 | 75.7 | 66.9 | 18.1 |
|
|
108
|
+
| **Markedness** | 34.2 | 63.7 | **77.0** | 33.4 | 71.1 | 40.5 | 8.4 |
|
|
109
|
+
| **Precision** | 40.5 | 64.8 | 68.3 | **73.5** | 63.1 | 29.1 | 3.1 |
|
|
110
|
+
| **NPV** | 58.4 | 43.0 | 71.5 | 43.0 | **100.0** | 99.5 | 0.0 |
|
|
111
|
+
| **Recall** | 58.4 | 43.0 | 71.5 | 43.0 | 100.0 | **99.5** | 0.0 |
|
|
112
|
+
| **Subset** | 64.0 | 69.4 | 70.4 | 64.2 | 76.5 | 69.8 | **18.9** |
|
|
113
|
+
|
|
114
|
+
In all **7 of 7** columns the diagonal (target = evaluation) is the maximum: to score best on a metric, optimise that metric. The NPV and Recall rows are identical because both BOPs are the all-ones vector `1…1` (see the metrics table below).
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Quickstart (one run)
|
|
119
|
+
|
|
120
|
+
Using [**uv**](https://docs.astral.sh/uv/) (recommended, fast; a checked-in `uv.lock` pins exact versions):
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
uv venv .venv --python 3.11 && source .venv/bin/activate
|
|
124
|
+
uv pip install -e . # core (tabular) deps; add ".[image]" for the ChestX-ray experiments
|
|
125
|
+
# reproducible install from the lockfile instead: uv sync (add --extra image for ChestX-ray)
|
|
126
|
+
|
|
127
|
+
# one (dataset, seed) run:
|
|
128
|
+
dacaf-mlc --dataset emotions --seed 1 --output-dir result
|
|
129
|
+
# or without activating a venv: uv run dacaf-mlc --dataset emotions --seed 1 --output-dir result
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
<details><summary>Alternative: plain pip / conda</summary>
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python -m venv .venv && source .venv/bin/activate # or conda create -n dacaf python=3.10
|
|
136
|
+
pip install -e . # core (tabular) deps; add ".[image]" for the ChestX-ray experiments
|
|
137
|
+
dacaf-mlc --dataset emotions --seed 1 --output-dir result
|
|
138
|
+
```
|
|
139
|
+
</details>
|
|
140
|
+
|
|
141
|
+
This writes `result/emotions/seed1_all.csv` and a cross-tab of **target metric × evaluation metric**, the table at the heart of the paper.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## The metrics and their optimal predictions
|
|
146
|
+
|
|
147
|
+
For a probabilistic prediction `P(y | x)` over `L` labels, each rule returns the prediction that maximises the expected metric. `pⱼ = P(yⱼ = 1 | x)` is the marginal.
|
|
148
|
+
|
|
149
|
+
**How to read the columns:** *Needs* is the probabilistic information the rule consumes (cheap **marginals** `pⱼ`, the harder **pairwise** terms, or the full joint). *Cost* is the per-instance time once that information is available. Rules marked *trivial* have a BOP you can write down without looking at any data; *near-trivial* rules have a BOP whose form is fixed in advance and use the marginals only to pick a single label.
|
|
150
|
+
|
|
151
|
+
| Metric | Optimal prediction (BOP) | Needs | Cost |
|
|
152
|
+
|---|---|---|---|
|
|
153
|
+
| **Hamming** | `ŷⱼ = 1 ⇔ pⱼ > ½` | marginals | `O(L)` |
|
|
154
|
+
| **Subset 0/1** | the single most probable label vector | full joint | intractable |
|
|
155
|
+
| **F-β / F₁** | sort by an F-score, pick best prefix size | pairwise `P(yⱼ=1, \|y\|=s)` | `O(L³)` |
|
|
156
|
+
| **Markedness** | rank by marginals, compare prefix sizes | marginals | `O(L log L)` |
|
|
157
|
+
| **Precision** | predict only the top-marginal label | marginals | `O(L)` *(near-trivial)* |
|
|
158
|
+
| **NPV** | predict all ones `1…1` (same BOP as Recall here); falls back to `ŷ^{K-1}` (all ones but the lowest-marginal label) only if `1…1` is disallowed | marginals | `O(L)` *(near-trivial)* |
|
|
159
|
+
| **Recall** | always predict `1…1` | none | trivial |
|
|
160
|
+
| **Specificity** | always predict `0…0` | none | trivial |
|
|
161
|
+
|
|
162
|
+
> **Why "trivial" matters:** Recall/Specificity (and near-trivial Precision/NPV) have optimal predictions you can write down *without looking at any data*. The paper argues such metrics are weak *standalone* evaluation metrics, a practical takeaway when designing a metric for a new domain.
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## Reproducing the paper's results
|
|
167
|
+
|
|
168
|
+
The paper uses **Probabilistic Classifier Chains (PCC)** with an **L2-regularised logistic-regression** base learner, **10-fold cross-validation**, and the **exact computation paradigm** (enumerate all `2^L` labelings, so it is limited to a small or moderate number of labels). The exact published protocol is recorded in [`docs/paper.yaml`](docs/paper.yaml).
|
|
169
|
+
|
|
170
|
+
**Datasets in the paper (6):**
|
|
171
|
+
|
|
172
|
+
| Dataset | #labels (L) | #instances | Type |
|
|
173
|
+
|---|---:|---:|---|
|
|
174
|
+
| Emotions | 6 | 593 | tabular |
|
|
175
|
+
| CHD-49 | 6 | 555 | tabular |
|
|
176
|
+
| Scene | 6 | 2407 | tabular |
|
|
177
|
+
| Water-quality | 14 | 1060 | tabular |
|
|
178
|
+
| Yeast | 14 | 2417 | tabular |
|
|
179
|
+
| ChestX-ray8 | 8 | 25596 | image (ResNet / resnetAE / DenseNet features) |
|
|
180
|
+
|
|
181
|
+
For the chest-X-ray data we extract features with a pretrained backbone via [TorchXRayVision](https://github.com/mlmed/torchxrayvision); the raw NIH features are **not redistributed** (see [`dacaf_mlc/chest_xray_dataset/Readme.md`](dacaf_mlc/chest_xray_dataset/Readme.md)).
|
|
182
|
+
|
|
183
|
+
**One command** reproduces the tractable (tabular) subset; it runs the 5 tabular datasets × 5 seeds and aggregates the results:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
make reproduce # = bash scripts/reproduce_tabular.sh
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Full sweep** (heavy, use a cluster):
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
dacaf-mlc # local, small datasets (or: python -m dacaf_mlc.evaluate)
|
|
193
|
+
# or: see scripts/SLURM.md # one job per (dataset × seed)
|
|
194
|
+
python scripts/aggregate.py # aggregate when jobs finish
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Aggregated outputs per dataset: `result/result_<dataset>.csv` (long format), plus the companion files `result_<dataset>_summary.csv` (mean ± std) and `result_<dataset>_crosstab.csv` (target × evaluation pivot).
|
|
198
|
+
|
|
199
|
+
### Run it online (Code Ocean)
|
|
200
|
+
|
|
201
|
+
A one-click reproducible capsule is available: **<https://codeocean.com/capsule/1580907/tree>**. Click **Reproducible Run** to rebuild the environment and reproduce the CHD-49 target × evaluation table (`result_CHD_49_crosstab.csv`) on CPU in seconds — every diagonal entry is the maximum of its column, the paper's central claim. The capsule entry point is [`run`](run); dependencies are pinned in [`requirements-core.txt`](requirements-core.txt).
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Library usage
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
from sklearn.linear_model import LogisticRegression
|
|
209
|
+
from dacaf_mlc.probability_classifier_chains import ProbabilisticClassifierChainCustom
|
|
210
|
+
from dacaf_mlc.evaluation_metrics import EvaluationMetrics as EM
|
|
211
|
+
|
|
212
|
+
pcc = ProbabilisticClassifierChainCustom(LogisticRegression(max_iter=10_000))
|
|
213
|
+
pcc.fit(X_train, Y_train) # Y: (n, L) binary
|
|
214
|
+
|
|
215
|
+
y_f1 = pcc.predict_fmeasure(X_test, beta=1) # Bayes-optimal for F1
|
|
216
|
+
y_ham = pcc.predict_hamming(X_test) # ... for Hamming
|
|
217
|
+
y_mar = pcc.predict_markedness(X_test) # ... for Markedness
|
|
218
|
+
|
|
219
|
+
print(EM.f_beta(Y_test, y_f1), EM.markedness(Y_test, y_mar))
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Every `predict_*` rule returns the prediction that maximises the expected value of its
|
|
223
|
+
target metric (see [`CONVENTIONS.md`](CONVENTIONS.md) for the exact rules and conventions).
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
## Repository layout
|
|
228
|
+
|
|
229
|
+
```
|
|
230
|
+
dacaf_mlc/ # installable package
|
|
231
|
+
probability_classifier_chains.py # PCC + the 7 per-metric Bayes-optimal predict_* rules
|
|
232
|
+
evaluation_metrics.py # the 7 paper metrics (higher-is-better form)
|
|
233
|
+
arff_dataset.py # MULAN ARFF loader + 10-fold CV
|
|
234
|
+
datasets.py # dataset registry + loaders
|
|
235
|
+
metrics_registry.py # which metrics run on which inference rule
|
|
236
|
+
pipeline.py # training / k-fold eval / run_single
|
|
237
|
+
evaluate.py # CLI entry point (dacaf-mlc): parse_args + main
|
|
238
|
+
config.py # paths + protocol constants
|
|
239
|
+
utils.py # result aggregation
|
|
240
|
+
chest_xray_dataset/ # NIH feature extractor + loader ([image] extra)
|
|
241
|
+
skmultiflow/ # vendored ClassifierChain base
|
|
242
|
+
pyproject.toml # packaging + deps (core / [image] / [dev])
|
|
243
|
+
scripts/ # reproduce_tabular.sh + Slurm cluster scripts
|
|
244
|
+
tests/ # unit tests + brute-force optimality + e2e
|
|
245
|
+
docs/ # paper.yaml protocol manifest
|
|
246
|
+
datasets/ # the paper's MULAN ARFFs (+ chest-xray label CSV)
|
|
247
|
+
result/ # aggregated result CSVs
|
|
248
|
+
CONVENTIONS.md CONTRIBUTING.md CITATION.cff
|
|
249
|
+
paper/ # local copy of the paper source (not tracked)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Testing
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
python -m pytest tests/ -v
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Every inference rule is checked against **brute-force enumeration** of the expected metric, so the closed-form rules are verified to be correct on small cases. A batched predictor (one `predict_proba` call per chain level instead of `N·L·2^L` separate calls) is verified to be numerically equivalent to the reference enumeration.
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
264
|
+
## How to cite
|
|
265
|
+
|
|
266
|
+
```bibtex
|
|
267
|
+
@article{nguyen2026probabilistic,
|
|
268
|
+
title = {Probabilistic multi-label classification via a divide-and-conquer and fusion approach},
|
|
269
|
+
author = {Nguyen, Vu-Linh and Hoang, Xuan-Truong and Hoang, Anh and Huynh, Van-Nam},
|
|
270
|
+
journal = {Information Fusion},
|
|
271
|
+
year = {2026},
|
|
272
|
+
pages = {104517},
|
|
273
|
+
issn = {1566-2535},
|
|
274
|
+
doi = {10.1016/j.inffus.2026.104517}
|
|
275
|
+
}
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## References
|
|
281
|
+
|
|
282
|
+
- K. Dembczyński, W. Cheng, E. Hüllermeier. *Bayes Optimal Multilabel Classification via Probabilistic Classifier Chains.* ICML 2010.
|
|
283
|
+
- K. Dembczyński, W. Waegeman, W. Cheng, E. Hüllermeier. *An Exact Algorithm for F-Measure Maximization.* NeurIPS 2011.
|
|
284
|
+
- W. Waegeman et al. *On the Bayes-optimality of F-measure maximizers.* JMLR 2014.
|
|
285
|
+
- D. M. W. Powers. *Evaluation: From Precision, Recall and F-Measure to ROC, Informedness, Markedness & Correlation.* 2011.
|
|
286
|
+
- G. Tsoumakas, I. Katakis, I. Vlahavas. *Mining Multi-label Data.* 2010 (MULAN).
|
|
287
|
+
|
|
288
|
+
## Acknowledgements
|
|
289
|
+
|
|
290
|
+
The `dacaf_mlc/skmultiflow/` directory contains a trimmed, vendored subset of
|
|
291
|
+
[scikit-multiflow](https://github.com/scikit-multiflow/scikit-multiflow)
|
|
292
|
+
(the `ClassifierChain` base and its supporting utilities), redistributed under
|
|
293
|
+
its original 3-clause BSD license. See
|
|
294
|
+
[`dacaf_mlc/skmultiflow/LICENSE`](dacaf_mlc/skmultiflow/LICENSE) for the full text.
|
|
295
|
+
|
|
296
|
+
## License
|
|
297
|
+
|
|
298
|
+
MIT for the original DaCaF code, see [LICENSE](LICENSE). Vendored third-party
|
|
299
|
+
code retains its own license as noted in Acknowledgements above.
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# Probabilistic Multi-Label Classification via Divide-and-Conquer and Fusion (DaCaF)
|
|
2
|
+
|
|
3
|
+
[Paper](https://doi.org/10.1016/j.inffus.2026.104517)
|
|
4
|
+
[DOI: 10.1016/j.inffus.2026.104517](https://doi.org/10.1016/j.inffus.2026.104517)
|
|
5
|
+
[Zenodo DOI: 10.5281/zenodo.20572638](https://doi.org/10.5281/zenodo.20572638)
|
|
6
|
+
[Code Ocean capsule](https://codeocean.com/capsule/1580907/tree)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
Official code for the paper **published in _Information Fusion_ (2026)**:
|
|
10
|
+
|
|
11
|
+
> **Probabilistic multi-label classification via a divide-and-conquer and fusion approach**
|
|
12
|
+
> Vu-Linh Nguyen, Xuan-Truong Hoang, Anh Hoang, Van-Nam Huynh.
|
|
13
|
+
> *Information Fusion*, 2026, Article 104517. <https://doi.org/10.1016/j.inffus.2026.104517>
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## What is this about? (in one picture)
|
|
18
|
+
|
|
19
|
+
In multi-label classification, each instance can carry *any subset* of the labels, and **different evaluation metrics want different predictions**. A model that is great for one metric (e.g. F₁) can be poor for another (e.g. subset accuracy).
|
|
20
|
+
|
|
21
|
+
**DaCaF** is a generic recipe that, given a probabilistic model `P(y | x)`, finds the **Bayes-optimal prediction (BOP)** for a chosen metric: the prediction `ŷ` that maximises the *expected* score of that metric.
|
|
22
|
+
|
|
23
|
+
```mermaid
|
|
24
|
+
flowchart TD
|
|
25
|
+
A[Training data] -->|learn| B[Probabilistic classifier PCC<br/>estimates P of y given x]
|
|
26
|
+
B -->|inference, per instance x| C[Probabilistic prediction<br/>P of y given x]
|
|
27
|
+
C --> D{DaCaF}
|
|
28
|
+
D --> E[Divide and Conquer<br/>split predictions by number of relevant labels,<br/>solve each group by sorting]
|
|
29
|
+
E --> F[Fusion<br/>produce the prediction by fusing the chain binary classifiers<br/>to supply the needed marginal/pairwise probabilities]
|
|
30
|
+
F --> G[Bayes-optimal prediction y-hat<br/>for the chosen metric]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
**Two building blocks:**
|
|
34
|
+
|
|
35
|
+
1. **Divide & Conquer**: partition the `2^L` possible predictions into `L+1` groups (by how many labels are predicted relevant). Within each group the best prediction is found just by **sorting labels by a score**; the global best is the best across groups.
|
|
36
|
+
2. **Fusion**: the final step that produces the prediction. The sorting scores need certain marginal/pairwise probabilities, which are supplied by **fusing the predictions of the dependent binary classifiers** that make up the chain (via ancestral sampling).
|
|
37
|
+
|
|
38
|
+
The paper proves this works for **two whole families of metrics** (so it covers many metrics at once, not one at a time) and shows when a metric's optimal prediction is *trivial*, a useful warning sign when choosing a metric.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Results at a glance
|
|
43
|
+
|
|
44
|
+
**The headline finding: mismatch hurts.** When you evaluate with metric *E* but optimise for a different metric *T* during prediction, performance usually drops. Optimising the metric you actually care about is (almost always) best. This is verified on 5 tabular datasets plus a chest-X-ray image dataset, using the *exact* computation paradigm (no approximation blurring the picture).
|
|
45
|
+
|
|
46
|
+
You read the table **column by column**: each column is one evaluation metric, each row is the metric you optimised for. The **bold diagonal** (optimise the metric you evaluate) should be the largest value in its column.
|
|
47
|
+
|
|
48
|
+
**Example: CHD-49 (PCC + logistic regression, mean over 5 seeds × 10-fold, the fastest dataset).** Values are percentages, higher is better. Bold = the maximum of its column = the rule that targets that metric.
|
|
49
|
+
|
|
50
|
+
| Target ↓ \ Eval → | F₁ | Hamming | Markedness | Precision | NPV | Recall | Subset |
|
|
51
|
+
|---|---:|---:|---:|---:|---:|---:|---:|
|
|
52
|
+
| **F₁** | **67.1** | 69.3 | 71.9 | 62.7 | 81.1 | 79.2 | 15.1 |
|
|
53
|
+
| **Hamming** | 63.9 | **70.8** | 71.3 | 66.6 | 75.7 | 66.9 | 18.1 |
|
|
54
|
+
| **Markedness** | 34.2 | 63.7 | **77.0** | 33.4 | 71.1 | 40.5 | 8.4 |
|
|
55
|
+
| **Precision** | 40.5 | 64.8 | 68.3 | **73.5** | 63.1 | 29.1 | 3.1 |
|
|
56
|
+
| **NPV** | 58.4 | 43.0 | 71.5 | 43.0 | **100.0** | 99.5 | 0.0 |
|
|
57
|
+
| **Recall** | 58.4 | 43.0 | 71.5 | 43.0 | 100.0 | **99.5** | 0.0 |
|
|
58
|
+
| **Subset** | 64.0 | 69.4 | 70.4 | 64.2 | 76.5 | 69.8 | **18.9** |
|
|
59
|
+
|
|
60
|
+
In all **7 of 7** columns the diagonal (target = evaluation) is the maximum: to score best on a metric, optimise that metric. The NPV and Recall rows are identical because both BOPs are the all-ones vector `1…1` (see the metrics table below).
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Quickstart (one run)
|
|
65
|
+
|
|
66
|
+
Using [**uv**](https://docs.astral.sh/uv/) (recommended, fast; a checked-in `uv.lock` pins exact versions):
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
uv venv .venv --python 3.11 && source .venv/bin/activate
|
|
70
|
+
uv pip install -e . # core (tabular) deps; add ".[image]" for the ChestX-ray experiments
|
|
71
|
+
# reproducible install from the lockfile instead: uv sync (add --extra image for ChestX-ray)
|
|
72
|
+
|
|
73
|
+
# one (dataset, seed) run:
|
|
74
|
+
dacaf-mlc --dataset emotions --seed 1 --output-dir result
|
|
75
|
+
# or without activating a venv: uv run dacaf-mlc --dataset emotions --seed 1 --output-dir result
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
<details><summary>Alternative: plain pip / conda</summary>
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
python -m venv .venv && source .venv/bin/activate # or conda create -n dacaf python=3.10
|
|
82
|
+
pip install -e . # core (tabular) deps; add ".[image]" for the ChestX-ray experiments
|
|
83
|
+
dacaf-mlc --dataset emotions --seed 1 --output-dir result
|
|
84
|
+
```
|
|
85
|
+
</details>
|
|
86
|
+
|
|
87
|
+
This writes `result/emotions/seed1_all.csv` and a cross-tab of **target metric × evaluation metric**, the table at the heart of the paper.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## The metrics and their optimal predictions
|
|
92
|
+
|
|
93
|
+
For a probabilistic prediction `P(y | x)` over `L` labels, each rule returns the prediction that maximises the expected metric. `pⱼ = P(yⱼ = 1 | x)` is the marginal.
|
|
94
|
+
|
|
95
|
+
**How to read the columns:** *Needs* is the probabilistic information the rule consumes (cheap **marginals** `pⱼ`, the harder **pairwise** terms, or the full joint). *Cost* is the per-instance time once that information is available. Rules marked *trivial* have a BOP you can write down without looking at any data; *near-trivial* rules have a BOP whose form is fixed in advance and use the marginals only to pick a single label.
|
|
96
|
+
|
|
97
|
+
| Metric | Optimal prediction (BOP) | Needs | Cost |
|
|
98
|
+
|---|---|---|---|
|
|
99
|
+
| **Hamming** | `ŷⱼ = 1 ⇔ pⱼ > ½` | marginals | `O(L)` |
|
|
100
|
+
| **Subset 0/1** | the single most probable label vector | full joint | intractable |
|
|
101
|
+
| **F-β / F₁** | sort by an F-score, pick best prefix size | pairwise `P(yⱼ=1, \|y\|=s)` | `O(L³)` |
|
|
102
|
+
| **Markedness** | rank by marginals, compare prefix sizes | marginals | `O(L log L)` |
|
|
103
|
+
| **Precision** | predict only the top-marginal label | marginals | `O(L)` *(near-trivial)* |
|
|
104
|
+
| **NPV** | predict all ones `1…1` (same BOP as Recall here); falls back to `ŷ^{K-1}` (all ones but the lowest-marginal label) only if `1…1` is disallowed | marginals | `O(L)` *(near-trivial)* |
|
|
105
|
+
| **Recall** | always predict `1…1` | none | trivial |
|
|
106
|
+
| **Specificity** | always predict `0…0` | none | trivial |
|
|
107
|
+
|
|
108
|
+
> **Why "trivial" matters:** Recall/Specificity (and near-trivial Precision/NPV) have optimal predictions you can write down *without looking at any data*. The paper argues such metrics are weak *standalone* evaluation metrics, a practical takeaway when designing a metric for a new domain.
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Reproducing the paper's results
|
|
113
|
+
|
|
114
|
+
The paper uses **Probabilistic Classifier Chains (PCC)** with an **L2-regularised logistic-regression** base learner, **10-fold cross-validation**, and the **exact computation paradigm** (enumerate all `2^L` labelings, so it is limited to a small or moderate number of labels). The exact published protocol is recorded in [`docs/paper.yaml`](docs/paper.yaml).
|
|
115
|
+
|
|
116
|
+
**Datasets in the paper (6):**
|
|
117
|
+
|
|
118
|
+
| Dataset | #labels (L) | #instances | Type |
|
|
119
|
+
|---|---:|---:|---|
|
|
120
|
+
| Emotions | 6 | 593 | tabular |
|
|
121
|
+
| CHD-49 | 6 | 555 | tabular |
|
|
122
|
+
| Scene | 6 | 2407 | tabular |
|
|
123
|
+
| Water-quality | 14 | 1060 | tabular |
|
|
124
|
+
| Yeast | 14 | 2417 | tabular |
|
|
125
|
+
| ChestX-ray8 | 8 | 25596 | image (ResNet / resnetAE / DenseNet features) |
|
|
126
|
+
|
|
127
|
+
For the chest-X-ray data we extract features with a pretrained backbone via [TorchXRayVision](https://github.com/mlmed/torchxrayvision); the raw NIH features are **not redistributed** (see [`dacaf_mlc/chest_xray_dataset/Readme.md`](dacaf_mlc/chest_xray_dataset/Readme.md)).
|
|
128
|
+
|
|
129
|
+
**One command** reproduces the tractable (tabular) subset; it runs the 5 tabular datasets × 5 seeds and aggregates the results:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
make reproduce # = bash scripts/reproduce_tabular.sh
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**Full sweep** (heavy, use a cluster):
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
dacaf-mlc # local, small datasets (or: python -m dacaf_mlc.evaluate)
|
|
139
|
+
# or: see scripts/SLURM.md # one job per (dataset × seed)
|
|
140
|
+
python scripts/aggregate.py # aggregate when jobs finish
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Aggregated outputs per dataset: `result/result_<dataset>.csv` (long format), plus the companion files `result_<dataset>_summary.csv` (mean ± std) and `result_<dataset>_crosstab.csv` (target × evaluation pivot).
|
|
144
|
+
|
|
145
|
+
### Run it online (Code Ocean)
|
|
146
|
+
|
|
147
|
+
A one-click reproducible capsule is available: **<https://codeocean.com/capsule/1580907/tree>**. Click **Reproducible Run** to rebuild the environment and reproduce the CHD-49 target × evaluation table (`result_CHD_49_crosstab.csv`) on CPU in seconds — every diagonal entry is the maximum of its column, the paper's central claim. The capsule entry point is [`run`](run); dependencies are pinned in [`requirements-core.txt`](requirements-core.txt).
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Library usage
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from sklearn.linear_model import LogisticRegression
|
|
155
|
+
from dacaf_mlc.probability_classifier_chains import ProbabilisticClassifierChainCustom
|
|
156
|
+
from dacaf_mlc.evaluation_metrics import EvaluationMetrics as EM
|
|
157
|
+
|
|
158
|
+
pcc = ProbabilisticClassifierChainCustom(LogisticRegression(max_iter=10_000))
|
|
159
|
+
pcc.fit(X_train, Y_train) # Y: (n, L) binary
|
|
160
|
+
|
|
161
|
+
y_f1 = pcc.predict_fmeasure(X_test, beta=1) # Bayes-optimal for F1
|
|
162
|
+
y_ham = pcc.predict_hamming(X_test) # ... for Hamming
|
|
163
|
+
y_mar = pcc.predict_markedness(X_test) # ... for Markedness
|
|
164
|
+
|
|
165
|
+
print(EM.f_beta(Y_test, y_f1), EM.markedness(Y_test, y_mar))
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Every `predict_*` rule returns the prediction that maximises the expected value of its
|
|
169
|
+
target metric (see [`CONVENTIONS.md`](CONVENTIONS.md) for the exact rules and conventions).
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Repository layout
|
|
174
|
+
|
|
175
|
+
```
|
|
176
|
+
dacaf_mlc/ # installable package
|
|
177
|
+
probability_classifier_chains.py # PCC + the 7 per-metric Bayes-optimal predict_* rules
|
|
178
|
+
evaluation_metrics.py # the 7 paper metrics (higher-is-better form)
|
|
179
|
+
arff_dataset.py # MULAN ARFF loader + 10-fold CV
|
|
180
|
+
datasets.py # dataset registry + loaders
|
|
181
|
+
metrics_registry.py # which metrics run on which inference rule
|
|
182
|
+
pipeline.py # training / k-fold eval / run_single
|
|
183
|
+
evaluate.py # CLI entry point (dacaf-mlc): parse_args + main
|
|
184
|
+
config.py # paths + protocol constants
|
|
185
|
+
utils.py # result aggregation
|
|
186
|
+
chest_xray_dataset/ # NIH feature extractor + loader ([image] extra)
|
|
187
|
+
skmultiflow/ # vendored ClassifierChain base
|
|
188
|
+
pyproject.toml # packaging + deps (core / [image] / [dev])
|
|
189
|
+
scripts/ # reproduce_tabular.sh + Slurm cluster scripts
|
|
190
|
+
tests/ # unit tests + brute-force optimality + e2e
|
|
191
|
+
docs/ # paper.yaml protocol manifest
|
|
192
|
+
datasets/ # the paper's MULAN ARFFs (+ chest-xray label CSV)
|
|
193
|
+
result/ # aggregated result CSVs
|
|
194
|
+
CONVENTIONS.md CONTRIBUTING.md CITATION.cff
|
|
195
|
+
paper/ # local copy of the paper source (not tracked)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Testing
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
python -m pytest tests/ -v
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
Every inference rule is checked against **brute-force enumeration** of the expected metric, so the closed-form rules are verified empirically to be optimal on small cases. A batched predictor (one `predict_proba` call per chain level instead of `N·L·2^L` calls) is verified to be numerically equivalent to the reference enumeration.
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## How to cite
|
|
211
|
+
|
|
212
|
+
```bibtex
|
|
213
|
+
@article{nguyen2026probabilistic,
|
|
214
|
+
title = {Probabilistic multi-label classification via a divide-and-conquer and fusion approach},
|
|
215
|
+
author = {Nguyen, Vu-Linh and Hoang, Xuan-Truong and Hoang, Anh and Huynh, Van-Nam},
|
|
216
|
+
journal = {Information Fusion},
|
|
217
|
+
year = {2026},
|
|
218
|
+
pages = {104517},
|
|
219
|
+
issn = {1566-2535},
|
|
220
|
+
doi = {10.1016/j.inffus.2026.104517}
|
|
221
|
+
}
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## References
|
|
227
|
+
|
|
228
|
+
- K. Dembczyński, W. Cheng, E. Hüllermeier. *Bayes Optimal Multilabel Classification via Probabilistic Classifier Chains.* ICML 2010.
|
|
229
|
+
- K. Dembczyński, W. Waegeman, W. Cheng, E. Hüllermeier. *An Exact Algorithm for F-Measure Maximization.* NeurIPS 2011.
|
|
230
|
+
- W. Waegeman et al. *On the Bayes-optimality of F-measure maximizers.* JMLR 2014.
|
|
231
|
+
- D. M. W. Powers. *Evaluation: From Precision, Recall and F-Measure to ROC, Informedness, Markedness & Correlation.* 2011.
|
|
232
|
+
- G. Tsoumakas, I. Katakis, I. Vlahavas. *Mining Multi-label Data.* 2010 (MULAN).
|
|
233
|
+
|
|
234
|
+
## Acknowledgements
|
|
235
|
+
|
|
236
|
+
The `dacaf_mlc/skmultiflow/` directory contains a trimmed, vendored subset of
|
|
237
|
+
[scikit-multiflow](https://github.com/scikit-multiflow/scikit-multiflow)
|
|
238
|
+
(the `ClassifierChain` base and its supporting utilities), redistributed under
|
|
239
|
+
its original 3-clause BSD license. See
|
|
240
|
+
[`dacaf_mlc/skmultiflow/LICENSE`](dacaf_mlc/skmultiflow/LICENSE) for the full text.
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
The original DaCaF code is released under the MIT License; see [LICENSE](LICENSE). Vendored third-party
|
|
245
|
+
code retains its own license as noted in Acknowledgements above.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""DaCaF: Bayes-optimal inference for probabilistic multi-label classification.
|
|
2
|
+
|
|
3
|
+
Official implementation of the divide-and-conquer and fusion (DaCaF) approach
|
|
4
|
+
described in Nguyen et al., *Information Fusion* (2026),
|
|
5
|
+
https://doi.org/10.1016/j.inffus.2026.104517
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dacaf_mlc.evaluation_metrics import EvaluationMetrics
|
|
9
|
+
from dacaf_mlc.probability_classifier_chains import ProbabilisticClassifierChainCustom
|
|
10
|
+
|
|
11
|
+
__version__ = "1.1.1"
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ProbabilisticClassifierChainCustom",
|
|
15
|
+
"EvaluationMetrics",
|
|
16
|
+
"__version__",
|
|
17
|
+
]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import scipy.io.arff as arff
|
|
3
|
+
from sklearn.model_selection import KFold
|
|
4
|
+
|
|
5
|
+
# Number of label columns in each (dense) MULAN ARFF file used in the paper, keyed by the exact (case-sensitive) dataset name.
|
|
6
|
+
_LABEL_COUNTS = {
|
|
7
|
+
"emotions": 6,
|
|
8
|
+
"scene": 6,
|
|
9
|
+
"yeast": 14,
|
|
10
|
+
"Water-quality": 14,
|
|
11
|
+
"CHD_49": 6,
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MultiLabelArffDataset:
    """Hold the feature and label matrices of a multi-label (MULAN) dataset.

    The data comes either from an ARFF file on disk or from two in-memory
    pandas DataFrames. Features are stored raw (unscaled): callers are
    responsible for applying StandardScaler per fold AFTER the train/test
    split to avoid data leakage.

    Attributes:
        dataset_name: Name used to look up the label count in
            ``_LABEL_COUNTS`` when loading from an ARFF file.
        path: The ``path`` argument given to the constructor (may be None).
        X: Feature matrix as a float numpy array.
        Y: Label matrix as a numpy array (cast to int when read from ARFF).
    """

    def __init__(
        self,
        dataset_name,
        path=None,
        target_at_first=False,
        X=None,
        Y=None,
    ):
        """Initialize the dataset handler.

        Args:
            dataset_name: Dataset identifier; must be a key of
                ``_LABEL_COUNTS`` when the data is read from ``path``.
            path: Location of the ARFF file. Ignored when both ``X`` and
                ``Y`` are pandas DataFrames.
            target_at_first: If True, the label columns are the first
                columns of the ARFF table; otherwise they are the last ones.
            X: Optional feature DataFrame (in-memory construction).
            Y: Optional label DataFrame (in-memory construction).

        Raises:
            TypeError: If neither a ``path`` nor a DataFrame pair is given.
            ValueError: If ``dataset_name`` has no known label count.
        """
        self.dataset_name = dataset_name
        # Assigned on both construction routes so that `path` always exists
        # as an attribute (it used to be missing for in-memory datasets).
        self.path = path

        # In-memory route: only taken when BOTH inputs are DataFrames.
        if isinstance(X, pd.DataFrame) and isinstance(Y, pd.DataFrame):
            self.X, self.Y = self._to_numpy(X, Y)
            return

        if path is None:
            # Fail with an explicit message instead of the opaque TypeError
            # that opening a None path would produce further down.
            raise TypeError(
                "MultiLabelArffDataset needs either `path` to an ARFF file "
                "or both `X` and `Y` as pandas DataFrames."
            )

        records, _meta = arff.loadarff(path)
        df = pd.DataFrame(records)

        n_labels = self._get_label_count()
        if target_at_first:
            Y_df = df.iloc[:, :n_labels].astype(int)
            X_df = df.iloc[:, n_labels:]
        else:
            X_df = df.iloc[:, :-n_labels]
            Y_df = df.iloc[:, -n_labels:].astype(int)

        self.X, self.Y = self._to_numpy(X_df, Y_df)

    def _to_numpy(self, X, Y):
        """Convert DataFrames to numpy arrays (no scaling applied here).

        Args:
            X: Feature DataFrame; converted to a float array.
            Y: Label DataFrame; converted as-is.

        Returns:
            Tuple ``(X_np, Y_np)`` of numpy arrays. Both shapes are printed.
        """
        X_np = X.to_numpy().astype(float)
        Y_np = Y.to_numpy()
        print(f"Features shape: {X_np.shape}")
        print(f"Labels shape: {Y_np.shape}")
        return X_np, Y_np

    def _get_label_count(self):
        """Return the number of labels for the dataset.

        Raises:
            ValueError: If ``self.dataset_name`` is not a key of
                ``_LABEL_COUNTS`` (the lookup is case-sensitive).
        """
        try:
            return _LABEL_COUNTS[self.dataset_name]
        except KeyError:
            # ValueError is still an Exception, so existing handlers that
            # catch the previous bare Exception keep working.
            raise ValueError(
                f"Dataset '{self.dataset_name}' is not supported. "
                f"Known datasets: {list(_LABEL_COUNTS)}"
            ) from None

    def get_cross_validation_folds(self, n_splits=5, shuffle=True, random_state=None):
        """Yield (train_index, test_index) tuples for k-fold cross-validation.

        Args:
            n_splits: Number of folds.
            shuffle: Whether to shuffle before splitting.
            random_state: Random seed for reproducibility.

        Yields:
            The index pairs produced by ``KFold.split`` on ``self.X``/``self.Y``.
        """
        # Plain (non-stratified) KFold.
        kfold = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
        yield from kfold.split(self.X, self.Y)
|
|
File without changes
|