ndscape 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ndscape-0.1.0/PKG-INFO +105 -0
- ndscape-0.1.0/ndscape/README.md +86 -0
- ndscape-0.1.0/ndscape/__init__.py +21 -0
- ndscape-0.1.0/ndscape/api.py +367 -0
- ndscape-0.1.0/ndscape.egg-info/PKG-INFO +105 -0
- ndscape-0.1.0/ndscape.egg-info/SOURCES.txt +9 -0
- ndscape-0.1.0/ndscape.egg-info/dependency_links.txt +1 -0
- ndscape-0.1.0/ndscape.egg-info/requires.txt +6 -0
- ndscape-0.1.0/ndscape.egg-info/top_level.txt +1 -0
- ndscape-0.1.0/pyproject.toml +33 -0
- ndscape-0.1.0/setup.cfg +4 -0
ndscape-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ndscape
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fit, score, and embed nested-dichotomy (ND) trees for multi-class classification.
|
|
5
|
+
Author: Maxwell Dix-Matthews
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/res-lucid/ndscape
|
|
8
|
+
Keywords: nested-dichotomy,multi-class,classification,scikit-learn
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering
|
|
12
|
+
Requires-Python: >=3.9
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: numpy>=1.24
|
|
15
|
+
Requires-Dist: scikit-learn>=1.3
|
|
16
|
+
Provides-Extra: spatial
|
|
17
|
+
Requires-Dist: esda>=2.6; extra == "spatial"
|
|
18
|
+
Requires-Dist: libpysal>=4.10; extra == "spatial"
|
|
19
|
+
|
|
20
|
+
# ndscape
|
|
21
|
+
|
|
22
|
+
Fit, score, and embed nested-dichotomy (ND) trees for multi-class classification.
|
|
23
|
+
|
|
24
|
+
A nested dichotomy reduces a C-class problem to a tree of binary splits
|
|
25
|
+
(e.g. {0,1,2} vs {3,4}, then {0} vs {1,2}, ...). ndscape lets you fit one,
|
|
26
|
+
score it, or place a whole population of candidate trees in a 2-D
|
|
27
|
+
"tree-space" to see how a property (accuracy, variance, ...) varies across
|
|
28
|
+
tree structures.
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
pip install ndscape
|
|
34
|
+
pip install ndscape[spatial] # adds Moran's I support (esda, libpysal)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quickstart
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from sklearn.datasets import load_iris
|
|
41
|
+
from sklearn.model_selection import train_test_split
|
|
42
|
+
import ndscape as nds
|
|
43
|
+
|
|
44
|
+
X, y = load_iris(return_X_y=True)
|
|
45
|
+
classes = sorted(set(y))
|
|
46
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
|
|
47
|
+
|
|
48
|
+
nd = nds.fit(X_train, y_train, classes=classes, base="lr")
|
|
49
|
+
nd.predict(X_test)
|
|
50
|
+
nd.score(X_test, y_test) # {"accuracy": ..., "logloss": ...}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
`classes` is the list of class labels in your `y` (`sorted(set(y))` works for
|
|
54
|
+
integer or string labels). `nds.fit` samples one ND tree automatically; pass
|
|
55
|
+
`tree=...` to use a specific one (see "A `tree` is..." below).
|
|
56
|
+
|
|
57
|
+
## Use cases
|
|
58
|
+
|
|
59
|
+
**You have a dataset and a binary classifier.**
|
|
60
|
+
|
|
61
|
+
`base` can be the string `"lr"` or `"decisiontree"`, or your own unfitted
|
|
62
|
+
scikit-learn estimator — a fresh clone of it is fit at every split.
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from sklearn.svm import SVC
|
|
66
|
+
|
|
67
|
+
nd = nds.fit(X_train, y_train, classes=classes, base=SVC(probability=True, kernel="linear"))
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**You have a train/test split and want a score.**
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
nd = nds.ND(tree, classes).fit(X_train, y_train, base="lr")
|
|
74
|
+
nd.score(X_test, y_test) # {"accuracy": ..., "logloss": ...}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**You already trained the per-split models yourself.**
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
# models in the same order as tree, or a {(left, right): model} dict — either works
|
|
81
|
+
nd = nds.ND.from_trained(tree, classes, models=[fitted_model_1, fitted_model_2, ...])
|
|
82
|
+
nd.predict_proba(X_test)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**You already scored a set of trees and want to see where they sit in tree-space.**
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
trees, coords = nds.embed_trees(classes)
|
|
89
|
+
nds.spatial_autocorrelation(my_scores, coords) # {"I": ..., "p_sim": ...}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**You just want the whole picture: fit, score, and embed every candidate tree.**
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
rows = nds.analyze(X_train, y_train, classes, X_test=X_test, y_test=y_test, base="lr")
|
|
96
|
+
# [{"tree": ..., "accuracy": ..., "logloss": ..., "coord": array([...])}, ...]
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
A `tree` is a list of `(left, right)` tuples of class labels, e.g.
|
|
100
|
+
`[((0, 1), (2, 3)), ((0,), (1,)), ((2,), (3,))]`. Use `nds.all_trees(classes)`
|
|
101
|
+
(exhaustive, for small C) or `nds.sample_trees(classes, N)` (for larger C)
|
|
102
|
+
to generate candidates.
|
|
103
|
+
|
|
104
|
+
`base` accepts `"lr"`, `"decisiontree"`, or any unfitted scikit-learn
|
|
105
|
+
estimator with `fit`/`predict_proba`.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# ndscape
|
|
2
|
+
|
|
3
|
+
Fit, score, and embed nested-dichotomy (ND) trees for multi-class classification.
|
|
4
|
+
|
|
5
|
+
A nested dichotomy reduces a C-class problem to a tree of binary splits
|
|
6
|
+
(e.g. {0,1,2} vs {3,4}, then {0} vs {1,2}, ...). ndscape lets you fit one,
|
|
7
|
+
score it, or place a whole population of candidate trees in a 2-D
|
|
8
|
+
"tree-space" to see how a property (accuracy, variance, ...) varies across
|
|
9
|
+
tree structures.
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
pip install ndscape
|
|
15
|
+
pip install ndscape[spatial] # adds Moran's I support (esda, libpysal)
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Quickstart
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from sklearn.datasets import load_iris
|
|
22
|
+
from sklearn.model_selection import train_test_split
|
|
23
|
+
import ndscape as nds
|
|
24
|
+
|
|
25
|
+
X, y = load_iris(return_X_y=True)
|
|
26
|
+
classes = sorted(set(y))
|
|
27
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
|
|
28
|
+
|
|
29
|
+
nd = nds.fit(X_train, y_train, classes=classes, base="lr")
|
|
30
|
+
nd.predict(X_test)
|
|
31
|
+
nd.score(X_test, y_test) # {"accuracy": ..., "logloss": ...}
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
`classes` is the list of class labels in your `y` (`sorted(set(y))` works for
|
|
35
|
+
integer or string labels). `nds.fit` samples one ND tree automatically; pass
|
|
36
|
+
`tree=...` to use a specific one (see "A `tree` is..." below).
|
|
37
|
+
|
|
38
|
+
## Use cases
|
|
39
|
+
|
|
40
|
+
**You have a dataset and a binary classifier.**
|
|
41
|
+
|
|
42
|
+
`base` can be the string `"lr"` or `"decisiontree"`, or your own unfitted
|
|
43
|
+
scikit-learn estimator — a fresh clone of it is fit at every split.
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from sklearn.svm import SVC
|
|
47
|
+
|
|
48
|
+
nd = nds.fit(X_train, y_train, classes=classes, base=SVC(probability=True, kernel="linear"))
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**You have a train/test split and want a score.**
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
nd = nds.ND(tree, classes).fit(X_train, y_train, base="lr")
|
|
55
|
+
nd.score(X_test, y_test) # {"accuracy": ..., "logloss": ...}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
**You already trained the per-split models yourself.**
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
# models in the same order as tree, or a {(left, right): model} dict — either works
|
|
62
|
+
nd = nds.ND.from_trained(tree, classes, models=[fitted_model_1, fitted_model_2, ...])
|
|
63
|
+
nd.predict_proba(X_test)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
**You already scored a set of trees and want to see where they sit in tree-space.**
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
trees, coords = nds.embed_trees(classes)
|
|
70
|
+
nds.spatial_autocorrelation(my_scores, coords) # {"I": ..., "p_sim": ...}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**You just want the whole picture: fit, score, and embed every candidate tree.**
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
rows = nds.analyze(X_train, y_train, classes, X_test=X_test, y_test=y_test, base="lr")
|
|
77
|
+
# [{"tree": ..., "accuracy": ..., "logloss": ..., "coord": array([...])}, ...]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
A `tree` is a list of `(left, right)` tuples of class labels, e.g.
|
|
81
|
+
`[((0, 1), (2, 3)), ((0,), (1,)), ((2,), (3,))]`. Use `nds.all_trees(classes)`
|
|
82
|
+
(exhaustive, for small C) or `nds.sample_trees(classes, N)` (for larger C)
|
|
83
|
+
to generate candidates.
|
|
84
|
+
|
|
85
|
+
`base` accepts `"lr"`, `"decisiontree"`, or any unfitted scikit-learn
|
|
86
|
+
estimator with `fit`/`predict_proba`.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""ndscape: fit, score, and embed nested-dichotomy (ND) trees for multi-class classification."""
|
|
2
|
+
|
|
3
|
+
from .api import (
|
|
4
|
+
ND,
|
|
5
|
+
all_trees,
|
|
6
|
+
analyze,
|
|
7
|
+
embed_trees,
|
|
8
|
+
fit,
|
|
9
|
+
sample_trees,
|
|
10
|
+
spatial_autocorrelation,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ND",
|
|
15
|
+
"all_trees",
|
|
16
|
+
"analyze",
|
|
17
|
+
"embed_trees",
|
|
18
|
+
"fit",
|
|
19
|
+
"sample_trees",
|
|
20
|
+
"spatial_autocorrelation",
|
|
21
|
+
]
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""Fit, score, and embed nested-dichotomy (ND) trees on real datasets.
|
|
2
|
+
|
|
3
|
+
Self-contained: this module does not import core.py, config.py, or anything
|
|
4
|
+
else from the rest of the repo, so it can be packaged and installed on its
|
|
5
|
+
own via pip.
|
|
6
|
+
|
|
7
|
+
A tree is a list of (left, right) tuples of class labels, e.g.
|
|
8
|
+
[((0, 1), (2, 3)), ((0,), (1,)), ((2,), (3,))]
|
|
9
|
+
read top-down: first split {0,1} vs {2,3}, then split each side further.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from itertools import combinations
|
|
13
|
+
from math import comb, sqrt
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from sklearn.base import clone
|
|
17
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
18
|
+
from sklearn.linear_model import LogisticRegression
|
|
19
|
+
from sklearn.manifold import MDS
|
|
20
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
21
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------- models
|
|
25
|
+
|
|
26
|
+
def _get_model(base):
    """Return a new, unfitted binary classifier described by `base`.

    `base` is either a string shorthand -- "lr" for logistic regression,
    "decisiontree" (or "dt") for a decision tree -- or an unfitted
    scikit-learn estimator such as SVC(probability=True). Estimators are
    cloned, so the object passed in is never fitted itself and every split
    gets its own copy.

    Raises ValueError when `base` is a string that is not a known shorthand.
    """
    # Anything that is not a string is treated as an estimator template.
    if not isinstance(base, str):
        return clone(base)

    if base == "lr":
        return LogisticRegression(
            penalty="l2", solver="newton-cholesky", C=0.1, max_iter=2000
        )
    if base in ("decisiontree", "dt"):
        return DecisionTreeClassifier(
            criterion="entropy", min_samples_leaf=5, random_state=0
        )

    message = (
        f"Unknown base model '{base}'. Pass 'lr', 'decisiontree', "
        "or an unfitted scikit-learn estimator."
    )
    raise ValueError(message)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------- tree enumeration
|
|
50
|
+
|
|
51
|
+
def _gen(labels):
    """Yield every nested-dichotomy tree over `labels`.

    Each tree is a tuple of (group_a, group_b) splits: the top split first,
    followed by the splits of the first-built side and then of the other
    side. Within a split the larger group comes first; equal-sized groups
    are ordered by their sorted labels. Zero or one label yields a single
    empty tree.
    """
    ordered = tuple(sorted(labels))
    if len(ordered) <= 1:
        yield ()
        return

    # Pinning the smallest label to the "left" side visits each unordered
    # two-way partition exactly once.
    anchor, others = ordered[0], ordered[1:]
    for left_size in range(1, len(ordered)):
        for companions in combinations(others, left_size - 1):
            left = tuple(sorted((anchor,) + companions))
            right = tuple(label for label in ordered if label not in left)
            split = tuple(sorted((left, right), key=lambda side: (-len(side), side)))

            # A single-label side needs no further splitting.
            left_subtrees = tuple(_gen(left)) if len(left) != 1 else ((),)
            right_subtrees = tuple(_gen(right)) if len(right) != 1 else ((),)

            yield from (
                (split,) + left_sub + right_sub
                for left_sub in left_subtrees
                for right_sub in right_subtrees
            )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def all_trees(classes):
    """Return a list of every ND tree over `classes`.

    The enumeration is exhaustive, so the list grows very quickly with the
    number of classes -- use sample_trees when there are many.
    """
    labels = tuple(classes)
    return [tree for tree in _gen(labels)]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _sample_tree_indices(n, N):
    """Sample up to N ND trees over positions 0..n-1, proportionally balanced.

    The sample is deterministic (no randomness is involved). The budget of
    trees is divided between the possible top-level split sizes in
    proportion to how many trees each size accounts for, then evenly between
    the concrete splits of that size, and finally between the two subtrees,
    recursively.

    Parameters
    ----------
    n : number of positions (classes) the trees are built over.
    N : maximum number of trees wanted. At most the total number of trees
        over n positions is returned; for n >= 3 a zero or negative N gives
        an empty list.

    Returns
    -------
    A list of trees, each a tuple of (group_a, group_b) tuples of positions.
    """
    # tree_counts[m] is the number of distinct ND trees over m labels:
    # 1 for m in (1, 2), then multiplied by (2m - 3) for each extra label.
    tree_counts = [0, 1, 1] + [0] * max(0, n - 2)
    for m in range(3, n + 1):
        tree_counts[m] = (2 * m - 3) * tree_counts[m - 1]

    def _draw(classes, budget):
        """Return `budget` distinct trees over `classes` (one tree when m <= 2)."""
        m = len(classes)
        if m <= 1:
            return [()]
        if m == 2:
            return [((tuple(classes[:1]), tuple(classes[1:])),)]
        if budget == 1:
            # A single tree: split down the middle, recursively.
            k = m // 2
            right, left = tuple(classes[k:]), tuple(classes[:k])
            return [((right, left),) + _draw(list(right), 1)[0] + _draw(list(left), 1)[0]]

        # Number of trees whose top split has a smaller side of size k.
        # When both sides have equal size each partition is counted once.
        left_sizes = range(1, m // 2 + 1)
        weights = [
            comb(m, k) // (2 if 2 * k == m else 1) * tree_counts[k] * tree_counts[m - k]
            for k in left_sizes
        ]
        total = sum(weights)

        # Largest-remainder apportionment of the budget over split sizes.
        alloc = [budget * w // total for w in weights]
        for i in sorted(range(len(weights)), key=lambda i: -(budget * weights[i] % total))[
            : budget - sum(alloc)
        ]:
            alloc[i] += 1

        out = []
        for k, k_budget in zip(left_sizes, alloc):
            if k_budget == 0:
                continue
            n_combos = comb(m, k) // (2 if 2 * k == m else 1)
            per_combo, remainder = divmod(k_budget, n_combos)
            seen = 0
            for left in combinations(classes, k):
                # For an even split keep only the half containing classes[0],
                # otherwise every partition would be visited twice.
                if 2 * k == m and classes[0] not in left:
                    continue
                # The first `remainder` splits receive one extra tree.
                combo_budget = per_combo + (seen < remainder)
                seen += 1
                if combo_budget == 0:
                    if per_combo == 0 and seen >= remainder:
                        # No later split of this size gets any budget either.
                        break
                    continue
                left = tuple(left)
                right = tuple(x for x in classes if x not in left)

                # Choose left_budget * right_budget >= combo_budget, roughly
                # in proportion to how many subtrees each side has, without
                # asking either side for more subtrees than exist.
                left_budget = int(sqrt(combo_budget * tree_counts[k] / tree_counts[m - k]) + 0.5)
                left_budget = max(left_budget, (combo_budget + tree_counts[m - k] - 1) // tree_counts[m - k])
                left_budget = min(left_budget, tree_counts[k])
                right_budget = (combo_budget + left_budget - 1) // left_budget

                # The left subtrees do not depend on the right subtree being
                # paired with them, so build them once per split.
                left_subs = _draw(list(left), left_budget)
                drawn = 0
                for right_sub in _draw(list(right), right_budget):
                    for left_sub in left_subs:
                        out.append(((right, left),) + right_sub + left_sub)
                        drawn += 1
                        if drawn == combo_budget:
                            break
                    if drawn == combo_budget:
                        break
        return out

    # Clamp at zero: a negative budget would otherwise reach sqrt() of a
    # negative number inside _draw.
    return _draw(list(range(n)), max(0, min(N, tree_counts[n])))
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _relabel_tree(tree, classes):
    """Translate a tree over positions 0..n-1 into a tree over `classes`.

    Position i is replaced by the i-th element of `classes`. Returns a list
    of (left, right) tuples in the same order as the splits of `tree`.
    """
    label_of = {position: label for position, label in enumerate(classes)}
    relabelled = []
    for left, right in tree:
        new_left = tuple(label_of[i] for i in left)
        new_right = tuple(label_of[i] for i in right)
        relabelled.append((new_left, new_right))
    return relabelled
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def sample_trees(classes, N):
    """Return up to N ND trees over `classes`, proportionally balanced by subtree size.

    Trees are first sampled over the positions 0..len(classes)-1 and then
    translated to the actual labels; each returned tree is a list of
    (left, right) tuples of class labels.
    """
    labels = list(classes)
    index_trees = _sample_tree_indices(len(labels), N)
    relabelled = []
    for index_tree in index_trees:
        relabelled.append(_relabel_tree(index_tree, labels))
    return relabelled
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ---------------------------------------------------------------- ND
|
|
150
|
+
|
|
151
|
+
class _RightOnlySplit:
    """Constant stand-in for a split whose training rows all fell on the right side.

    `predict_proba` gives probability 0 to the left group and 1 to the right
    group for every row, so `ND.predict_proba` can use it like any fitted
    binary model.
    """

    def predict_proba(self, X):
        """Return an (n_rows, 2) array: column 0 (left) all 0, column 1 (right) all 1."""
        proba = np.zeros((np.asarray(X).shape[0], 2))
        proba[:, 1] = 1.0
        return proba


class ND:
    """A single nested-dichotomy tree fitted to a dataset.

    Parameters
    ----------
    tree : list of (left, right) tuples of class labels.
    classes : the full set of class labels, in the order predict_proba columns follow.
    models : optional, already-fitted binary classifiers — skips .fit(). Either
        a list of fitted models in the same order as `tree`, or a dict mapping
        (left, right) -> fitted model. Each model's predict_proba must return
        the left-group probability in column 0 and the right-group probability
        in column 1.
    """

    def __init__(self, tree, classes, models=None):
        # Normalise every split to a pair of tuples so it can be a dict key.
        self.tree = [(tuple(left), tuple(right)) for left, right in tree]
        self.classes = list(classes)
        if models is None:
            self.models = {}
        elif isinstance(models, dict):
            self.models = dict(models)
        else:
            # Positional models are matched to the splits in tree order.
            self.models = dict(zip(self.tree, models))

    @classmethod
    def from_trained(cls, tree, classes, models):
        """Wrap a tree whose per-split binary models are already fitted.

        `models` is a list of fitted models in the same order as `tree`,
        or a dict mapping (left, right) -> fitted model.
        """
        return cls(tree, classes, models=models)

    def fit(self, X, y, base="lr"):
        """Fit one binary classifier per split on (X, y). Returns self.

        `base` is "lr", "decisiontree", or any unfitted scikit-learn
        estimator, e.g. base=SVC(probability=True, kernel="linear").

        Each split is trained only on the rows whose label belongs to one of
        its two groups, with target 0 for the left group and 1 for the right
        group. A split that sees rows from only one group cannot be trained:
        it is stored as None when no right-group rows are present (this
        includes a node with no rows at all) and as a constant right-side
        stand-in when only right-group rows are present.
        """
        X, y = np.asarray(X), np.asarray(y)
        for left, right in self.tree:
            mask = np.isin(y, left + right)
            y_node = np.where(np.isin(y[mask], left), 0, 1)
            present = np.unique(y_node)
            if len(present) < 2:
                # Remember WHICH side was observed; sending all the mass to
                # the left when only right-group rows exist would give the
                # observed classes probability zero.
                only_right = len(present) == 1 and present[0] == 1
                self.models[(left, right)] = _RightOnlySplit() if only_right else None
                continue
            model = _get_model(base)
            model.fit(X[mask], y_node)
            self.models[(left, right)] = model
        return self

    def predict_proba(self, X):
        """Class-probability matrix, columns ordered as `self.classes`.

        A class's probability is the product, over every split that contains
        it, of the probability of the side it sits on; rows are then
        renormalised to sum to 1 (all-zero rows are left as zeros).
        """
        X = np.asarray(X)
        idx = {c: i for i, c in enumerate(self.classes)}
        proba = np.ones((X.shape[0], len(self.classes)))
        for left, right in self.tree:
            model = self.models.get((left, right))
            li = [idx[c] for c in left]
            ri = [idx[c] for c in right]
            if model is None:
                # No model for this split: keep all the mass on the left side.
                proba[:, ri] = 0.0
                continue
            p = model.predict_proba(X)
            proba[:, li] *= p[:, 0:1]
            proba[:, ri] *= p[:, 1:2]
        row_sums = proba.sum(axis=1, keepdims=True)
        # Guard against division by zero for rows with no probability mass.
        return proba / np.where(row_sums == 0, 1.0, row_sums)

    def predict(self, X):
        """Predicted class label per row."""
        proba = self.predict_proba(X)
        return np.asarray(self.classes)[proba.argmax(axis=1)]

    def score(self, X, y):
        """Accuracy and mean log-loss of this ND on (X, y).

        Returns {"accuracy": float, "logloss": float}. Every label in `y`
        must appear in `self.classes`, otherwise a KeyError is raised.
        Probabilities are clipped to [1e-15, 1] before taking the log.
        """
        y = np.asarray(y)
        proba = self.predict_proba(X)
        pred = np.asarray(self.classes)[proba.argmax(axis=1)]
        idx = {c: i for i, c in enumerate(self.classes)}
        y_idx = np.array([idx[v] for v in y])
        logloss = -np.log(np.clip(proba[np.arange(len(y)), y_idx], 1e-15, 1.0)).mean()
        return {"accuracy": float((pred == y).mean()), "logloss": float(logloss)}
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def fit(X, y, classes, tree=None, base="lr"):
    """Fit a single ND on (X, y) and return it.

    When `tree` is not given, one tree over `classes` is sampled
    automatically. `base` selects the binary model trained at each split:
    "lr", "decisiontree", or any unfitted scikit-learn estimator, e.g.
    base=SVC(probability=True, kernel="linear").
    """
    labels = list(classes)
    chosen_tree = sample_trees(labels, 1)[0] if tree is None else tree
    model = ND(chosen_tree, labels)
    return model.fit(X, y, base=base)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# ---------------------------------------------------------------- embedding + spatial stats
|
|
248
|
+
|
|
249
|
+
def _split_analyzer(n_classes):
    """Build the TF-IDF token-analyser function for ND trees.

    Returns a callable that maps one tree (an iterable of (left, right)
    splits) to a list of string tokens describing its structure; repeated
    tokens act as weights. `n_classes` sets the depth cut-off and switches
    on the extra small-group tokens (only for 7 or more classes).

    Same scheme used to build the paper's artifacts.
    """
    def _analyze(doc):
        """Return the token list for the tree `doc`."""
        splits = doc
        # Largest union first, so a parent split is always visited before
        # its children and `depth` is filled in before it is looked up.
        splits = sorted(splits, key=lambda lr: -len(set(lr[0]) | set(lr[1])))

        # Splits deeper than max_depth still register their children's
        # depth but contribute no tokens.
        max_depth = min(8, n_classes - 2)
        small_group = 3
        all_classes = frozenset(x for L, R in splits for x in tuple(L) + tuple(R))
        # Maps the label set of a node to its depth; the root (all labels) is 0.
        depth = {all_classes: 0}
        # Repeat count of the small-group tokens; 0 for six or fewer classes.
        extra_depth_w = max(0, n_classes - 6)
        toks = []

        for L, R in splits:
            U = frozenset(L) | frozenset(R)
            d = depth[U]
            # Only multi-label sides are split again, so only they need a depth.
            if len(L) > 1:
                depth[frozenset(L)] = d + 1
            if len(R) > 1:
                depth[frozenset(R)] = d + 1
            if d > max_depth:
                continue

            # Canonical orientation: larger side first, ties broken by labels,
            # so the tokens do not depend on how the split was written.
            L, R = tuple(sorted(L)), tuple(sorted(R))
            A, B = sorted([L, R], key=lambda x: (-len(x), x))
            dbin = min(d, 5)  # depths of 5 and more share one bucket
            w = max(1, max_depth + 1 - d)  # shallower splits weigh more

            # Shape of the split (sizes of the two sides).
            toks.extend([f"sz:{len(A)}|{len(B)}"] * max(1, w // 2))
            if d == 0:
                toks.extend([f"root:{A}|{B}"] * 2)
                toks.extend([f"root_sz:{len(A)}|{len(B)}"] * 4)

            # One group of tokens per pair of labels this split separates.
            for i in A:
                for j in B:
                    a, b = sorted((i, j))
                    toks.extend([f"sep:{a}-{b}:sz:{len(A)}|{len(B)}"] * max(1, w // 2))
                    toks.append(f"sep:{a}-{b}:d{dbin}")
                    if d == 0:
                        toks.extend([f"root_sep:{a}-{b}"] * 2)
                        toks.extend([f"root_sep:{a}-{b}:sz:{len(A)}|{len(B)}"] * 2)

            # One token per pair of labels that stays together on a side.
            for side in (A, B):
                for i, j in combinations(side, 2):
                    toks.append(f"same:{i}-{j}")
                # NOTE(review): nesting reconstructed from a flattened listing;
                # this guard is assumed to run once per side, not once per
                # label pair -- confirm against the original file.
                if n_classes >= 7 and 1 < len(side) <= small_group:
                    toks.extend([f"small_group_size:{len(side)}:d{dbin}"] * extra_depth_w)

        return toks
    return _analyze
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def embed_trees(classes, trees=None, N=2000, seed=0, dim=2):
    """Embed a set of ND trees in `dim`-dimensional tree-space.

    Every tree is turned into an L2-normalised token-count vector, pairwise
    distances are derived from the cosine similarity of those vectors, and
    MDS places the trees so as to respect the distances.

    When `trees` is not given, every tree is used for up to 7 classes and a
    balanced sample of N trees otherwise. `seed` is the MDS random state.

    Returns (trees, coords), where coords[i] is the embedding of trees[i].
    """
    labels = list(classes)
    n_labels = len(labels)
    if trees is None:
        if n_labels <= 7:
            trees = all_trees(labels)
        else:
            trees = sample_trees(labels, N)

    vectorizer = TfidfVectorizer(
        analyzer=_split_analyzer(n_labels), use_idf=False, norm="l2"
    )
    similarity = cosine_similarity(vectorizer.fit_transform(trees))

    # sqrt(2 - 2*cos) is the Euclidean distance between unit vectors; the
    # clamp absorbs tiny negative values caused by rounding.
    distances = np.sqrt(np.maximum(0, 2.0 * (1.0 - similarity)))
    np.fill_diagonal(distances, 0.0)

    embedder = MDS(
        n_components=dim,
        dissimilarity="precomputed",
        random_state=seed,
        n_init=1,
        max_iter=1000,
        eps=1e-6,
        n_jobs=-1,
    )
    return trees, embedder.fit_transform(distances)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def spatial_autocorrelation(values, coords, k=50, permutations=999):
    """Global Moran's I for `values` over a tree-space embedding.

    `values[i]` is the quantity measured for the tree located at
    `coords[i]`. Neighbourhoods are the `k` nearest neighbours of each
    point, with `k` capped at len(coords) - 1. `permutations` is passed on
    to esda's Moran.

    Returns {"I": ..., "p_sim": ...}, taken from the fitted Moran object.

    Needs the optional 'spatial' extra: pip install ndscape[spatial]
    Raises ImportError when esda or libpysal is not installed.
    """
    # Imported lazily so the rest of the package works without the extra.
    try:
        from esda import Moran
        from libpysal.weights import KNN
    except ImportError as e:
        raise ImportError(
            "spatial_autocorrelation needs esda and libpysal. "
            "Install with: pip install ndscape[spatial]"
        ) from e

    import warnings
    # A point cannot have more neighbours than there are other points.
    k = min(k, len(coords) - 1)
    # NOTE(review): block extent reconstructed from a flattened listing; the
    # warning filter is assumed to cover only the weights construction --
    # confirm against the original file.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        w = KNN.from_array(coords, k=k)
    w.transform = "r"  # presumably libpysal's row-standardised weights -- verify
    mi = Moran(np.asarray(values), w, permutations=permutations)
    return {"I": mi.I, "p_sim": mi.p_sim}
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def analyze(X, y, classes, X_test=None, y_test=None, base="lr", N=2000, seed=0):
    """Fit, score, and embed every candidate ND tree over `classes`.

    Candidate trees and their tree-space coordinates come from embed_trees
    (all trees for up to 7 classes, otherwise a sample of N). Each tree is
    fitted on (X, y) with `base` and scored on (X_test, y_test); whichever
    of X_test / y_test is omitted falls back to X / y respectively.

    Returns a list of dicts with keys "tree", "accuracy", "logloss" and
    "coord", one per tree.
    """
    labels = list(classes)
    eval_X = X_test if X_test is not None else X
    eval_y = y_test if y_test is not None else y

    trees, coords = embed_trees(labels, N=N, seed=seed)

    def _row(tree, coord):
        # Fit a fresh ND for this tree and collect its scores.
        scores = ND(tree, labels).fit(X, y, base=base).score(eval_X, eval_y)
        return {
            "tree": tree,
            "accuracy": scores["accuracy"],
            "logloss": scores["logloss"],
            "coord": coord,
        }

    return [_row(tree, coord) for tree, coord in zip(trees, coords)]
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ndscape
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fit, score, and embed nested-dichotomy (ND) trees for multi-class classification.
|
|
5
|
+
Author: Maxwell Dix-Matthews
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/res-lucid/ndscape
|
|
8
|
+
Keywords: nested-dichotomy,multi-class,classification,scikit-learn
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering
|
|
12
|
+
Requires-Python: >=3.9
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: numpy>=1.24
|
|
15
|
+
Requires-Dist: scikit-learn>=1.3
|
|
16
|
+
Provides-Extra: spatial
|
|
17
|
+
Requires-Dist: esda>=2.6; extra == "spatial"
|
|
18
|
+
Requires-Dist: libpysal>=4.10; extra == "spatial"
|
|
19
|
+
|
|
20
|
+
# ndscape
|
|
21
|
+
|
|
22
|
+
Fit, score, and embed nested-dichotomy (ND) trees for multi-class classification.
|
|
23
|
+
|
|
24
|
+
A nested dichotomy reduces a C-class problem to a tree of binary splits
|
|
25
|
+
(e.g. {0,1,2} vs {3,4}, then {0} vs {1,2}, ...). ndscape lets you fit one,
|
|
26
|
+
score it, or place a whole population of candidate trees in a 2-D
|
|
27
|
+
"tree-space" to see how a property (accuracy, variance, ...) varies across
|
|
28
|
+
tree structures.
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
pip install ndscape
|
|
34
|
+
pip install ndscape[spatial] # adds Moran's I support (esda, libpysal)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quickstart
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from sklearn.datasets import load_iris
|
|
41
|
+
from sklearn.model_selection import train_test_split
|
|
42
|
+
import ndscape as nds
|
|
43
|
+
|
|
44
|
+
X, y = load_iris(return_X_y=True)
|
|
45
|
+
classes = sorted(set(y))
|
|
46
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
|
|
47
|
+
|
|
48
|
+
nd = nds.fit(X_train, y_train, classes=classes, base="lr")
|
|
49
|
+
nd.predict(X_test)
|
|
50
|
+
nd.score(X_test, y_test) # {"accuracy": ..., "logloss": ...}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
`classes` is the list of class labels in your `y` (`sorted(set(y))` works for
|
|
54
|
+
integer or string labels). `nds.fit` samples one ND tree automatically; pass
|
|
55
|
+
`tree=...` to use a specific one (see "A `tree` is..." below).
|
|
56
|
+
|
|
57
|
+
## Use cases
|
|
58
|
+
|
|
59
|
+
**You have a dataset and a binary classifier.**
|
|
60
|
+
|
|
61
|
+
`base` can be the string `"lr"` or `"decisiontree"`, or your own unfitted
|
|
62
|
+
scikit-learn estimator — a fresh clone of it is fit at every split.
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from sklearn.svm import SVC
|
|
66
|
+
|
|
67
|
+
nd = nds.fit(X_train, y_train, classes=classes, base=SVC(probability=True, kernel="linear"))
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**You have a train/test split and want a score.**
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
nd = nds.ND(tree, classes).fit(X_train, y_train, base="lr")
|
|
74
|
+
nd.score(X_test, y_test) # {"accuracy": ..., "logloss": ...}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**You already trained the per-split models yourself.**
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
# models in the same order as tree, or a {(left, right): model} dict — either works
|
|
81
|
+
nd = nds.ND.from_trained(tree, classes, models=[fitted_model_1, fitted_model_2, ...])
|
|
82
|
+
nd.predict_proba(X_test)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**You already scored a set of trees and want to see where they sit in tree-space.**
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
trees, coords = nds.embed_trees(classes)
|
|
89
|
+
nds.spatial_autocorrelation(my_scores, coords) # {"I": ..., "p_sim": ...}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**You just want the whole picture: fit, score, and embed every candidate tree.**
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
rows = nds.analyze(X_train, y_train, classes, X_test=X_test, y_test=y_test, base="lr")
|
|
96
|
+
# [{"tree": ..., "accuracy": ..., "logloss": ..., "coord": array([...])}, ...]
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
A `tree` is a list of `(left, right)` tuples of class labels, e.g.
|
|
100
|
+
`[((0, 1), (2, 3)), ((0,), (1,)), ((2,), (3,))]`. Use `nds.all_trees(classes)`
|
|
101
|
+
(exhaustive, for small C) or `nds.sample_trees(classes, N)` (for larger C)
|
|
102
|
+
to generate candidates.
|
|
103
|
+
|
|
104
|
+
`base` accepts `"lr"`, `"decisiontree"`, or any unfitted scikit-learn
|
|
105
|
+
estimator with `fit`/`predict_proba`.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ndscape
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ndscape"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Fit, score, and embed nested-dichotomy (ND) trees for multi-class classification."
|
|
9
|
+
readme = "ndscape/README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Maxwell Dix-Matthews"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["nested-dichotomy", "multi-class", "classification", "scikit-learn"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"Topic :: Scientific/Engineering",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"numpy>=1.24",
|
|
23
|
+
"scikit-learn>=1.3",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
spatial = ["esda>=2.6", "libpysal>=4.10"]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Repository = "https://github.com/res-lucid/ndscape"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
include = ["ndscape*"]
|
ndscape-0.1.0/setup.cfg
ADDED