champions 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- champions/__init__.py +24 -0
- champions/cli.py +39 -0
- champions/model/__init__.py +0 -0
- champions/model/champions.py +35 -0
- champions/model/datacard.py +48 -0
- champions/model/dataframes.py +225 -0
- champions/model/filter.py +37 -0
- champions/model/settings.py +55 -0
- champions/service/__init__.py +0 -0
- champions/service/darkwing.py +96 -0
- champions/service/eval.py +130 -0
- champions/service/train.py +213 -0
- champions-0.9.0.dist-info/METADATA +33 -0
- champions-0.9.0.dist-info/RECORD +17 -0
- champions-0.9.0.dist-info/WHEEL +4 -0
- champions-0.9.0.dist-info/entry_points.txt +2 -0
- champions-0.9.0.dist-info/licenses/LICENSE +674 -0
@@ -0,0 +1,213 @@
|
|
1
|
+
import itertools
|
2
|
+
import math
|
3
|
+
import sys
|
4
|
+
import time
|
5
|
+
from typing import Optional
|
6
|
+
from pydantic import BaseModel
|
7
|
+
from champions.model.datacard import DataCard, Feature
|
8
|
+
from champions.model.dataframes import (
|
9
|
+
CombinedCategorizedFeature,
|
10
|
+
ContCats,
|
11
|
+
TrainDataframes,
|
12
|
+
CategorizedFeature,
|
13
|
+
)
|
14
|
+
from champions.model.filter import CombineFilter, Filter, SingleFilter
|
15
|
+
from champions.model.settings import TrainSettings
|
16
|
+
from champions.model.champions import Champion, Spore
|
17
|
+
import polars as pl
|
18
|
+
|
19
|
+
|
20
|
+
import logging
|
21
|
+
|
22
|
+
from champions.service.darkwing import Darkwing
|
23
|
+
|
24
|
+
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
class Train(BaseModel):
    """Train one champion per round and target value and persist it.

    A champion is built by ``train_champion``, which recursively splits the
    training data: every split contributes a left and a right ``Filter`` and
    every leaf is returned as a ``Spore`` carrying the accumulated cuts, the
    leaf score and the path (``"l"``/``"r"`` characters) that led to it.
    """

    # Data card; this class reads ``target.values``, ``target.feature_name``,
    # ``train_cat_feature`` and ``train_con_feature`` from it.
    dc: DataCard
    # Training settings; this class reads ``n``, ``max_depth``, ``n_dims``,
    # ``calcs_per_dim`` and ``n_cat`` and uses ``champion_exists`` /
    # ``save_champion`` for persistence.
    settings: TrainSettings
    # Reader for the training data; always replaced in ``model_post_init``.
    darkwing: Optional[Darkwing] = None

    def model_post_init(self, __context) -> None:
        """Create the ``Darkwing`` instance for ``dc`` after model creation."""
        self.darkwing = Darkwing(dc=self.dc)
        super().model_post_init(__context)

    def run(self):
        """Train and save every champion that does not exist yet.

        Loops over ``settings.n`` rounds and, within each round, over all
        ``dc.target.values``. A (round, target) pair for which
        ``settings.champion_exists`` is true is skipped; otherwise a tree is
        trained from an empty path and handed to ``settings.save_champion``.
        """
        logger.info("start training")
        for n in range(self.settings.n):
            for cat in self.dc.target.values:
                if self.settings.champion_exists(target=cat, n=n):
                    logger.info(f"tree {n} for label {cat} already exists")
                    continue

                logger.info(f"train tree {n} for label {cat}")

                spores = self.train_champion(
                    target_filter=self.gen_target_filter(cat=cat),
                    path_filter=[],
                )
                self.settings.save_champion(
                    champion=Champion(spore=spores, target=cat),
                    n=n,
                )

    def gen_target_filter(self, cat) -> Filter:
        """Return a non-inverted filter matching ``target feature = cat``.

        Args:
            cat: the target value to select.

        Returns:
            A ``Filter`` wrapping one ``CombineFilter`` that wraps one
            ``SingleFilter`` on ``dc.target.feature_name``.
        """
        equals_cat = SingleFilter(
            feat_name=self.dc.target.feature_name,
            operator="=",
            value=cat,
        )
        return Filter(
            combine=[CombineFilter(combine=[equals_cat])],
            invert=False,
        )

    def train_champion(
        self, target_filter: Filter, path_filter: list[Filter], depth: str = ""
    ) -> list[Spore]:
        """Recursively grow one tree and return its leaves.

        Args:
            target_filter: filter identifying the target rows (see
                ``gen_target_filter``); passed unchanged to every recursion.
            path_filter: filters of all splits taken from the root to this
                node; empty for the root.
            depth: path to this node, one ``"l"`` or ``"r"`` per split taken.
                Its length is compared against ``settings.max_depth``.

        Returns:
            The spores of all leaves below this node, left subtree first.
        """
        train_df = self.darkwing.read_akt_train(
            # NOTE(review): the keyword really is spelled ``targer_filter``
            # here; presumably it mirrors the parameter name in
            # ``Darkwing.read_akt_train`` -- rename both together or not at all.
            targer_filter=target_filter,
            train_settings=self.settings,
            akt_filters=path_filter,
        )

        is_leaf = (
            train_df.is_final_size() or len(depth) >= self.settings.max_depth
        )
        if not is_leaf:
            self.cater(train_df=train_df)
            # ``cater`` only keeps features whose difference is not too low.
            # If none is left there is nothing to split on, and ``counter``
            # would fail on an empty list, so this node becomes a leaf.
            # NOTE(review): assumes ``score()`` is unaffected by ``cater``.
            is_leaf = not train_df.train_features

        if is_leaf:
            score = train_df.score()
            # Use the module logger like the rest of this class (the root
            # logger ignores per-module logging configuration).
            logger.info(
                f"final spore depth: {depth}, {score} non target {train_df.non_target_df_size} target: {train_df.target_df_size}"
            )
            return [
                Spore(
                    cut=[fil.sql() for fil in path_filter],
                    score=score,
                    depth=depth,
                )
            ]

        left_filter, right_filter = self.counter(train_df=train_df)
        left_spores = self.train_champion(
            target_filter=target_filter,
            path_filter=path_filter + [left_filter],
            depth=depth + "l",
        )
        right_spores = self.train_champion(
            target_filter=target_filter,
            path_filter=path_filter + [right_filter],
            depth=depth + "r",
        )
        return left_spores + right_spores

    def counter(self, train_df: TrainDataframes) -> tuple[Filter, Filter]:
        """Pick the best split feature and return its left/right filters.

        Starts from the single feature with the largest ``calc_diff()`` and
        then tries combinations of 2 .. ``settings.n_dims`` features, taken
        in descending ``calc_diff()`` order; a combination replaces the
        current best only if its ``calc_diff()`` is strictly larger.

        Args:
            train_df: training frames whose ``train_features`` were filled
                by ``cater``.

        Returns:
            ``get_left_right_filter()`` of the best feature found.

        Raises:
            IndexError: if ``train_df.train_features`` is empty.
        """
        ranked_feats = sorted(
            train_df.train_features,
            key=lambda feat: feat.calc_diff(),
            reverse=True,
        )
        best_feat = ranked_feats[0]
        best_feat_diff = best_feat.calc_diff()
        calcs_per_dim = self.settings.calcs_per_dim

        for dim in range(2, self.settings.n_dims + 1):
            combinations = itertools.combinations(ranked_feats, r=dim)
            for n_tried, comb in enumerate(combinations, start=1):
                akt_feature = CombinedCategorizedFeature(
                    comb,
                    non_target_size=train_df.n_count_non_target,
                    target_size=train_df.n_count_target,
                )
                # Evaluate the combination once and reuse the value.
                akt_diff = akt_feature.calc_diff()
                if akt_diff > best_feat_diff:
                    best_feat = akt_feature
                    best_feat_diff = akt_diff

                # A falsy ``calcs_per_dim`` disables the cap.
                # NOTE(review): the check runs after the evaluation and uses
                # ``>``, so ``calcs_per_dim + 1`` combinations are evaluated
                # per dimension -- confirm whether that is intended.
                if calcs_per_dim and n_tried > calcs_per_dim:
                    break

        return best_feat.get_left_right_filter()

    def cater(self, train_df: TrainDataframes):
        """Categorise all trainable features of the data card.

        Categorical features (``dc.train_cat_feature``) are handled first,
        then continuous ones (``dc.train_con_feature``).
        """
        for feat in self.dc.train_cat_feature:
            self.cat_cater(feat=feat, train_df=train_df)
        for feat in self.dc.train_con_feature:
            self.cont_cater(feat=feat, train_df=train_df)

    def cat_cater(self, feat: Feature, train_df: TrainDataframes):
        """Categorise a categorical feature -- not implemented yet.

        The statements that used to follow the ``raise`` (printing the value
        counts of ``train_df.non_target_df_cat``) could never run and were
        removed.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("gibt noch net")

    def get_n_quantiles(self, series: pl.Series, n: int) -> ContCats:
        """Return the distinct ``i / n`` quantiles of ``series`` as cuts.

        Args:
            series: values to take the quantiles of; its name becomes the
                ``feat_name`` of the result.
            n: number of quantile bins; quantiles ``1/n .. (n-1)/n`` are
                evaluated with ``interpolation="lower"``.

        Returns:
            ``ContCats`` with the de-duplicated cuts in evaluation order and
            one label more than cuts (``"0"``, ``"1"``, ...).
        """
        cuts = []
        for i in range(1, n):
            quantile = series.quantile(quantile=i / n, interpolation="lower")
            if quantile not in cuts:
                cuts.append(quantile)

        # k cuts delimit k + 1 categories, labelled "0" .. "k".
        labels = [str(i) for i in range(len(cuts) + 1)]
        return ContCats(cuts=cuts, labels=labels, feat_name=series.name)

    def cont_cater(self, feat: Feature, train_df: TrainDataframes):
        """Categorise a continuous feature with the best number of bins.

        Tries every bin count from 2 to ``settings.n_cat`` and keeps the one
        with the largest ``calc_diff()`` (the smaller bin count wins a tie).
        The winner is appended to ``train_df.train_features`` unless its
        ``is_diff_to_low()`` is true.
        """
        best_feature = self.cont_cater_impl(feat=feat, train_df=train_df, n=2)
        # Keep the best difference around instead of recomputing it for
        # every comparison.
        best_diff = best_feature.calc_diff()

        for n in range(3, self.settings.n_cat + 1):
            candidate = self.cont_cater_impl(feat=feat, train_df=train_df, n=n)
            candidate_diff = candidate.calc_diff()
            if candidate_diff > best_diff:
                best_feature = candidate
                best_diff = candidate_diff

        if not best_feature.is_diff_to_low():
            train_df.train_features.append(best_feature)

    def cont_cater_impl(
        self, feat: Feature, train_df: TrainDataframes, n: int
    ) -> CategorizedFeature:
        """Cut a continuous feature at its weighted ``i / n`` quantiles.

        The rows of ``train_df.df_group`` are sorted by the feature and the
        ``weight`` column is accumulated. For each ``i`` in ``1 .. n-1`` the
        first row whose cumulative weight reaches ``i / n`` is located and
        the cut is linearly interpolated between that row and the one before
        it (the first row's value is used when there is no row before it).
        Duplicate cuts are dropped.

        NOTE(review): ``i / n`` is compared against the raw cumulative
        weight, so this presumably relies on the weights summing to 1 --
        confirm against ``TrainDataframes``.

        Args:
            feat: the continuous feature to categorise.
            train_df: training frames providing ``df_group``.
            n: number of quantile bins requested.

        Returns:
            The result of ``train_df.create_categorized_features`` for the
            computed cuts.
        """
        df_sorted = train_df.df_group[[feat.name, "weight"]].sort(feat.name)
        values = df_sorted[feat.name]
        cumulative_weights = df_sorted["weight"].cum_sum()

        cuts = []
        for i in range(1, n):
            quantile = i / n
            # First row at which the cumulative weight reaches the quantile.
            index = (cumulative_weights >= quantile).arg_true()[0]

            if index > 0:
                weight_below = cumulative_weights[index - 1]
                value_below = values[index - 1]
                # Share of the step from the previous row to this row that
                # is needed to hit the quantile exactly.
                fraction = (quantile - weight_below) / (
                    cumulative_weights[index] - weight_below
                )
                value = value_below + fraction * (values[index] - value_below)
            else:
                value = values[0]

            if value not in cuts:
                cuts.append(value)

        # k cuts delimit k + 1 categories, labelled "0" .. "k".
        labels = [str(i) for i in range(len(cuts) + 1)]
        return train_df.create_categorized_features(
            feat=feat,
            cuts=ContCats(cuts=cuts, labels=labels, feat_name=feat.name),
        )
|
@@ -0,0 +1,33 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: champions
|
3
|
+
Version: 0.9.0
|
4
|
+
Summary: Binary classification with trees whose nodes cut in more than one dimension
|
5
|
+
Project-URL: Homepage, https://gitlab.com/gwhe/champions
|
6
|
+
Project-URL: Issues, https://gitlab.com/gwhe/champions/-/issues
|
7
|
+
Author-email: swayand <stefan.wayand@gmail.com>
|
8
|
+
License-Expression: GPL-3.0-or-later
|
9
|
+
License-File: LICENSE
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Requires-Python: >=3.13
|
13
|
+
Requires-Dist: altair>=5.5.0
|
14
|
+
Requires-Dist: duckdb>=1.3.2
|
15
|
+
Requires-Dist: polars>=1.32.0
|
16
|
+
Requires-Dist: pyarrow>=19.0.1
|
17
|
+
Requires-Dist: pydantic>=2.10.6
|
18
|
+
Requires-Dist: pyyaml>=6.0.2
|
19
|
+
Requires-Dist: scikit-learn>=1.7.1
|
20
|
+
Requires-Dist: typer>=0.15.2
|
21
|
+
Requires-Dist: vl-convert-python>=1.8.0
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
|
24
|
+
# Champions
|
25
|
+
|
26
|
+
|
27
|
+
## Status
|
28
|
+
|
29
|
+
Disclaimer: the documentation still needs to be written.
|
30
|
+
|
31
|
+
## Initial Thoughts
|
32
|
+
|
33
|
+
The idea of this project is to perform binary classification. It departs from some well-established Machine Learning methods in order to test some new ones. Some of these departures are purely technical, while others are more fundamental. This is a complete rewrite of my first private package, which I called Pilze (mushrooms). In German, "Champions" and "Champignons" sound similar, which is why I liked the name. But "Mushrooms" explains the project's purpose very well. Mushrooms are neither plants nor animals, but somewhere in between. In ML, it's all about correlations, and in a very abstract way, neural networks and tree-based algorithms approach this problem from opposite directions. Neural networks look at a single event and try to learn all correlations in a single stage, while tree-based ML algorithms look at all (or many) events and try to find the correlations stage by stage. SVMs are a step further than trees, but in my personal opinion, they focus on the wrong thing. They focus on separation and not on the correlation of the individual classes. The initial idea of this project was to build a tree in which each node makes not just one cut but a set of cuts in more than one dimension. When it comes to implementations, the obvious choice is to use sklearn. It is one of the best software projects ever built, in my opinion. But some things are really annoying, and overall, it is hard to fit new ideas into this very strict framework.
|
@@ -0,0 +1,17 @@
|
|
1
|
+
champions/__init__.py,sha256=Wbt9W5dYpbtuqqoh7K24jtGSMZSrzY5t1fnql0k6_No,641
|
2
|
+
champions/cli.py,sha256=iXfoAxS9-oDyK1gsDonT7O0lj-SgK5rBRnNKiF44rWQ,991
|
3
|
+
champions/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
champions/model/champions.py,sha256=2avlOipAR78d1N-xqExWT0a4WdDKEoF_wwcrPGPNJn0,799
|
5
|
+
champions/model/datacard.py,sha256=I2yhhsw5JN9Au6sKIJvvfAYmHzflsKh4eiOyTb9XlsY,1175
|
6
|
+
champions/model/dataframes.py,sha256=nGdec3ubQI5z-Xsf_aljtN1K3PtGElKy1Ms7akMjEZE,7516
|
7
|
+
champions/model/filter.py,sha256=T-M-W9rYMY_Ool7N-CbDGe3ecPMU2gFrSLS5NDVPrjM,1364
|
8
|
+
champions/model/settings.py,sha256=qjLw9fUx2KScYzsh3-53gEcE6tdc0HI-vH0di8Uf6Jg,2118
|
9
|
+
champions/service/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
champions/service/darkwing.py,sha256=hXEzXhsOTUB0qEmIaWySXMiGkNfWjq5Hui1prAQEfD8,3189
|
11
|
+
champions/service/eval.py,sha256=lxy5jkExMTpJdtNHeEQj3jYn4DezWnhv5urMwYwCnB8,4689
|
12
|
+
champions/service/train.py,sha256=lJP01OikyjnLy4MgmBpool7IuNnzd9TK7zT53W4LOWo,7329
|
13
|
+
champions-0.9.0.dist-info/METADATA,sha256=9YbdlkbofJ3DsOI8Otf1fsXmAJegZwGNIp-eGhHpfng,2274
|
14
|
+
champions-0.9.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
15
|
+
champions-0.9.0.dist-info/entry_points.txt,sha256=Jrat4iyGkU0OttDrUreuSo9fbaA5iJSsV2TPdiy_Sok,48
|
16
|
+
champions-0.9.0.dist-info/licenses/LICENSE,sha256=1neeRA1TYb6baPpggguJlo8nWoiIlnBvhF_Sb1DTWhs,35139
|
17
|
+
champions-0.9.0.dist-info/RECORD,,
|