champions 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ import itertools
2
+ import math
3
+ import sys
4
+ import time
5
+ from typing import Optional
6
+ from pydantic import BaseModel
7
+ from champions.model.datacard import DataCard, Feature
8
+ from champions.model.dataframes import (
9
+ CombinedCategorizedFeature,
10
+ ContCats,
11
+ TrainDataframes,
12
+ CategorizedFeature,
13
+ )
14
+ from champions.model.filter import CombineFilter, Filter, SingleFilter
15
+ from champions.model.settings import TrainSettings
16
+ from champions.model.champions import Champion, Spore
17
+ import polars as pl
18
+
19
+
20
+ import logging
21
+
22
+ from champions.service.darkwing import Darkwing
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class Train(BaseModel):
    """Train one champion (a list of leaf ``Spore`` objects) per round and target category.

    A champion is grown recursively: every node reads the training data that
    matches the filters collected on the path so far, categorizes the
    continuous features, picks the (possibly combined) feature with the
    largest ``calc_diff()`` and splits into a left and a right child.
    """

    # Data card describing the target and the training features.
    dc: DataCard
    # Training settings (rounds, depth, category counts, persistence helpers).
    settings: TrainSettings
    # Data access helper; a default one is built from ``dc`` when omitted.
    darkwing: Optional[Darkwing] = None

    def model_post_init(self, __context) -> None:
        """Create the default ``Darkwing`` when the caller did not supply one."""
        # Only fill in the default: an explicitly passed instance must not be
        # silently replaced.
        if self.darkwing is None:
            self.darkwing = Darkwing(dc=self.dc)
        super().model_post_init(__context)

    def run(self):
        """Train and save a champion for every round and target category.

        Combinations for which ``settings.champion_exists`` reports an existing
        champion are skipped.
        """
        logger.info("start training")
        for n in range(self.settings.n):
            for cat in self.dc.target.values:
                if self.settings.champion_exists(target=cat, n=n):
                    logger.info(f"tree {n} for label {cat} already exists")
                    continue

                logger.info(f"train tree {n} for label {cat}")

                train_res = Champion(
                    spore=self.train_champion(
                        target_filter=self.gen_target_filter(cat=cat),
                        path_filter=[],
                    ),
                    target=cat,
                )
                self.settings.save_champion(champion=train_res, n=n)

    def gen_target_filter(self, cat) -> Filter:
        """Return a non-inverted filter matching rows whose target feature equals ``cat``."""
        return Filter(
            combine=[
                CombineFilter(
                    combine=[
                        SingleFilter(
                            feat_name=self.dc.target.feature_name,
                            operator="=",
                            value=cat,
                        )
                    ]
                )
            ],
            invert=False,
        )

    def train_champion(
        self, target_filter: Filter, path_filter: list[Filter], depth: str = ""
    ) -> list[Spore]:
        """Recursively grow the tree below the node described by ``path_filter``.

        Args:
            target_filter: Filter selecting the target rows.
            path_filter: Filters collected on the path from the root to this node.
            depth: Path marker made of ``"l"`` / ``"r"`` characters; its length
                is the depth of the node.

        Returns:
            The leaf spores of the subtree, left subtree first.
        """
        train_df = self.darkwing.read_akt_train(
            # NOTE(review): the keyword is spelled "targer_filter" here; it has
            # to match Darkwing.read_akt_train, which is not visible in this file.
            targer_filter=target_filter,
            train_settings=self.settings,
            akt_filters=path_filter,
        )

        def final_spore() -> list[Spore]:
            score = train_df.score()
            logger.info(
                f"final spore depth: {depth}, {score} non target {train_df.non_target_df_size} target: {train_df.target_df_size}"
            )
            return [
                Spore(
                    cut=[fil.sql() for fil in path_filter],
                    score=score,
                    depth=depth,
                )
            ]

        if train_df.is_final_size() or len(depth) >= self.settings.max_depth:
            return final_spore()

        self.cater(train_df=train_df)
        if not train_df.train_features:
            # No feature is available to split on (e.g. every categorized
            # feature was rejected), so the node cannot be split any further.
            return final_spore()

        left_filter, right_filter = self.counter(train_df=train_df)
        left_spores = self.train_champion(
            target_filter=target_filter,
            path_filter=path_filter + [left_filter],
            depth=depth + "l",
        )
        right_spores = self.train_champion(
            target_filter=target_filter,
            path_filter=path_filter + [right_filter],
            depth=depth + "r",
        )
        return left_spores + right_spores

    def counter(self, train_df: TrainDataframes) -> tuple[Filter, Filter]:
        """Pick the feature with the largest ``calc_diff()`` and return its split.

        Starts from the best single feature, then tries combinations of 2 up to
        ``settings.n_dims`` features. When ``settings.calcs_per_dim`` is set, at
        most that many combinations are evaluated per dimension.

        Returns:
            The ``(left, right)`` filters of the best feature.

        Raises:
            IndexError: If ``train_df.train_features`` is empty.
        """
        sorted_train_feats = sorted(
            train_df.train_features,
            key=lambda x: x.calc_diff(),
            reverse=True,
        )
        if not sorted_train_feats:
            raise IndexError("no train features available to derive a split from")

        best_feat = sorted_train_feats[0]
        best_feat_diff = best_feat.calc_diff()
        limit = self.settings.calcs_per_dim

        for dim in range(2, self.settings.n_dims + 1):
            combos = itertools.combinations(sorted_train_feats, r=dim)
            for n_calcs, comb in enumerate(combos, start=1):
                akt_feature = CombinedCategorizedFeature(
                    comb,
                    non_target_size=train_df.n_count_non_target,
                    target_size=train_df.n_count_target,
                )
                akt_diff = akt_feature.calc_diff()
                if akt_diff > best_feat_diff:
                    best_feat = akt_feature
                    best_feat_diff = akt_diff

                # Stop once exactly ``limit`` combinations were evaluated.
                if limit and n_calcs >= limit:
                    break

        return best_feat.get_left_right_filter()

    def cater(self, train_df: TrainDataframes):
        """Categorize every categorical and continuous training feature of the data card."""
        for feat in self.dc.train_cat_feature:
            self.cat_cater(feat=feat, train_df=train_df)
        for feat in self.dc.train_con_feature:
            self.cont_cater(feat=feat, train_df=train_df)

    def cat_cater(self, feat: Feature, train_df: TrainDataframes):
        """Categorize a categorical feature (not implemented yet).

        Raises:
            NotImplementedError: Always.
        """
        # The message is German and means roughly "does not exist yet".
        raise NotImplementedError("gibt noch net")

    def get_n_quantiles(self, series: pl.Series, n: int) -> ContCats:
        """Build cut points from the ``i / n`` quantiles (``i = 1 .. n-1``) of ``series``.

        Quantile values that were already collected are skipped, so the result
        may hold fewer than ``n - 1`` cuts. There is always one more label than
        cuts; labels are ``"0"``, ``"1"``, ...
        """
        cuts = []
        for i in range(1, n):
            quantile = series.quantile(quantile=i / n, interpolation="lower")
            if quantile not in cuts:
                cuts.append(quantile)
        labels = [f"{idx}" for idx in range(len(cuts) + 1)]
        return ContCats(cuts=cuts, labels=labels, feat_name=series.name)

    def cont_cater(self, feat: Feature, train_df: TrainDataframes):
        """Append the best categorization of a continuous feature to ``train_df.train_features``.

        Tries 2 up to ``settings.n_cat`` categories and keeps the variant with
        the largest ``calc_diff()``; the earlier variant wins ties. Nothing is
        appended when ``is_diff_to_low()`` is true for the winner.
        """
        categorized_feature = self.cont_cater_impl(feat=feat, train_df=train_df, n=2)
        best_diff = categorized_feature.calc_diff()

        for n in range(3, self.settings.n_cat + 1):
            cat_feature = self.cont_cater_impl(feat=feat, train_df=train_df, n=n)
            cat_diff = cat_feature.calc_diff()
            if cat_diff > best_diff:
                categorized_feature = cat_feature
                best_diff = cat_diff

        if not categorized_feature.is_diff_to_low():
            train_df.train_features.append(categorized_feature)

    def cont_cater_impl(
        self, feat: Feature, train_df: TrainDataframes, n: int
    ) -> CategorizedFeature:
        """Categorize a continuous feature into up to ``n`` bins via weighted quantiles.

        For every ``i / n`` (``i = 1 .. n-1``) the cut is taken where the
        cumulative ``weight`` of the rows, sorted by the feature value, first
        reaches that level, linearly interpolated between the two neighbouring
        values. Duplicate cut values are skipped.
        """
        # NOTE(review): comparing the cumulative weight with i / n assumes the
        # "weight" column of df_group sums to 1 — confirm in TrainDataframes.
        df_sorted = train_df.df_group[[feat.name, "weight"]].sort(feat.name)
        values = df_sorted[feat.name]
        cumulative_weights = df_sorted["weight"].cum_sum()

        cuts = []
        for i in range(1, n):
            quantile = i / n
            # First row whose cumulative weight reaches the quantile level.
            index = (cumulative_weights >= quantile).arg_true()[0]

            if index > 0:
                weight_below = cumulative_weights[index - 1]
                value_below = values[index - 1]
                # The denominator is positive: index is the first row reaching
                # the level, so the previous cumulative weight is strictly lower.
                fraction = (quantile - weight_below) / (
                    cumulative_weights[index] - weight_below
                )
                value = value_below + fraction * (values[index] - value_below)
            else:
                value = values[0]

            if value not in cuts:
                cuts.append(value)

        labels = [f"{idx}" for idx in range(len(cuts) + 1)]
        return train_df.create_categorized_features(
            feat=feat,
            cuts=ContCats(cuts=cuts, labels=labels, feat_name=feat.name),
        )
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.4
2
+ Name: champions
3
+ Version: 0.9.0
4
+ Summary: Add your description here
5
+ Project-URL: Homepage, https://gitlab.com/gwhe/champions
6
+ Project-URL: Issues, https://gitlab.com/gwhe/champions/-/issues
7
+ Author-email: swayand <stefan.wayand@gmail.com>
8
+ License-Expression: GPL-3.0-or-later
9
+ License-File: LICENSE
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.13
13
+ Requires-Dist: altair>=5.5.0
14
+ Requires-Dist: duckdb>=1.3.2
15
+ Requires-Dist: polars>=1.32.0
16
+ Requires-Dist: pyarrow>=19.0.1
17
+ Requires-Dist: pydantic>=2.10.6
18
+ Requires-Dist: pyyaml>=6.0.2
19
+ Requires-Dist: scikit-learn>=1.7.1
20
+ Requires-Dist: typer>=0.15.2
21
+ Requires-Dist: vl-convert-python>=1.8.0
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Champions
25
+
26
+
27
+ ## Status
28
+
29
+ Disclaimer: the documentation still has to be written.
30
+
31
+ ## Initial Thoughts
32
+
33
+ The idea of this project is to perform binary classification. It departs from some well-established Machine Learning methods in order to test some new ones. Some of these departures are purely technical, while others are more fundamental. This is a complete rewrite of my first private package, which I called Pilze (mushrooms). In German, "Champions" and "Champignons" sound similar, which is why I liked the name. But "Mushrooms" explains the project's purpose very well. Mushrooms are neither plants nor animals, but somewhere in between. In ML, it's all about correlations, and in a very abstract way, neural networks and tree-based algorithms solve this problem from opposite directions. Neural networks look at a single event and try to learn all correlations in one stage, while tree-based ML algorithms look at all (or many) events and try to find the correlations stage by stage. SVMs are a step further than trees, but in my personal opinion, they focus on the wrong thing. They focus on separation and not on the correlation of the individual classes. The initial idea of this project was to build a tree in which each node makes not just one cut but a set of cuts in more than one dimension. When it comes to implementations, the obvious choice is to use sklearn. It is one of the best software projects ever built, in my opinion. But some things are really annoying, and overall, it is hard to fit new ideas into this very strict framework.
@@ -0,0 +1,17 @@
1
+ champions/__init__.py,sha256=Wbt9W5dYpbtuqqoh7K24jtGSMZSrzY5t1fnql0k6_No,641
2
+ champions/cli.py,sha256=iXfoAxS9-oDyK1gsDonT7O0lj-SgK5rBRnNKiF44rWQ,991
3
+ champions/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ champions/model/champions.py,sha256=2avlOipAR78d1N-xqExWT0a4WdDKEoF_wwcrPGPNJn0,799
5
+ champions/model/datacard.py,sha256=I2yhhsw5JN9Au6sKIJvvfAYmHzflsKh4eiOyTb9XlsY,1175
6
+ champions/model/dataframes.py,sha256=nGdec3ubQI5z-Xsf_aljtN1K3PtGElKy1Ms7akMjEZE,7516
7
+ champions/model/filter.py,sha256=T-M-W9rYMY_Ool7N-CbDGe3ecPMU2gFrSLS5NDVPrjM,1364
8
+ champions/model/settings.py,sha256=qjLw9fUx2KScYzsh3-53gEcE6tdc0HI-vH0di8Uf6Jg,2118
9
+ champions/service/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ champions/service/darkwing.py,sha256=hXEzXhsOTUB0qEmIaWySXMiGkNfWjq5Hui1prAQEfD8,3189
11
+ champions/service/eval.py,sha256=lxy5jkExMTpJdtNHeEQj3jYn4DezWnhv5urMwYwCnB8,4689
12
+ champions/service/train.py,sha256=lJP01OikyjnLy4MgmBpool7IuNnzd9TK7zT53W4LOWo,7329
13
+ champions-0.9.0.dist-info/METADATA,sha256=9YbdlkbofJ3DsOI8Otf1fsXmAJegZwGNIp-eGhHpfng,2274
14
+ champions-0.9.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
15
+ champions-0.9.0.dist-info/entry_points.txt,sha256=Jrat4iyGkU0OttDrUreuSo9fbaA5iJSsV2TPdiy_Sok,48
16
+ champions-0.9.0.dist-info/licenses/LICENSE,sha256=1neeRA1TYb6baPpggguJlo8nWoiIlnBvhF_Sb1DTWhs,35139
17
+ champions-0.9.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ champions = champions.cli:app