onekit 1.3.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {onekit-1.3.0 → onekit-1.4.0}/PKG-INFO +3 -4
- {onekit-1.3.0 → onekit-1.4.0}/README.md +1 -1
- {onekit-1.3.0 → onekit-1.4.0}/pyproject.toml +5 -3
- onekit-1.4.0/src/onekit/sklearnkit.py +153 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/sparkkit.py +4 -2
- {onekit-1.3.0 → onekit-1.4.0}/LICENSE +0 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/__init__.py +0 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/mathkit.py +0 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/numpykit.py +0 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/optfunckit.py +0 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/pandaskit.py +0 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/pythonkit.py +0 -0
- {onekit-1.3.0 → onekit-1.4.0}/src/onekit/vizkit.py +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: onekit
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: All-in-One Python Kit.
|
|
5
5
|
Home-page: https://github.com/estripling/onekit
|
|
6
6
|
License: BSD 3-Clause
|
|
7
7
|
Keywords: onekit
|
|
8
8
|
Author: Eugen Stripling
|
|
9
9
|
Author-email: estripling042@gmail.com
|
|
10
|
-
Requires-Python: >=3.8
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
11
|
Classifier: License :: Other/Proprietary License
|
|
12
12
|
Classifier: Programming Language :: Python :: 3
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.9
|
|
@@ -15,7 +15,6 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
17
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
19
18
|
Requires-Dist: pytz (>=2024.1,<2025.0)
|
|
20
19
|
Requires-Dist: toolz (>=0.12.0,<0.13.0)
|
|
21
20
|
Project-URL: Documentation, https://onekit.readthedocs.io/en/stable/
|
|
@@ -46,7 +45,7 @@ All-in-One Python Kit:
|
|
|
46
45
|
|
|
47
46
|
## Installation
|
|
48
47
|
|
|
49
|
-
`onekit` is available on [PyPI](https://pypi.org/project/onekit/) for Python 3.8+:
|
|
48
|
+
`onekit` is available on [PyPI](https://pypi.org/project/onekit/) for Python 3.9+:
|
|
50
49
|
|
|
51
50
|
```console
|
|
52
51
|
pip install onekit
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "onekit"
|
|
3
|
-
version = "1.3.0"
|
|
3
|
+
version = "1.4.0"
|
|
4
4
|
description = "All-in-One Python Kit."
|
|
5
5
|
authors = ["Eugen Stripling <estripling042@gmail.com>"]
|
|
6
6
|
license = "BSD 3-Clause"
|
|
@@ -10,7 +10,6 @@ documentation = "https://onekit.readthedocs.io/en/stable/"
|
|
|
10
10
|
keywords = ["onekit"]
|
|
11
11
|
classifiers = [
|
|
12
12
|
"Programming Language :: Python :: 3 :: Only",
|
|
13
|
-
"Programming Language :: Python :: 3.8",
|
|
14
13
|
"Programming Language :: Python :: 3.9",
|
|
15
14
|
"Programming Language :: Python :: 3.10",
|
|
16
15
|
"Programming Language :: Python :: 3.11",
|
|
@@ -18,7 +17,7 @@ classifiers = [
|
|
|
18
17
|
]
|
|
19
18
|
|
|
20
19
|
[tool.poetry.dependencies]
|
|
21
|
-
python = ">=3.8"
|
|
20
|
+
python = ">=3.9"
|
|
22
21
|
toolz = "^0.12.0"
|
|
23
22
|
pytz = "^2024.1"
|
|
24
23
|
|
|
@@ -50,6 +49,9 @@ python-semantic-release = "^8.3.0"
|
|
|
50
49
|
[tool.poetry.group.pandaskit.dependencies]
|
|
51
50
|
pandas = ">=0.23.2"
|
|
52
51
|
|
|
52
|
+
[tool.poetry.group.sklearnkit.dependencies]
|
|
53
|
+
scikit-learn = ">=1.3"
|
|
54
|
+
|
|
53
55
|
[tool.poetry.group.sparkkit.dependencies]
|
|
54
56
|
pyspark = "3.1.1"
|
|
55
57
|
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
from typing import (
|
|
2
|
+
Optional,
|
|
3
|
+
Union,
|
|
4
|
+
)
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import numpy.typing as npt
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from pandas import DataFrame as PandasDF
|
|
10
|
+
from sklearn import metrics
|
|
11
|
+
from sklearn.utils import validation
|
|
12
|
+
|
|
13
|
+
__all__ = (
|
|
14
|
+
"precision_given_recall_score",
|
|
15
|
+
"threshold_summary",
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
ArrayLike = npt.ArrayLike
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def precision_given_recall_score(
    y_true: ArrayLike,
    y_score: ArrayLike,
    *,
    min_recall: float,
    pos_label: Optional[Union[int, str]] = None,
) -> float:
    """Return the best precision at the lowest recall that meets a target.

    The threshold summary of ``y_true`` versus ``y_score`` is restricted to
    the rows whose recall is at least ``min_recall``. Among those rows, the
    ones with the smallest recall are kept and their highest precision is
    returned. If no row satisfies the recall requirement, ``0.0`` is returned.

    Parameters
    ----------
    y_true : ArrayLike
        True labels, forwarded to :func:`threshold_summary`.
    y_score : ArrayLike
        Classifier scores, forwarded to :func:`threshold_summary`.
    min_recall : float
        Required minimum recall level; must lie in the interval (0, 1].
    pos_label : int or str, optional
        Label of the positive class, forwarded to :func:`threshold_summary`.

    Returns
    -------
    float
        Precision at the smallest empirical recall that is ``>= min_recall``.

    Raises
    ------
    ValueError
        If ``min_recall`` is not in (0, 1].

    Examples
    --------
    >>> import onekit.sklearnkit as slk
    >>> y_true = [0, 1, 1, 1, 0, 0, 0, 1]
    >>> y_score = [0.1, 0.4, 0.35, 0.8, 0.5, 0.2, 0.75, 0.5]
    >>> slk.precision_given_recall_score(y_true, y_score, min_recall=0.7)
    0.6
    """
    is_valid_recall = 0 < min_recall <= 1
    if not is_valid_recall:
        raise ValueError(f"{min_recall=} - must be a float in (0, 1]")

    summary = threshold_summary(y_true, y_score, pos_label=pos_label)
    candidates = summary.filter(items=["precision", "recall"]).query(
        f"recall >= {min_recall}"
    )

    # No threshold reaches the requested recall level.
    if candidates.empty:
        return float(0)

    # Several thresholds can share the lowest qualifying recall; report the
    # best precision among them.
    lowest_recall = candidates["recall"].min()
    at_lowest_recall = candidates.query(f"recall == {lowest_recall}")
    return float(at_lowest_recall["precision"].max())
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def threshold_summary(
    y_true: ArrayLike,
    y_score: ArrayLike,
    *,
    pos_label: Optional[Union[int, str]] = None,
) -> PandasDF:
    """Summarize binary classification metrics for each score threshold.

    For every threshold returned by
    :func:`sklearn.metrics.precision_recall_curve` (plus a final ``inf``
    threshold), the confusion-matrix counts are computed by predicting
    positive whenever ``y_score >= threshold``; several metrics are then
    derived from those counts.

    Parameters
    ----------
    y_true : ArrayLike
        True labels; validated with ``sklearn.utils.validation.column_or_1d``
        and required to be finite.
    y_score : ArrayLike
        Classifier scores; validated the same way and required to have the
        same length as ``y_true``.
    pos_label : int or str, optional
        Label of the positive class. When ``None``, it is resolved by
        scikit-learn's positive-label consistency check.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``threshold``,
        ``predicted_positive``, ``true_positive``, ``false_positive``,
        ``false_negative``, ``true_negative``, ``precision``, ``recall``,
        ``f1``, ``accuracy``, ``balanced_accuracy``, and
        ``matthews_corrcoef``.

    Notes
    -----
    - Support for binary classification only
    - Assumption: classifier returns scores
    - First values correspond to the edge case where everything is predicted positive
    - Last values correspond to the edge case where everything is predicted negative
    - ``f1`` is 0 (not NaN) when precision and recall are both 0

    Examples
    --------
    >>> import onekit.sklearnkit as slk
    >>> y_true = [0, 1, 1, 1, 0, 0, 0, 1]
    >>> y_score = [0.1, 0.4, 0.35, 0.8, 0.5, 0.2, 0.75, 0.5]
    >>> with pd.option_context("display.float_format", "{:.2f}".format):
    ...     slk.threshold_summary(y_true, y_score).T
                          0    1    2    3    4    5    6    7
    threshold          0.10 0.20 0.35 0.40 0.50 0.75 0.80  inf
    predicted_positive 8.00 7.00 6.00 5.00 4.00 2.00 1.00 0.00
    true_positive      4.00 4.00 4.00 3.00 2.00 1.00 1.00 0.00
    false_positive     4.00 3.00 2.00 2.00 2.00 1.00 0.00 0.00
    false_negative     0.00 0.00 0.00 1.00 2.00 3.00 3.00 4.00
    true_negative      0.00 1.00 2.00 2.00 2.00 3.00 4.00 4.00
    precision          0.50 0.57 0.67 0.60 0.50 0.50 1.00 1.00
    recall             1.00 1.00 1.00 0.75 0.50 0.25 0.25 0.00
    f1                 0.67 0.73 0.80 0.67 0.50 0.33 0.40 0.00
    accuracy           0.50 0.62 0.75 0.62 0.50 0.50 0.62 0.50
    balanced_accuracy  0.50 0.62 0.75 0.62 0.50 0.50 0.62 0.50
    matthews_corrcoef   NaN 0.38 0.58 0.26 0.00 0.00 0.38  NaN
    """
    y = validation.column_or_1d(y_true)
    s = validation.column_or_1d(y_score)
    validation.check_consistent_length(y, s)
    validation.assert_all_finite(y)
    validation.assert_all_finite(s)
    # NOTE(review): underscore-prefixed scikit-learn helper, i.e. not public
    # API - confirm it exists in every supported scikit-learn release.
    pos_label = validation._check_pos_label_consistency(pos_label, y)

    precision, recall, thresholds = metrics.precision_recall_curve(
        y_true=y,
        y_score=s,
        pos_label=pos_label,
        sample_weight=None,
        drop_intermediate=False,
    )

    # Guard the harmonic mean against 0 / 0: when precision and recall are
    # both zero, F1 is 0 rather than NaN. NaN inputs compare unequal to 0, so
    # they are still divided and keep propagating as NaN.
    pr_sum = precision + recall
    f1 = np.divide(
        2 * (precision * recall),
        pr_sum,
        out=np.zeros_like(pr_sum, dtype=float),
        where=pr_sum != 0,
    )

    is_true_pos = y == pos_label
    is_true_neg = y != pos_label

    def is_pred_pos(t: float) -> np.ndarray:
        return s >= t

    def is_pred_neg(t: float) -> np.ndarray:
        return s < t

    return (
        # The appended ``inf`` threshold lines the threshold column up with
        # the precision/recall arrays assigned below and yields the
        # "everything predicted negative" row.
        pd.DataFrame(np.append(thresholds, np.inf), columns=["t"])
        .assign(
            pp=lambda df: df.t.map(lambda t: is_pred_pos(t).sum()),
            tp=lambda df: df.t.map(lambda t: (is_pred_pos(t) & is_true_pos).sum()),
            fp=lambda df: df.t.map(lambda t: (is_pred_pos(t) & is_true_neg).sum()),
            fn=lambda df: df.t.map(lambda t: (is_pred_neg(t) & is_true_pos).sum()),
            tn=lambda df: df.t.map(lambda t: (is_pred_neg(t) & is_true_neg).sum()),
            precision=precision,
            recall=recall,
            f1=f1,
            acc=lambda df: (df.tp + df.tn) / (df.tp + df.tn + df.fp + df.fn),
            bacc=lambda df: 0.5 * (df.tp / (df.tp + df.fn) + df.tn / (df.tn + df.fp)),
            # A zero denominator (an empty row or column of the confusion
            # matrix) leaves the coefficient as NaN, as in the example above.
            mcc=lambda df: np.true_divide(
                (df.tp * df.tn - df.fp * df.fn),
                np.sqrt(
                    (df.tp + df.fp)
                    * (df.tp + df.fn)
                    * (df.tn + df.fp)
                    * (df.tn + df.fn)
                ),
            ),
        )
        .rename(
            columns={
                "t": "threshold",
                "pp": "predicted_positive",
                "tp": "true_positive",
                "fp": "false_positive",
                "fn": "false_negative",
                "tn": "true_negative",
                "acc": "accuracy",
                "bacc": "balanced_accuracy",
                "mcc": "matthews_corrcoef",
            },
        )
    )
|
|
@@ -1075,9 +1075,11 @@ def select_col_types(df: SparkDF, /, *col_types: T.DataType) -> List[str]:
|
|
|
1075
1075
|
>>> sk.select_col_types(df, T.IntegerType, T.LongType)
|
|
1076
1076
|
['int', 'long']
|
|
1077
1077
|
"""
|
|
1078
|
+
valid_types = {v.typeName() for k, v in T.__dict__.items() if k.endswith("Type")}
|
|
1078
1079
|
col_types = tuple(pk.flatten(col_types))
|
|
1079
|
-
|
|
1080
|
-
|
|
1080
|
+
for col_type in col_types:
|
|
1081
|
+
if not hasattr(col_type, "typeName") or col_type.typeName() not in valid_types:
|
|
1082
|
+
raise TypeError(f"{col_type=} - must be a valid data type: {valid_types}")
|
|
1081
1083
|
return [c for c in df.columns if isinstance(df.schema[c].dataType, col_types)]
|
|
1082
1084
|
|
|
1083
1085
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|