onekit 1.3.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: onekit
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: All-in-One Python Kit.
5
5
  Home-page: https://github.com/estripling/onekit
6
6
  License: BSD 3-Clause
7
7
  Keywords: onekit
8
8
  Author: Eugen Stripling
9
9
  Author-email: estripling042@gmail.com
10
- Requires-Python: >=3.8.1
10
+ Requires-Python: >=3.9
11
11
  Classifier: License :: Other/Proprietary License
12
12
  Classifier: Programming Language :: Python :: 3
13
13
  Classifier: Programming Language :: Python :: 3.9
@@ -15,7 +15,6 @@ Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3 :: Only
18
- Classifier: Programming Language :: Python :: 3.8
19
18
  Requires-Dist: pytz (>=2024.1,<2025.0)
20
19
  Requires-Dist: toolz (>=0.12.0,<0.13.0)
21
20
  Project-URL: Documentation, https://onekit.readthedocs.io/en/stable/
@@ -46,7 +45,7 @@ All-in-One Python Kit:
46
45
 
47
46
  ## Installation
48
47
 
49
- `onekit` is available on [PyPI](https://pypi.org/project/onekit/) for Python 3.8+:
48
+ `onekit` is available on [PyPI](https://pypi.org/project/onekit/) for Python 3.9+:
50
49
 
51
50
  ```console
52
51
  pip install onekit
@@ -22,7 +22,7 @@ All-in-One Python Kit:
22
22
 
23
23
  ## Installation
24
24
 
25
- `onekit` is available on [PyPI](https://pypi.org/project/onekit/) for Python 3.8+:
25
+ `onekit` is available on [PyPI](https://pypi.org/project/onekit/) for Python 3.9+:
26
26
 
27
27
  ```console
28
28
  pip install onekit
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "onekit"
3
- version = "1.3.0"
3
+ version = "1.4.0"
4
4
  description = "All-in-One Python Kit."
5
5
  authors = ["Eugen Stripling <estripling042@gmail.com>"]
6
6
  license = "BSD 3-Clause"
@@ -10,7 +10,6 @@ documentation = "https://onekit.readthedocs.io/en/stable/"
10
10
  keywords = ["onekit"]
11
11
  classifiers = [
12
12
  "Programming Language :: Python :: 3 :: Only",
13
- "Programming Language :: Python :: 3.8",
14
13
  "Programming Language :: Python :: 3.9",
15
14
  "Programming Language :: Python :: 3.10",
16
15
  "Programming Language :: Python :: 3.11",
@@ -18,7 +17,7 @@ classifiers = [
18
17
  ]
19
18
 
20
19
  [tool.poetry.dependencies]
21
- python = ">=3.8.1"
20
+ python = ">=3.9"
22
21
  toolz = "^0.12.0"
23
22
  pytz = "^2024.1"
24
23
 
@@ -50,6 +49,9 @@ python-semantic-release = "^8.3.0"
50
49
  [tool.poetry.group.pandaskit.dependencies]
51
50
  pandas = ">=0.23.2"
52
51
 
52
+ [tool.poetry.group.sklearnkit.dependencies]
53
+ scikit-learn = ">=1.3"
54
+
53
55
  [tool.poetry.group.sparkkit.dependencies]
54
56
  pyspark = "3.1.1"
55
57
 
@@ -0,0 +1,153 @@
1
+ from typing import (
2
+ Optional,
3
+ Union,
4
+ )
5
+
6
+ import numpy as np
7
+ import numpy.typing as npt
8
+ import pandas as pd
9
+ from pandas import DataFrame as PandasDF
10
+ from sklearn import metrics
11
+ from sklearn.utils import validation
12
+
13
+ __all__ = (
14
+ "precision_given_recall_score",
15
+ "threshold_summary",
16
+ )
17
+
18
+
19
+ ArrayLike = npt.ArrayLike
20
+
21
+
22
def precision_given_recall_score(
    y_true: ArrayLike,
    y_score: ArrayLike,
    *,
    min_recall: float,
    pos_label: Optional[Union[int, str]] = None,
) -> float:
    """Compute precision given a desired minimum recall level.

    Among all thresholds reported by ``threshold_summary`` whose recall is at
    least ``min_recall``, the smallest such recall is selected and the highest
    precision observed at that recall is returned.

    Parameters
    ----------
    y_true : ArrayLike
        True labels; forwarded to ``threshold_summary``.
    y_score : ArrayLike
        Classifier scores; forwarded to ``threshold_summary``.
    min_recall : float
        Minimum recall level, must lie in the half-open interval (0, 1].
    pos_label : int or str, optional
        Label of the positive class; forwarded to ``threshold_summary``.

    Returns
    -------
    float
        Precision at the lowest empirical recall satisfying ``min_recall``,
        or 0.0 when no threshold reaches the requested recall.

    Raises
    ------
    ValueError
        If ``min_recall`` is not in (0, 1].

    Examples
    --------
    >>> import onekit.sklearnkit as slk
    >>> y_true = [0, 1, 1, 1, 0, 0, 0, 1]
    >>> y_score = [0.1, 0.4, 0.35, 0.8, 0.5, 0.2, 0.75, 0.5]
    >>> slk.precision_given_recall_score(y_true, y_score, min_recall=0.7)
    0.6
    """
    # Written as a negated chained comparison so that NaN is rejected as well.
    is_valid_recall = 0 < min_recall <= 1
    if not is_valid_recall:
        raise ValueError(f"{min_recall=} - must be a float in (0, 1]")

    summary = threshold_summary(y_true, y_score, pos_label=pos_label)
    candidates = summary.filter(items=["precision", "recall"]).query(
        f"recall >= {min_recall}"
    )

    # No threshold reaches the requested recall level.
    if candidates.empty:
        return float(0)

    # Lowest recall that still satisfies the requirement; ties on recall are
    # resolved in favor of the highest precision.
    min_empirical_recall = candidates["recall"].min()
    at_lowest_recall = candidates.query(f"recall == {min_empirical_recall}")
    return float(at_lowest_recall["precision"].max())
55
+
56
+
57
def threshold_summary(
    y_true: ArrayLike,
    y_score: ArrayLike,
    *,
    pos_label: Optional[Union[int, str]] = None,
) -> PandasDF:
    """Threshold summary.

    Build one row per decision threshold with the confusion-matrix counts and
    the metrics derived from them. A sample is predicted positive when its
    score is greater than or equal to the threshold.

    Parameters
    ----------
    y_true : ArrayLike
        True labels (1d or column vector).
    y_score : ArrayLike
        Classifier scores, same length as ``y_true``.
    pos_label : int or str, optional
        Label of the positive class. When None, it is resolved by
        scikit-learn's pos_label consistency check.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``threshold``, ``predicted_positive``,
        ``true_positive``, ``false_positive``, ``false_negative``,
        ``true_negative``, ``precision``, ``recall``, ``f1``, ``accuracy``,
        ``balanced_accuracy``, ``matthews_corrcoef``.

    Notes
    -----
    - Support for binary classification only
    - Assumption: classifier returns scores
    - First values correspond to the edge case where everything is predicted positive
    - Last values correspond to the edge case where everything is predicted negative
    - ``f1`` is 0 when precision and recall are both 0
    - ``matthews_corrcoef`` is NaN when its denominator is 0

    Examples
    --------
    >>> import onekit.sklearnkit as slk
    >>> y_true = [0, 1, 1, 1, 0, 0, 0, 1]
    >>> y_score = [0.1, 0.4, 0.35, 0.8, 0.5, 0.2, 0.75, 0.5]
    >>> with pd.option_context("display.float_format", "{:.2f}".format):
    ...     slk.threshold_summary(y_true, y_score).T
                          0    1    2    3    4    5    6    7
    threshold          0.10 0.20 0.35 0.40 0.50 0.75 0.80  inf
    predicted_positive 8.00 7.00 6.00 5.00 4.00 2.00 1.00 0.00
    true_positive      4.00 4.00 4.00 3.00 2.00 1.00 1.00 0.00
    false_positive     4.00 3.00 2.00 2.00 2.00 1.00 0.00 0.00
    false_negative     0.00 0.00 0.00 1.00 2.00 3.00 3.00 4.00
    true_negative      0.00 1.00 2.00 2.00 2.00 3.00 4.00 4.00
    precision          0.50 0.57 0.67 0.60 0.50 0.50 1.00 1.00
    recall             1.00 1.00 1.00 0.75 0.50 0.25 0.25 0.00
    f1                 0.67 0.73 0.80 0.67 0.50 0.33 0.40 0.00
    accuracy           0.50 0.62 0.75 0.62 0.50 0.50 0.62 0.50
    balanced_accuracy  0.50 0.62 0.75 0.62 0.50 0.50 0.62 0.50
    matthews_corrcoef   NaN 0.38 0.58 0.26 0.00 0.00 0.38  NaN
    """
    y = validation.column_or_1d(y_true)
    s = validation.column_or_1d(y_score)
    validation.check_consistent_length(y, s)
    validation.assert_all_finite(y)
    validation.assert_all_finite(s)
    # NOTE: underscore-prefixed (private) scikit-learn helper.
    pos_label = validation._check_pos_label_consistency(pos_label, y)

    precision, recall, thresholds = metrics.precision_recall_curve(
        y_true=y,
        y_score=s,
        pos_label=pos_label,
        sample_weight=None,
        drop_intermediate=False,
    )

    is_true_pos = y == pos_label
    is_true_neg = y != pos_label

    def is_pred_pos(t: float) -> np.ndarray:
        return s >= t

    def is_pred_neg(t: float) -> np.ndarray:
        return s < t

    # Harmonic mean of precision and recall. When both are 0 the plain formula
    # evaluates 0/0 (NaN plus a RuntimeWarning) although F1 is 0 in that case,
    # so the division is skipped there and the pre-filled 0 is kept. NaN inputs
    # still propagate because NaN != 0 holds.
    pr_sum = precision + recall
    f1 = np.divide(
        2 * (precision * recall),
        pr_sum,
        out=np.zeros_like(pr_sum, dtype=float),
        where=pr_sum != 0,
    )

    return (
        # The appended inf threshold is the row where nothing is predicted positive.
        pd.DataFrame(np.append(thresholds, np.inf), columns=["t"])
        .assign(
            pp=lambda df: df.t.map(lambda t: is_pred_pos(t).sum()),
            tp=lambda df: df.t.map(lambda t: (is_pred_pos(t) & is_true_pos).sum()),
            fp=lambda df: df.t.map(lambda t: (is_pred_pos(t) & is_true_neg).sum()),
            fn=lambda df: df.t.map(lambda t: (is_pred_neg(t) & is_true_pos).sum()),
            tn=lambda df: df.t.map(lambda t: (is_pred_neg(t) & is_true_neg).sum()),
            precision=precision,
            recall=recall,
            f1=f1,
            acc=lambda df: (df.tp + df.tn) / (df.tp + df.tn + df.fp + df.fn),
            bacc=lambda df: 0.5 * (df.tp / (df.tp + df.fn) + df.tn / (df.tn + df.fp)),
            # Counts are cast to float before multiplying: the product of four
            # int64 count sums silently wraps around for large sample sizes.
            mcc=lambda df: np.true_divide(
                (df.tp.astype(float) * df.tn - df.fp.astype(float) * df.fn),
                np.sqrt(
                    (df.tp + df.fp).astype(float)
                    * (df.tp + df.fn)
                    * (df.tn + df.fp)
                    * (df.tn + df.fn)
                ),
            ),
        )
        .rename(
            columns={
                "t": "threshold",
                "pp": "predicted_positive",
                "tp": "true_positive",
                "fp": "false_positive",
                "fn": "false_negative",
                "tn": "true_negative",
                "acc": "accuracy",
                "bacc": "balanced_accuracy",
                "mcc": "matthews_corrcoef",
            },
        )
    )
@@ -1075,9 +1075,11 @@ def select_col_types(df: SparkDF, /, *col_types: T.DataType) -> List[str]:
1075
1075
  >>> sk.select_col_types(df, T.IntegerType, T.LongType)
1076
1076
  ['int', 'long']
1077
1077
  """
1078
+ valid_types = {v.typeName() for k, v in T.__dict__.items() if k.endswith("Type")}
1078
1079
  col_types = tuple(pk.flatten(col_types))
1079
- if not all(isinstance(col_type, T.DataTypeSingleton) for col_type in col_types):
1080
- raise TypeError(f"{col_types=} - must be a data type of pyspark.sql.types")
1080
+ for col_type in col_types:
1081
+ if not hasattr(col_type, "typeName") or col_type.typeName() not in valid_types:
1082
+ raise TypeError(f"{col_type=} - must be a valid data type: {valid_types}")
1081
1083
  return [c for c in df.columns if isinstance(df.schema[c].dataType, col_types)]
1082
1084
 
1083
1085
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes