upgini 1.1.266a3254.post2__py3-none-any.whl → 1.1.267a3254.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/autofe/date.py +22 -14
- upgini/features_enricher.py +38 -6
- upgini/resource_bundle/strings.properties +4 -2
- upgini/utils/target_utils.py +18 -0
- {upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/METADATA +1 -1
- {upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/RECORD +9 -9
- {upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/LICENSE +0 -0
- {upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/WHEEL +0 -0
- {upgini-1.1.266a3254.post2.dist-info → upgini-1.1.267a3254.post3.dist-info}/top_level.txt +0 -0
upgini/autofe/date.py
CHANGED
|
@@ -54,6 +54,9 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
54
54
|
return diff
|
|
55
55
|
|
|
56
56
|
|
|
57
|
+
# Extra aggregations keyed by name. Each value is a pair:
# (callable applied to the values, result to use when there are no values).
_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
|
|
58
|
+
|
|
59
|
+
|
|
57
60
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
58
61
|
is_binary = True
|
|
59
62
|
has_symmetry_importance = True
|
|
@@ -72,18 +75,31 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
72
75
|
|
|
73
76
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
    """Aggregate the differences between one date and a collection of dates per row.

    ``left`` is converted with ``self._convert_to_date`` using ``self.left_unit``.
    Every element of ``right`` is converted the same way using
    ``self.right_unit`` and wrapped in a ``DatetimeArray``
    (presumably each element is a list of dates -- confirm against callers).
    The element-wise difference is then passed through ``self._diff`` and
    ``self._agg`` to produce one value per row.
    """

    def _as_datetime_array(dates):
        # Convert one cell of ``right`` into a DatetimeArray.
        return pd.arrays.DatetimeArray(self._convert_to_date(dates, self.right_unit))

    base_dates = self._convert_to_date(left, self.left_unit)
    date_lists = right.apply(_as_datetime_array)

    # ``.values`` is used so the subtraction is positional rather than index-aligned.
    deltas = pd.Series(base_dates - date_lists.values)
    return deltas.apply(lambda row: self._agg(self._diff(row)))
|
|
75
81
|
|
|
76
|
-
|
|
82
|
+
def _diff(self, x):
|
|
83
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
84
|
+
return x[x > 0]
|
|
85
|
+
|
|
86
|
+
def _agg(self, x):
    """Reduce ``x`` with the aggregation named by ``self.aggregation``.

    The name is first looked up as an attribute of numpy. If numpy has no such
    attribute, ``_ext_aggregations`` is consulted, which also supplies the
    value returned for empty input. For numpy aggregations the empty-input
    value is NaN.

    Raises:
        ValueError: if the name resolves to nothing callable.
    """
    name = self.aggregation
    reducer = getattr(np, name, None)
    empty_value = np.nan

    if reducer is None and name in _ext_aggregations:
        reducer, empty_value = _ext_aggregations[name]
    elif not callable(reducer):
        raise ValueError(f"Unsupported aggregation: {self.aggregation}")

    # Empty input never reaches the reducer; the configured default is used.
    if len(x) > 0:
        return reducer(x)
    return empty_value
|
|
77
95
|
|
|
78
96
|
|
|
79
97
|
class DateListDiffBounded(DateListDiff):
|
|
80
98
|
lower_bound: Optional[int]
|
|
81
99
|
upper_bound: Optional[int]
|
|
82
|
-
inclusive: Optional[str]
|
|
83
100
|
|
|
84
101
|
def __init__(self, **data: Any) -> None:
|
|
85
102
|
if "name" not in data:
|
|
86
|
-
inclusive = data.get("inclusive")
|
|
87
103
|
lower_bound = data.get("lower_bound")
|
|
88
104
|
upper_bound = data.get("upper_bound")
|
|
89
105
|
components = [
|
|
@@ -92,18 +108,10 @@ class DateListDiffBounded(DateListDiff):
|
|
|
92
108
|
str(lower_bound if lower_bound is not None else "minusinf"),
|
|
93
109
|
str(upper_bound if upper_bound is not None else "plusinf"),
|
|
94
110
|
]
|
|
95
|
-
if inclusive:
|
|
96
|
-
components.append(inclusive)
|
|
97
111
|
components.append(data.get("aggregation"))
|
|
98
112
|
data["name"] = "_".join(components)
|
|
99
113
|
super().__init__(**data)
|
|
100
114
|
|
|
101
|
-
def
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
& (
|
|
105
|
-
diff_list.between(
|
|
106
|
-
self.lower_bound or -np.inf, self.upper_bound or np.inf, inclusive=self.inclusive or "left"
|
|
107
|
-
)
|
|
108
|
-
)
|
|
109
|
-
].aggregate(self.aggregation)
|
|
115
|
+
def _agg(self, x):
    """Aggregate only the values inside ``[lower_bound, upper_bound)``.

    A bound of ``None`` means "unbounded" on that side. Bounds are compared
    with ``is None`` rather than by truthiness so that an explicit bound of
    ``0`` is honoured instead of being treated as missing.
    """
    lower = -np.inf if self.lower_bound is None else self.lower_bound
    upper = np.inf if self.upper_bound is None else self.upper_bound

    # Half-open window: lower bound inclusive, upper bound exclusive.
    in_window = (x >= lower) & (x < upper)
    return super()._agg(x[in_window])
|
upgini/features_enricher.py
CHANGED
|
@@ -94,7 +94,7 @@ try:
|
|
|
94
94
|
except Exception:
|
|
95
95
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
96
96
|
|
|
97
|
-
from upgini.utils.target_utils import define_task
|
|
97
|
+
from upgini.utils.target_utils import calculate_psi, define_task
|
|
98
98
|
from upgini.utils.warning_counter import WarningCounter
|
|
99
99
|
from upgini.version_validator import validate_version
|
|
100
100
|
|
|
@@ -2226,14 +2226,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2226
2226
|
validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
|
|
2227
2227
|
)
|
|
2228
2228
|
|
|
2229
|
-
|
|
2229
|
+
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2230
|
+
has_date = maybe_date_column is not None
|
|
2230
2231
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2231
2232
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2232
2233
|
|
|
2233
|
-
df = self.__handle_index_search_keys(df, self.fit_search_keys)
|
|
2234
|
-
|
|
2235
|
-
df = self.__correct_target(df)
|
|
2236
|
-
|
|
2237
2234
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
2238
2235
|
self.loss, model_task_type, self.runtime_parameters, self.logger
|
|
2239
2236
|
)
|
|
@@ -2245,6 +2242,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2245
2242
|
eval_df[EVAL_SET_INDEX] = idx + 1
|
|
2246
2243
|
df = pd.concat([df, eval_df])
|
|
2247
2244
|
|
|
2245
|
+
df = self.__correct_target(df)
|
|
2246
|
+
|
|
2247
|
+
df = self.__handle_index_search_keys(df, self.fit_search_keys)
|
|
2248
|
+
|
|
2249
|
+
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2250
|
+
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2251
|
+
|
|
2248
2252
|
if DEFAULT_INDEX in df.columns:
|
|
2249
2253
|
msg = self.bundle.get("unsupported_index_column")
|
|
2250
2254
|
self.logger.info(msg)
|
|
@@ -3567,6 +3571,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3567
3571
|
self.logger.warning(msg)
|
|
3568
3572
|
print(msg)
|
|
3569
3573
|
|
|
3574
|
+
def _validate_PSI(self, df: pd.DataFrame):
    """Warn when the target distribution drifts, as measured by PSI.

    Two checks are made, each with a threshold of 0.2:

    1. first half of the train rows against the second half (rows are taken
       in the order they appear in ``df``);
    2. all train rows against the rows of evaluation set 1, when ``df`` has
       an ``EVAL_SET_INDEX`` column. Other evaluation sets are not checked.

    A failed check increments ``self.warning_counter`` and both prints and
    logs the corresponding message from ``self.bundle``.
    """
    target = self.TARGET_NAME

    def _warn_if_unstable(psi, message_key):
        # Shared reporting path for both checks.
        if psi > 0.2:
            self.warning_counter.increment()
            msg = self.bundle.get(message_key).format(psi)
            print(msg)
            self.logger.warning(msg)

    has_eval_column = EVAL_SET_INDEX in df.columns
    train = df.query(f"{EVAL_SET_INDEX} == 0") if has_eval_column else df
    eval1 = df.query(f"{EVAL_SET_INDEX} == 1") if has_eval_column else None

    # 1. Check train PSI
    half_train = round(len(train) / 2)
    first_half, second_half = train[:half_train], train[half_train:]
    _warn_if_unstable(calculate_psi(first_half[target], second_half[target]), "train_unstable_target")

    # 2. Check train-test PSI
    if eval1 is not None:
        _warn_if_unstable(calculate_psi(train[target], eval1[target]), "eval_unstable_target")
|
|
3601
|
+
|
|
3570
3602
|
def _dump_python_libs(self):
|
|
3571
3603
|
try:
|
|
3572
3604
|
from pip._internal.operations.freeze import freeze
|
|
@@ -111,7 +111,9 @@ x_is_empty=X is empty
|
|
|
111
111
|
y_is_empty=y is empty
|
|
112
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
113
|
missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
-
x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
114
|
+
x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
115
|
+
train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
116
|
+
eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
115
117
|
# eval set validation
|
|
116
118
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
117
119
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -198,7 +200,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
198
200
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
199
201
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
200
202
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
201
|
-
target_type_detected
|
|
203
|
+
target_type_detected=\nDetected task type: {}\n
|
|
202
204
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
203
205
|
all_ok_community_invite=❓ Support request
|
|
204
206
|
too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
upgini/utils/target_utils.py
CHANGED
|
@@ -177,3 +177,21 @@ def balance_undersample(
|
|
|
177
177
|
|
|
178
178
|
logger.info(f"Shape after rebalance resampling: {resampled_data}")
|
|
179
179
|
return resampled_data
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
    """Return the Population Stability Index between two numeric samples.

    Both samples are split into two equal-width bins spanning the combined
    value range, and PSI is computed as
    ``sum((p_expected - p_actual) * ln(p_expected / p_actual))`` over the bins.

    Args:
        expected: Reference sample (for example the train target).
        actual: Sample compared against the reference.

    Returns:
        A finite, non-negative float. ``0.0`` is returned when PSI cannot be
        computed: either sample has no non-null values, or all values are
        equal so no bins can be formed.
    """
    # Floor for bin shares: an empty bin would otherwise give log(0) or a
    # division by zero, turning the result into inf/nan.
    min_share = 1e-4

    # NaNs never fall into a bin, so dropping them up front changes nothing
    # for regular input but makes the emptiness check reliable.
    expected = expected.dropna()
    actual = actual.dropna()
    if expected.empty or actual.empty:
        return 0.0

    combined = pd.concat([expected, actual])
    low = combined.min()
    high = combined.max()
    if not low < high:
        # Constant target: bin edges would be duplicated and binning would raise.
        return 0.0

    # Define the bins for the target variable
    bins = [low, (low + high) / 2, high]

    # Calculate the base distribution
    expected_share = expected.value_counts(bins=bins, normalize=True).sort_index().values

    # Calculate the target distribution
    actual_share = actual.value_counts(bins=bins, normalize=True).sort_index().values

    expected_share = np.clip(expected_share, min_share, None)
    actual_share = np.clip(actual_share, min_share, None)

    # Calculate the PSI
    return float(np.sum((expected_share - actual_share) * np.log(expected_share / actual_share)))
|
|
@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
3
|
upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=poGGf5MZgangMFmfTxRWtE6FDPDy5VUtXLmW2tGiorI,174170
|
|
6
6
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
7
7
|
upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
|
|
8
8
|
upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
|
|
@@ -14,7 +14,7 @@ upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0P
|
|
|
14
14
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
|
|
16
16
|
upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
|
|
17
|
-
upgini/autofe/date.py,sha256=
|
|
17
|
+
upgini/autofe/date.py,sha256=ffASAn0CQiYRovRrTRLjnPmr_3Xy7GlGLieZv7yBoC0,4218
|
|
18
18
|
upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
|
|
19
19
|
upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
|
|
20
20
|
upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
|
|
@@ -28,7 +28,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
28
28
|
upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
|
|
29
29
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
30
30
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
31
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
31
|
+
upgini/resource_bundle/strings.properties,sha256=00KNv1A3rxXioktqB9o_V_zX0etC2LZO7NBIEsCoNNQ,26087
|
|
32
32
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
33
33
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
34
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
@@ -52,11 +52,11 @@ upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,4
|
|
|
52
52
|
upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
|
|
53
53
|
upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
|
|
54
54
|
upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
|
|
55
|
-
upgini/utils/target_utils.py,sha256=
|
|
55
|
+
upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
|
|
56
56
|
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
57
57
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
58
|
-
upgini-1.1.
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
58
|
+
upgini-1.1.267a3254.post3.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
59
|
+
upgini-1.1.267a3254.post3.dist-info/METADATA,sha256=8-ODvHx4kAE3IrjYFRmIsThFJ8nIeBsD1BWjP6iuDno,48167
|
|
60
|
+
upgini-1.1.267a3254.post3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
61
|
+
upgini-1.1.267a3254.post3.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
62
|
+
upgini-1.1.267a3254.post3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|