upgini 1.2.112__py3-none-any.whl → 1.2.113a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +84 -21
- upgini/resource_bundle/strings.properties +7 -0
- {upgini-1.2.112.dist-info → upgini-1.2.113a1.dist-info}/METADATA +1 -1
- {upgini-1.2.112.dist-info → upgini-1.2.113a1.dist-info}/RECORD +7 -7
- {upgini-1.2.112.dist-info → upgini-1.2.113a1.dist-info}/WHEEL +0 -0
- {upgini-1.2.112.dist-info → upgini-1.2.113a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.112"
|
1
|
+
__version__ = "1.2.113a1"
|
upgini/features_enricher.py
CHANGED
@@ -415,6 +415,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
415
415
|
y: Union[pd.Series, np.ndarray, List],
|
416
416
|
eval_set: Optional[Union[List[tuple], tuple]] = None,
|
417
417
|
*args,
|
418
|
+
oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
418
419
|
exclude_features_sources: Optional[List[str]] = None,
|
419
420
|
calculate_metrics: Optional[bool] = None,
|
420
421
|
estimator: Optional[Any] = None,
|
@@ -443,6 +444,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
443
444
|
eval_set: List[tuple], optional (default=None)
|
444
445
|
List of pairs (X, y) for validation.
|
445
446
|
|
447
|
+
oot: pandas.DataFrame of shape (n_samples, n_features)
|
448
|
+
Out of time data.
|
449
|
+
|
446
450
|
importance_threshold: float, optional (default=None)
|
447
451
|
Minimum SHAP value to select a feature. Default value is 0.0.
|
448
452
|
|
@@ -508,7 +512,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
508
512
|
X,
|
509
513
|
y,
|
510
514
|
self.eval_set,
|
511
|
-
|
515
|
+
oot=oot,
|
516
|
+
progress_bar=progress_bar,
|
512
517
|
start_time=start_time,
|
513
518
|
exclude_features_sources=exclude_features_sources,
|
514
519
|
calculate_metrics=calculate_metrics,
|
@@ -563,6 +568,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
563
568
|
y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
|
564
569
|
eval_set: Optional[Union[List[tuple], tuple]] = None,
|
565
570
|
*args,
|
571
|
+
oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
566
572
|
exclude_features_sources: Optional[List[str]] = None,
|
567
573
|
keep_input: bool = True,
|
568
574
|
importance_threshold: Optional[float] = None,
|
@@ -667,7 +673,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
667
673
|
X,
|
668
674
|
y,
|
669
675
|
self.eval_set,
|
670
|
-
|
676
|
+
oot=oot,
|
677
|
+
progress_bar=progress_bar,
|
671
678
|
start_time=start_time,
|
672
679
|
exclude_features_sources=exclude_features_sources,
|
673
680
|
calculate_metrics=calculate_metrics,
|
@@ -940,7 +947,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
940
947
|
):
|
941
948
|
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
942
949
|
|
943
|
-
validated_X, validated_y, validated_eval_set = self._validate_train_eval(
|
950
|
+
validated_X, validated_y, validated_eval_set, _ = self._validate_train_eval(
|
944
951
|
effective_X, effective_y, effective_eval_set
|
945
952
|
)
|
946
953
|
|
@@ -1535,7 +1542,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1535
1542
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
1536
1543
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
1537
1544
|
checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
1538
|
-
validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
|
1545
|
+
validated_X, validated_y, validated_eval_set, _ = self._validate_train_eval(X, y, checked_eval_set)
|
1539
1546
|
|
1540
1547
|
sampled_data = self._get_enriched_for_metrics(
|
1541
1548
|
trace_id,
|
@@ -1931,11 +1938,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1931
1938
|
)
|
1932
1939
|
|
1933
1940
|
# Handle eval sets extraction based on EVAL_SET_INDEX
|
1934
|
-
if EVAL_SET_INDEX in enriched_Xy.columns:
|
1935
|
-
|
1936
|
-
if 0 in eval_set_indices:
|
1937
|
-
eval_set_indices.remove(0)
|
1938
|
-
for eval_set_index in eval_set_indices:
|
1941
|
+
if EVAL_SET_INDEX in enriched_Xy.columns and eval_set is not None:
|
1942
|
+
for eval_set_index in range(1, len(eval_set) + 1):
|
1939
1943
|
enriched_eval_sets[eval_set_index] = enriched_Xy.loc[
|
1940
1944
|
enriched_Xy[EVAL_SET_INDEX] == eval_set_index
|
1941
1945
|
].copy()
|
@@ -2047,7 +2051,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2047
2051
|
)
|
2048
2052
|
|
2049
2053
|
def __combine_train_and_eval_sets(
|
2050
|
-
self,
|
2054
|
+
self,
|
2055
|
+
X: pd.DataFrame,
|
2056
|
+
y: Optional[pd.Series] = None,
|
2057
|
+
eval_set: Optional[List[tuple]] = None,
|
2058
|
+
oot: Optional[pd.DataFrame] = None,
|
2051
2059
|
) -> pd.DataFrame:
|
2052
2060
|
df = X.copy()
|
2053
2061
|
if y is not None:
|
@@ -2063,6 +2071,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2063
2071
|
eval_df_with_index[TARGET] = eval_y
|
2064
2072
|
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
2065
2073
|
df = pd.concat([df, eval_df_with_index])
|
2074
|
+
|
2075
|
+
if oot is not None:
|
2076
|
+
oot_df_with_index = oot.copy()
|
2077
|
+
oot_df_with_index[EVAL_SET_INDEX] = -1
|
2078
|
+
df = pd.concat([df, oot_df_with_index])
|
2066
2079
|
|
2067
2080
|
return df
|
2068
2081
|
|
@@ -2115,12 +2128,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
2115
2128
|
) -> Dict[int, Tuple]:
|
2116
2129
|
eval_set_sampled_dict = {}
|
2117
2130
|
|
2118
|
-
for idx in range(eval_set_len):
|
2119
|
-
enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
2131
|
+
for idx in range(1, eval_set_len + 1):
|
2132
|
+
enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx}")
|
2120
2133
|
eval_x_sampled = enriched_eval_xy[x_columns].copy()
|
2121
2134
|
eval_y_sampled = enriched_eval_xy[TARGET].copy()
|
2122
2135
|
enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
|
2123
|
-
eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
2136
|
+
eval_set_sampled_dict[idx - 1] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
2124
2137
|
|
2125
2138
|
return eval_set_sampled_dict
|
2126
2139
|
|
@@ -2312,10 +2325,10 @@ if response.status_code == 200:
|
|
2312
2325
|
with MDC(trace_id=trace_id, search_id=search_id):
|
2313
2326
|
self.logger.info("Start transform")
|
2314
2327
|
|
2315
|
-
validated_X, validated_y,
|
2316
|
-
X, y,
|
2328
|
+
validated_X, validated_y, _, _ = self._validate_train_eval(
|
2329
|
+
X, y, is_transform=True
|
2317
2330
|
)
|
2318
|
-
df = self.__combine_train_and_eval_sets(validated_X, validated_y
|
2331
|
+
df = self.__combine_train_and_eval_sets(validated_X, validated_y)
|
2319
2332
|
|
2320
2333
|
validated_Xy = df.copy()
|
2321
2334
|
|
@@ -2790,9 +2803,10 @@ if response.status_code == 200:
|
|
2790
2803
|
X: Union[pd.DataFrame, pd.Series, np.ndarray],
|
2791
2804
|
y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
|
2792
2805
|
eval_set: Optional[List[tuple]],
|
2806
|
+
*,
|
2807
|
+
oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
2793
2808
|
progress_bar: Optional[ProgressBar],
|
2794
2809
|
start_time: int,
|
2795
|
-
*,
|
2796
2810
|
exclude_features_sources: Optional[List[str]] = None,
|
2797
2811
|
calculate_metrics: Optional[bool],
|
2798
2812
|
scoring: Union[Callable, str, None],
|
@@ -2813,7 +2827,7 @@ if response.status_code == 200:
|
|
2813
2827
|
self.fit_dropped_features = set()
|
2814
2828
|
self.fit_generated_features = []
|
2815
2829
|
|
2816
|
-
validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
|
2830
|
+
validated_X, validated_y, validated_eval_set, validated_oot = self._validate_train_eval(X, y, eval_set, oot)
|
2817
2831
|
|
2818
2832
|
is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
|
2819
2833
|
if is_demo_dataset:
|
@@ -2854,6 +2868,7 @@ if response.status_code == 200:
|
|
2854
2868
|
validated_X,
|
2855
2869
|
validated_y,
|
2856
2870
|
validated_eval_set,
|
2871
|
+
validated_oot,
|
2857
2872
|
exclude_features_sources=exclude_features_sources,
|
2858
2873
|
calculate_metrics=calculate_metrics,
|
2859
2874
|
scoring=scoring,
|
@@ -2861,7 +2876,7 @@ if response.status_code == 200:
|
|
2861
2876
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
2862
2877
|
)
|
2863
2878
|
|
2864
|
-
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
|
2879
|
+
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set, validated_oot)
|
2865
2880
|
self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
|
2866
2881
|
|
2867
2882
|
self.fit_search_keys = self.search_keys.copy()
|
@@ -3288,12 +3303,14 @@ if response.status_code == 200:
|
|
3288
3303
|
X: pd.DataFrame,
|
3289
3304
|
y: Optional[pd.Series] = None,
|
3290
3305
|
eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
|
3306
|
+
oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
3291
3307
|
is_transform: bool = False,
|
3292
3308
|
) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
|
3293
3309
|
validated_X = self._validate_X(X, is_transform)
|
3294
3310
|
validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
|
3295
3311
|
validated_eval_set = self._validate_eval_set(validated_X, eval_set)
|
3296
|
-
|
3312
|
+
validated_oot = self._validate_oot(validated_X, oot)
|
3313
|
+
return validated_X, validated_y, validated_eval_set, validated_oot
|
3297
3314
|
|
3298
3315
|
def _encode_id_columns(
|
3299
3316
|
self,
|
@@ -3429,6 +3446,49 @@ if response.status_code == 200:
|
|
3429
3446
|
return None
|
3430
3447
|
return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
|
3431
3448
|
|
3449
|
+
def _validate_oot(self, X: pd.DataFrame, oot: Optional[pd.DataFrame]):
|
3450
|
+
if oot is None:
|
3451
|
+
return None
|
3452
|
+
|
3453
|
+
if _num_samples(oot) == 0:
|
3454
|
+
raise ValidationError(self.bundle.get("oot_is_empty"))
|
3455
|
+
if isinstance(oot, pd.DataFrame):
|
3456
|
+
if isinstance(oot.columns, pd.MultiIndex) or isinstance(oot.index, pd.MultiIndex):
|
3457
|
+
raise ValidationError(self.bundle.get("oot_multiindex_unsupported"))
|
3458
|
+
validated_oot = oot.copy()
|
3459
|
+
elif isinstance(oot, pd.Series):
|
3460
|
+
validated_oot = oot.to_frame()
|
3461
|
+
elif isinstance(oot, (list, np.ndarray)):
|
3462
|
+
validated_oot = pd.DataFrame(oot)
|
3463
|
+
renaming = {c: str(c) for c in validated_oot.columns}
|
3464
|
+
validated_oot = validated_oot.rename(columns=renaming)
|
3465
|
+
else:
|
3466
|
+
raise ValidationError(self.bundle.get("unsupported_type_oot").format(type(oot)))
|
3467
|
+
|
3468
|
+
if not validated_oot.index.is_unique:
|
3469
|
+
raise ValidationError(self.bundle.get("non_unique_index_oot"))
|
3470
|
+
|
3471
|
+
if self.exclude_columns is not None:
|
3472
|
+
validated_oot = validated_oot.drop(columns=self.exclude_columns, errors="ignore")
|
3473
|
+
|
3474
|
+
if self.baseline_score_column:
|
3475
|
+
validated_oot[self.baseline_score_column] = validated_oot[self.baseline_score_column].astype(
|
3476
|
+
"float64", errors="ignore"
|
3477
|
+
)
|
3478
|
+
|
3479
|
+
if validated_oot.columns.to_list() != X.columns.to_list():
|
3480
|
+
if set(validated_oot.columns.to_list()) == set(X.columns.to_list()):
|
3481
|
+
validated_oot = validated_oot[X.columns.to_list()]
|
3482
|
+
else:
|
3483
|
+
raise ValidationError(self.bundle.get("oot_and_x_diff_shape"))
|
3484
|
+
|
3485
|
+
# Check for duplicates between train and eval sets by comparing all values
|
3486
|
+
train_eval_intersection = pd.merge(X, validated_oot, how="inner")
|
3487
|
+
if len(train_eval_intersection) > 0:
|
3488
|
+
raise ValidationError(self.bundle.get("oot_has_train_samples"))
|
3489
|
+
|
3490
|
+
return validated_oot
|
3491
|
+
|
3432
3492
|
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
|
3433
3493
|
if len(eval_pair) != 2:
|
3434
3494
|
raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
@@ -3600,6 +3660,7 @@ if response.status_code == 200:
|
|
3600
3660
|
X: pd.DataFrame,
|
3601
3661
|
y: Union[pd.Series, np.ndarray, list, None] = None,
|
3602
3662
|
eval_set: Optional[List[tuple]] = None,
|
3663
|
+
oot: Optional[pd.DataFrame] = None,
|
3603
3664
|
exclude_features_sources: Optional[List[str]] = None,
|
3604
3665
|
calculate_metrics: Optional[bool] = None,
|
3605
3666
|
cv: Optional[Any] = None,
|
@@ -3668,6 +3729,8 @@ if response.status_code == 200:
|
|
3668
3729
|
self.logger.info(
|
3669
3730
|
f"First 10 rows of the eval_y_{idx} with shape {_num_samples(eval_y)}:\n{sample(eval_y)}"
|
3670
3731
|
)
|
3732
|
+
if oot is not None:
|
3733
|
+
self.logger.info(f"First 10 rows of the oot with shape {oot.shape}:\n{sample(oot)}")
|
3671
3734
|
|
3672
3735
|
do_without_pandas_limits(print_datasets_sample)
|
3673
3736
|
|
@@ -4577,7 +4640,7 @@ if response.status_code == 200:
|
|
4577
4640
|
print(msg)
|
4578
4641
|
|
4579
4642
|
def _validate_PSI(self, df: pd.DataFrame):
|
4580
|
-
if EVAL_SET_INDEX in df.columns:
|
4643
|
+
if EVAL_SET_INDEX in df.columns and (df[EVAL_SET_INDEX] == 1).any():
|
4581
4644
|
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
4582
4645
|
eval1 = df.query(f"{EVAL_SET_INDEX} == 1")
|
4583
4646
|
else:
|
@@ -139,6 +139,13 @@ eval_x_is_empty=X in eval_set is empty.
|
|
139
139
|
eval_y_is_empty=y in eval_set is empty.
|
140
140
|
x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
141
141
|
eval_x_has_train_samples=Eval set X has rows that are present in train set X
|
142
|
+
# OOT
|
143
|
+
oot_is_empty=Out of time data is empty
|
144
|
+
oot_multiindex_unsupported=Multi index in out of time data is not supported
|
145
|
+
unsupported_type_oot=Unsupported type of out of time data: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray
|
146
|
+
non_unique_index_oot=Out of time data has non unique index. Use reset_index
|
147
|
+
oot_and_x_diff_shape=Out of time data has different columns than train set X
|
148
|
+
oot_has_train_samples=Out of time data has rows that are present in train set X
|
142
149
|
|
143
150
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
144
151
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=H2pDgAfR-AAZibgkBF4HysX4COuPa8QPX6H6srUsYKU,26
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=xFi0a-A3uvtxVwFM6JOyitkEPd1I2slIBj5SWfys3hQ,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=lWcxE606_hSGXcFG3lXKzWCwNaAVHN0_j9AUWi1CTkc,224671
|
7
7
|
upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
|
8
8
|
upgini/metadata.py,sha256=9_0lFEWPpIHRBW-xWYSEcwPzICTC6_bQ6dUUlE75Xns,12773
|
9
9
|
upgini/metrics.py,sha256=V2SP6NS5bfFHzRqufeKVsCXME1yG4t_8Dmk2E3zKdYk,45715
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=2ohGjki3qn1fkRKDCFm8Hy1DU-3HEP_b3AVHBJka4vg,29147
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
77
|
-
upgini-1.2.
|
74
|
+
upgini-1.2.113a1.dist-info/METADATA,sha256=An6LMif5xPvDV-osNxUvgAaz3xsvTnsQ4CnsNF5b6AQ,49531
|
75
|
+
upgini-1.2.113a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
76
|
+
upgini-1.2.113a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.113a1.dist-info/RECORD,,
|
File without changes
|
File without changes
|