upgini 1.2.117__py3-none-any.whl → 1.2.118__py3-none-any.whl

This diff shows the changes between publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.117"
1
+ __version__ = "1.2.118"
@@ -1423,8 +1423,15 @@ class FeaturesEnricher(TransformerMixin):
1423
1423
  # Find latest eval set or earliest if all eval sets are before train set
1424
1424
  date_column = self._get_date_column(search_keys)
1425
1425
 
1426
- # Get minimum date from main dataset X
1427
- main_min_date = X[date_column].dropna().min()
1426
+ x_date = X[date_column].dropna()
1427
+ if not is_numeric_dtype(x_date):
1428
+ x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6
1429
+ main_min_date = x_date.min()
1430
+
1431
+ for eval_x, _ in eval_set:
1432
+ eval_x_date = eval_x[date_column].dropna()
1433
+ if not is_numeric_dtype(eval_x_date):
1434
+ eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
1428
1435
 
1429
1436
  # Find minimum date for each eval_set and compare with main dataset
1430
1437
  eval_dates = []
@@ -1433,8 +1440,11 @@ class FeaturesEnricher(TransformerMixin):
1433
1440
  if len(eval_x) < 1000:
1434
1441
  self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
1435
1442
  continue
1436
- eval_min_date = eval_x[date_column].dropna().min()
1437
- eval_max_date = eval_x[date_column].dropna().max()
1443
+ eval_x_date = eval_x[date_column].dropna()
1444
+ if not is_numeric_dtype(eval_x_date):
1445
+ eval_x_date = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
1446
+ eval_min_date = eval_x_date.min()
1447
+ eval_max_date = eval_x_date.max()
1438
1448
  eval_dates.append((i, eval_min_date, eval_max_date))
1439
1449
 
1440
1450
  if not eval_dates:
@@ -1460,6 +1470,10 @@ class FeaturesEnricher(TransformerMixin):
1460
1470
  checking_eval_set_df = checking_eval_set_df.copy()
1461
1471
 
1462
1472
  checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
1473
+ if not is_numeric_dtype(checking_eval_set_df[date_column]):
1474
+ checking_eval_set_df[date_column] = (
1475
+ pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
1476
+ )
1463
1477
 
1464
1478
  psi_values_sparse = calculate_sparsity_psi(
1465
1479
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -3708,6 +3722,25 @@ if response.status_code == 200:
3708
3722
  else:
3709
3723
  raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))
3710
3724
 
3725
+ if any(validated_eval_X.dtypes != X.dtypes):
3726
+ x_types = X.dtypes
3727
+ eval_types = validated_eval_X.dtypes
3728
+ # Find columns with different types
3729
+ diff_cols = [
3730
+ (col, x_types[col], eval_types[col])
3731
+ for col in x_types.index
3732
+ if x_types[col] != eval_types[col]
3733
+ ]
3734
+ diff_col_names = [col for col, _, _ in diff_cols]
3735
+ # print columns with different types
3736
+ print("Columns with different types:")
3737
+ for col, x_type, eval_type in diff_cols:
3738
+ print("-" * 50)
3739
+ print(f"Column: {col}")
3740
+ print(f"X type: {x_type}")
3741
+ print(f"Eval_set type: {eval_type}")
3742
+ raise ValidationError(self.bundle.get("eval_x_and_x_diff_dtypes").format(diff_col_names))
3743
+
3711
3744
  if _num_samples(validated_eval_X) != _num_samples(eval_y):
3712
3745
  raise ValidationError(
3713
3746
  self.bundle.get("x_and_y_diff_size_eval_set").format(
@@ -4420,7 +4453,8 @@ if response.status_code == 200:
4420
4453
 
4421
4454
  if len(features_info) > 0:
4422
4455
  self.features_info = pd.DataFrame(features_info)
4423
- if self.features_info[self.bundle.get("features_info_psi")].isna().all():
4456
+ # If all psi values are 0 or null, drop psi column
4457
+ if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
4424
4458
  self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
4425
4459
  self._features_info_without_links = pd.DataFrame(features_info_without_links)
4426
4460
  self._internal_features_info = pd.DataFrame(internal_features_info)
@@ -123,6 +123,7 @@ unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of
123
123
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
124
124
  unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
125
125
  eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
126
+ eval_x_and_x_diff_dtypes=The column types in eval_set are different from the column types in X: {}
126
127
  unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
127
128
  y_is_constant_eval_set=y in eval_set is a constant. Relevant feature search requires a non-constant y
128
129
  x_and_y_diff_size_eval_set=X and y in eval_set contain different number of rows: {}, {}
upgini/utils/psi.py CHANGED
@@ -82,9 +82,6 @@ def calculate_features_psi(
82
82
  ) -> dict[str, float]:
83
83
  empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
84
84
 
85
- if not is_numeric_dtype(df[date_column]):
86
- df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
87
-
88
85
  # Filter out rows with missing dates
89
86
  df = df[df[date_column].notna()].copy()
90
87
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.117
3
+ Version: 1.2.118
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=MY7Fho70n94XaciDTteAp4gxVleTVF6idcG3aECjijI,24
1
+ upgini/__about__.py,sha256=q02CtZPV2DVtBrD7C_RFfsbI15l7QGmxfUzcCx5UykM,24
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=iYay-Ye5WGntieg3X7uyg9W3x_1FUELrmhJnJIvQMeI,228897
6
+ upgini/features_enricher.py,sha256=zGWU8l6dWZwV1fsQD-j9tTKP9X6mUO9HPnwcGrJFS8o,230596
7
7
  upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
8
8
  upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
38
38
  upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=VbfRqgg2kuWqmUyieGNxuY5dy0TpOQ-L3fHlWB7o2_w,29186
41
+ upgini/resource_bundle/strings.properties,sha256=cNeVkWZMyjGCYGqmOOeJqisqPSEBtmfIw_U1rmgQw4w,29285
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -66,7 +66,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
66
66
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
67
67
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
68
68
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
69
- upgini/utils/psi.py,sha256=vuVAo5-4HplpblQA7BP8bouI8VQDEb___MW98WQ6ik8,11258
69
+ upgini/utils/psi.py,sha256=vw8QEktXSx29IiMJMxmDeFU_4lJInJBXt_XL5Muekzo,11114
70
70
  upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
71
71
  upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
72
72
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
74
74
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.117.dist-info/METADATA,sha256=3onXIkh96-rh_Q0DIuHUihe07upcEFukS5WsVS1R2yc,50743
78
- upgini-1.2.117.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
- upgini-1.2.117.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.117.dist-info/RECORD,,
77
+ upgini-1.2.118.dist-info/METADATA,sha256=fEdGQaho0hyf9dXC_fL1AxuJFI46-zCMvm_U_O6hOec,50743
78
+ upgini-1.2.118.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
+ upgini-1.2.118.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.118.dist-info/RECORD,,