upgini 1.2.117a1__tar.gz → 1.2.119__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {upgini-1.2.117a1 → upgini-1.2.119}/.gitignore +1 -0
  2. {upgini-1.2.117a1 → upgini-1.2.119}/PKG-INFO +1 -1
  3. upgini-1.2.119/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/features_enricher.py +103 -40
  5. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/metrics.py +3 -2
  6. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/resource_bundle/strings.properties +1 -0
  7. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/display_utils.py +12 -9
  8. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/psi.py +0 -3
  9. upgini-1.2.117a1/src/upgini/__about__.py +0 -1
  10. {upgini-1.2.117a1 → upgini-1.2.119}/LICENSE +0 -0
  11. {upgini-1.2.117a1 → upgini-1.2.119}/README.md +0 -0
  12. {upgini-1.2.117a1 → upgini-1.2.119}/pyproject.toml +0 -0
  13. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/all_operators.py +0 -0
  19. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/operator.py +0 -0
  24. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/__init__.py +0 -0
  25. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/base.py +0 -0
  26. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/cross.py +0 -0
  27. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/delta.py +0 -0
  28. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/lag.py +0 -0
  29. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/roll.py +0 -0
  30. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/trend.py +0 -0
  31. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/timeseries/volatility.py +0 -0
  32. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/unary.py +0 -0
  33. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/utils.py +0 -0
  34. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/autofe/vector.py +0 -0
  35. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/data_source/__init__.py +0 -0
  36. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/data_source/data_source_publisher.py +0 -0
  37. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/dataset.py +0 -0
  38. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/errors.py +0 -0
  39. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/http.py +0 -0
  40. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/mdc/__init__.py +0 -0
  41. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/mdc/context.py +0 -0
  42. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/metadata.py +0 -0
  43. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/normalizer/__init__.py +0 -0
  44. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/normalizer/normalize_utils.py +0 -0
  45. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/resource_bundle/__init__.py +0 -0
  46. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/resource_bundle/exceptions.py +0 -0
  47. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/config.py +0 -0
  59. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/country_utils.py +0 -0
  60. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/custom_loss_utils.py +0 -0
  61. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/cv_utils.py +0 -0
  62. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/datetime_utils.py +0 -0
  63. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/deduplicate_utils.py +0 -0
  64. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/hash_utils.py +0 -0
  70. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/postal_code_utils.py +0 -0
  74. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/progress_bar.py +0 -0
  75. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/version_validator.py +0 -0

{upgini-1.2.117a1 → upgini-1.2.119}/.gitignore
@@ -111,6 +111,7 @@ env10/
 .env10/
 .env310/
 env11/
+env12/
 venv/
 ENV/
 env.bak/

{upgini-1.2.117a1 → upgini-1.2.119}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.117a1
+Version: 1.2.119
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.119/src/upgini/__about__.py
@@ -0,0 +1 @@
+__version__ = "1.2.119"

{upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/features_enricher.py
@@ -854,7 +854,7 @@ class FeaturesEnricher(TransformerMixin):
             raise e
         finally:
             self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
-
+
         return result
 
     def calculate_metrics(
@@ -1423,8 +1423,15 @@ class FeaturesEnricher(TransformerMixin):
         # Find latest eval set or earliest if all eval sets are before train set
         date_column = self._get_date_column(search_keys)
 
-        # Get minimum date from main dataset X
-        main_min_date = X[date_column].dropna().min()
+        x_date = X[date_column].dropna()
+        if not is_numeric_dtype(x_date):
+            x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6
+        main_min_date = x_date.min()
+
+        for eval_x, _ in eval_set:
+            eval_x_date = eval_x[date_column].dropna()
+            if not is_numeric_dtype(eval_x_date):
+                eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
 
         # Find minimum date for each eval_set and compare with main dataset
         eval_dates = []
@@ -1433,8 +1440,11 @@ class FeaturesEnricher(TransformerMixin):
             if len(eval_x) < 1000:
                 self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
                 continue
-            eval_min_date = eval_x[date_column].dropna().min()
-            eval_max_date = eval_x[date_column].dropna().max()
+            eval_x_date = eval_x[date_column].dropna()
+            if not is_numeric_dtype(eval_x_date):
+                eval_x_date = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
+            eval_min_date = eval_x_date.min()
+            eval_max_date = eval_x_date.max()
             eval_dates.append((i, eval_min_date, eval_max_date))
 
         if not eval_dates:
@@ -1460,6 +1470,10 @@ class FeaturesEnricher(TransformerMixin):
             checking_eval_set_df = checking_eval_set_df.copy()
 
         checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+        if not is_numeric_dtype(checking_eval_set_df[date_column]):
+            checking_eval_set_df[date_column] = (
+                pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+            )
 
         psi_values_sparse = calculate_sparsity_psi(
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
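
The conversion repeated in these hunks normalizes a date column of any input dtype to day-floored milliseconds since the Unix epoch, so train and eval dates can be compared numerically. A minimal standalone sketch of the pattern (the sample series is illustrative, not from the package):

    import numpy as np
    import pandas as pd
    from pandas.api.types import is_numeric_dtype

    dates = pd.Series(["2024-01-15 13:45", "2024-01-16 08:00", None])

    x_date = dates.dropna()
    if not is_numeric_dtype(x_date):
        # datetime64[ns], floored to day -> int64 nanoseconds -> float milliseconds
        x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6

    print(x_date.min())  # 1705276800000.0, i.e. 2024-01-15 00:00:00 UTC in ms
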
@@ -1727,7 +1741,8 @@ class FeaturesEnricher(TransformerMixin):
             c
             for c in (validated_X.columns.to_list() + generated_features)
             if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
-            and c not in (
+            and c
+            not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -2201,7 +2216,8 @@ class FeaturesEnricher(TransformerMixin):
             progress_callback=progress_callback,
             add_fit_system_record_id=True,
         )
-        if enriched_df is None:
+        if enriched_df is None or len(enriched_df) == 0 or len(enriched_df.columns) == 0:
+            self.logger.warning(f"Empty enriched dataframe returned: {enriched_df}, returning None")
             return None
 
         x_columns = [
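
The widened guard treats an enriched frame with zero rows or zero columns as missing, not only None. A toy illustration of the three cases it now catches:

    import pandas as pd

    no_rows = pd.DataFrame({"a": []})        # zero rows
    no_cols = pd.DataFrame(index=[0, 1, 2])  # zero columns

    for df in (None, no_rows, no_cols):
        is_empty = df is None or len(df) == 0 or len(df.columns) == 0
        print(is_empty)  # True in all three cases
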
@@ -2505,7 +2521,7 @@ if response.status_code == 200:
         if len(self.feature_names_) == 0:
             msg = self.bundle.get("no_important_features_for_transform")
             self.__log_warning(msg, show_support_link=True)
-            return X, {c: c for c in X.columns}, [], dict()
+            return None, {}, [], self.search_keys
 
         self.__validate_search_keys(self.search_keys, self.search_id)
@@ -2513,7 +2529,7 @@ if response.status_code == 200:
             msg = self.bundle.get("transform_with_paid_features")
             self.logger.warning(msg)
             self.__display_support_link(msg)
-            return None, {c: c for c in X.columns}, [], {}
+            return None, {}, [], self.search_keys
 
         features_meta = self._search_task.get_all_features_metadata_v2()
         online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
@@ -2536,7 +2552,7 @@ if response.status_code == 200:
                 self.logger.warning(msg)
                 print(msg)
                 show_request_quote_button()
-                return None, {c: c for c in X.columns}, [], {}
+                return None, {}, [], {}
             else:
                 msg = self.bundle.get("transform_usage_info").format(
                     transform_usage.limit, transform_usage.transformed_rows
@@ -2606,14 +2622,33 @@ if response.status_code == 200:
 
         # If there are no external features, we don't call backend on transform
         external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
-        if not external_features:
+        if len(external_features) == 0:
             self.logger.warning(
                 "No external features found, returning original dataframe"
                 f" with generated important features: {self.feature_names_}"
             )
-            filtered_columns = [c for c in self.feature_names_ if c in df.columns]
-            self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
-            return df[filtered_columns], columns_renaming, generated_features, search_keys
+            df = df.rename(columns=columns_renaming)
+            generated_features = [columns_renaming.get(c, c) for c in generated_features]
+            search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
+            selecting_columns = self._selecting_input_and_generated_columns(
+                validated_Xy, generated_features, keep_input, trace_id
+            )
+            self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
+            if add_fit_system_record_id:
+                df = self._add_fit_system_record_id(
+                    df,
+                    search_keys,
+                    SYSTEM_RECORD_ID,
+                    TARGET,
+                    columns_renaming,
+                    self.id_columns,
+                    self.cv,
+                    self.model_task_type,
+                    self.logger,
+                    self.bundle,
+                )
+                selecting_columns.append(SYSTEM_RECORD_ID)
+            return df[selecting_columns], columns_renaming, generated_features, search_keys
 
         # Don't pass all features in backend on transform
         runtime_parameters = self._get_copy_of_runtime_parameters()
@@ -2831,29 +2866,12 @@ if response.status_code == 200:
             how="left",
         )
 
-        fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
-        new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
-
-        selected_generated_features = [
-            c for c in generated_features if not self.fit_select_features or c in self.feature_names_
-        ]
-        if keep_input is True:
-            selected_input_columns = [
-                c
-                for c in validated_Xy.columns
-                if not self.fit_select_features
-                or c in self.feature_names_
-                or c in new_columns_on_transform
-                or c in self.search_keys
-                or c in (self.id_columns or [])
-                or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
-            ]
-        else:
-            selected_input_columns = []
-
-        selecting_columns = selected_input_columns + selected_generated_features
+        selecting_columns = self._selecting_input_and_generated_columns(
+            validated_Xy, generated_features, keep_input, trace_id
+        )
         selecting_columns.extend(
-            c for c in result.columns
+            c
+            for c in result.columns
             if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
         )
         if add_fit_system_record_id:
@@ -2881,6 +2899,35 @@ if response.status_code == 200:
 
         return result, columns_renaming, generated_features, search_keys
 
+    def _selecting_input_and_generated_columns(
+        self,
+        validated_Xy: pd.DataFrame,
+        generated_features: list[str],
+        keep_input: bool,
+        trace_id: str,
+    ):
+        fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
+        new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
+
+        selected_generated_features = [
+            c for c in generated_features if not self.fit_select_features or c in self.feature_names_
+        ]
+        if keep_input is True:
+            selected_input_columns = [
+                c
+                for c in validated_Xy.columns
+                if not self.fit_select_features
+                or c in self.feature_names_
+                or c in new_columns_on_transform
+                or c in self.search_keys
+                or c in (self.id_columns or [])
+                or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
+            ]
+        else:
+            selected_input_columns = []
+
+        return selected_input_columns + selected_generated_features
+
     def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
         if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
             if search_id:
@@ -3708,6 +3755,23 @@ if response.status_code == 200:
             else:
                 raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))
 
+            if any(validated_eval_X.dtypes != X.dtypes):
+                x_types = X.dtypes
+                eval_types = validated_eval_X.dtypes
+                # Find columns with different types
+                diff_cols = [
+                    (col, x_types[col], eval_types[col]) for col in x_types.index if x_types[col] != eval_types[col]
+                ]
+                diff_col_names = [col for col, _, _ in diff_cols]
+                # print columns with different types
+                print("Columns with different types:")
+                for col, x_type, eval_type in diff_cols:
+                    print("-" * 50)
+                    print(f"Column: {col}")
+                    print(f"X type: {x_type}")
+                    print(f"Eval_set type: {eval_type}")
+                raise ValidationError(self.bundle.get("eval_x_and_x_diff_dtypes").format(diff_col_names))
+
             if _num_samples(validated_eval_X) != _num_samples(eval_y):
                 raise ValidationError(
                     self.bundle.get("x_and_y_diff_size_eval_set").format(
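
The new validation compares dtypes between X and each eval_set frame elementwise and reports the mismatching columns before raising. A sketch of the core comparison on hypothetical frames:

    import pandas as pd

    X = pd.DataFrame({"age": [30, 40], "city": ["a", "b"]})
    eval_X = pd.DataFrame({"age": [30.0, 40.0], "city": ["c", "d"]})  # age drifted to float64

    if any(eval_X.dtypes != X.dtypes):
        diff_cols = [c for c in X.dtypes.index if X.dtypes[c] != eval_X.dtypes[c]]
        print(diff_cols)  # ['age']
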
@@ -3782,9 +3846,7 @@ if response.status_code == 200:
         return Xy[X.columns].copy(), Xy[TARGET].copy()
 
     @staticmethod
-    def _sort_by_system_record_id(
-        X: pd.DataFrame, y: pd.Series, cv: CVType | None
-    ) -> tuple[pd.DataFrame, pd.Series]:
+    def _sort_by_system_record_id(X: pd.DataFrame, y: pd.Series, cv: CVType | None) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
             record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
             Xy = X.copy()
@@ -4420,7 +4482,8 @@ if response.status_code == 200:
 
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
-            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
+            # If all psi values are 0 or null, drop psi column
+            if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
                 self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
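
The old condition dropped the PSI column only when every value was null; the new one also drops it when all non-null values are 0.0. The difference on a toy series:

    import pandas as pd

    psi = pd.Series([0.0, None, 0.0])

    print(psi.isna().all())               # False -- the old check keeps the column
    print(psi.fillna(0.0).eq(0.0).all())  # True  -- the new check drops it
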

{upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/metrics.py
@@ -847,7 +847,7 @@ class CatBoostWrapper(EstimatorWrapper):
 
         feature_importance = {}
         for i, col in enumerate(x.columns):
-            feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+            feature_importance[col] = float(np.mean(np.abs(shap_values[:, i])))
 
         return feature_importance
 
@@ -922,6 +922,7 @@ class LightGBMWrapper(EstimatorWrapper):
             encoded = cat_encoder.transform(x_copy[self.cat_features]).astype(int)
         else:
             encoded = cat_encoder.transform(x_copy[self.cat_features]).astype("category")
+        x_copy = x_copy.drop(columns=self.cat_features, errors="ignore")
         x_copy[self.cat_features] = encoded
 
         shap_matrix = estimator.predict(
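
Dropping the categorical columns before reassigning them presumably avoids pandas setitem behavior in which writing into existing columns can coerce the new values back to the old dtype; recreating the columns keeps the encoder's dtype. A simplified single-column sketch of the pattern (the frame and values are hypothetical):

    import pandas as pd

    x_copy = pd.DataFrame({"color": ["red", "blue"]})  # object dtype
    encoded = x_copy["color"].astype("category")

    # Recreate the column instead of writing into the existing object column.
    x_copy = x_copy.drop(columns=["color"], errors="ignore")
    x_copy["color"] = encoded
    print(x_copy.dtypes)  # color    category
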
@@ -943,7 +944,7 @@ class LightGBMWrapper(EstimatorWrapper):
 
         feature_importance = {}
         for i, col in enumerate(x.columns):
-            feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
+            feature_importance[col] = float(np.mean(np.abs(shap_matrix[:, i])))
 
         return feature_importance
 
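
The diff does not state why the float() cast was added in both wrappers; a plausible reason is that np.mean returns a NumPy scalar, and np.float32 in particular is not JSON-serializable, while a plain Python float is:

    import json
    import numpy as np

    shap_col = np.array([0.2, -0.5, 0.1], dtype=np.float32)
    importance = np.mean(np.abs(shap_col))

    print(type(importance))                      # <class 'numpy.float32'>
    # json.dumps({"f": importance})              # would raise TypeError
    print(json.dumps({"f": float(importance)}))  # {"f": 0.266666...}
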

{upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/resource_bundle/strings.properties
@@ -123,6 +123,7 @@ unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of
 eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
+eval_x_and_x_diff_dtypes=The column types in eval_set are different from the column types in X: {}
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
 y_is_constant_eval_set=y in eval_set is a constant. Relevant feature search requires a non-constant y
 x_and_y_diff_size_eval_set=X and y in eval_set contain different number of rows: {}, {}

{upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/display_utils.py
@@ -269,19 +269,22 @@ def make_html_report(
         if search_keys is not None
         else ""
         }
-        {"<h3>All relevant features. Accuracy after enrichment</h3>" + make_table(metrics_df)
-        if metrics_df is not None
-        else ""
+        {
+            "<h3>All relevant features. Accuracy after enrichment</h3>" + make_table(metrics_df)
+            if metrics_df is not None
+            else ""
         }
-        {"<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
-        if len(relevant_datasources_df) > 0
-        else ""
+        {
+            "<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
+            if len(relevant_datasources_df) > 0
+            else ""
         }
         <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
         {make_table(relevant_features_df, wrap_long_string=25)}
-        {"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
-        if autofe_descriptions_df is not None
-        else ""
+        {
+            "<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
+            if autofe_descriptions_df is not None and len(autofe_descriptions_df) > 0
+            else ""
         }
         <p>To buy found data sources, please contact: <a href='mailto:sales@upgini.com'>sales@upgini.com</a></p>
         <p>Best regards, </br><b>Upgini Team</b></p>
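
This hunk is mostly a reflow, with one behavioral change: the AutoFE section now also requires a non-empty autofe_descriptions_df. Each optional report section is a conditional expression inside a replacement field of the triple-quoted f-string, and such expressions may span several lines. A minimal sketch of the technique with a stand-in variable:

    metrics_df = None  # stand-in for an optional DataFrame

    html = f"""
    <h1>Report</h1>
    {
        "<h3>Metrics</h3><table>...</table>"
        if metrics_df is not None
        else ""
    }
    """
    print(html)  # the metrics section is omitted when metrics_df is None
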

{upgini-1.2.117a1 → upgini-1.2.119}/src/upgini/utils/psi.py
@@ -82,9 +82,6 @@ def calculate_features_psi(
 ) -> dict[str, float]:
     empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
 
-    if not is_numeric_dtype(df[date_column]):
-        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
-
     # Filter out rows with missing dates
     df = df[df[date_column].notna()].copy()
 
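
This removal pairs with the features_enricher.py changes above: callers now convert the date column to day-floored epoch milliseconds before invoking the PSI calculation (see the @@ -1460,6 +1470,10 @@ hunk), so the in-place conversion here appears to have become redundant.
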
upgini-1.2.117a1/src/upgini/__about__.py
@@ -1 +0,0 @@
-__version__ = "1.2.117a1"