upgini 1.1.262a3250.post4-py3-none-any.whl → 1.1.280a3418.post2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of upgini might be problematic.

Files changed (49)
  1. upgini/__about__.py +1 -0
  2. upgini/ads.py +6 -2
  3. upgini/ads_management/ads_manager.py +4 -2
  4. upgini/autofe/all_operands.py +16 -4
  5. upgini/autofe/binary.py +2 -1
  6. upgini/autofe/date.py +74 -7
  7. upgini/autofe/feature.py +1 -1
  8. upgini/autofe/groupby.py +3 -1
  9. upgini/autofe/operand.py +4 -3
  10. upgini/autofe/unary.py +20 -1
  11. upgini/autofe/vector.py +2 -0
  12. upgini/data_source/data_source_publisher.py +14 -4
  13. upgini/dataset.py +8 -7
  14. upgini/errors.py +1 -1
  15. upgini/features_enricher.py +156 -63
  16. upgini/http.py +11 -10
  17. upgini/mdc/__init__.py +1 -3
  18. upgini/mdc/context.py +4 -6
  19. upgini/metadata.py +3 -0
  20. upgini/metrics.py +160 -96
  21. upgini/normalizer/phone_normalizer.py +2 -2
  22. upgini/resource_bundle/__init__.py +5 -5
  23. upgini/resource_bundle/strings.properties +9 -4
  24. upgini/sampler/base.py +1 -4
  25. upgini/sampler/random_under_sampler.py +2 -5
  26. upgini/search_task.py +4 -4
  27. upgini/spinner.py +1 -1
  28. upgini/utils/__init__.py +3 -2
  29. upgini/utils/base_search_key_detector.py +2 -2
  30. upgini/utils/blocked_time_series.py +4 -2
  31. upgini/utils/country_utils.py +2 -2
  32. upgini/utils/custom_loss_utils.py +3 -2
  33. upgini/utils/cv_utils.py +2 -2
  34. upgini/utils/datetime_utils.py +75 -18
  35. upgini/utils/deduplicate_utils.py +61 -18
  36. upgini/utils/email_utils.py +3 -3
  37. upgini/utils/fallback_progress_bar.py +1 -1
  38. upgini/utils/features_validator.py +2 -1
  39. upgini/utils/progress_bar.py +1 -1
  40. upgini/utils/sklearn_ext.py +15 -15
  41. upgini/utils/target_utils.py +21 -7
  42. upgini/utils/track_info.py +27 -15
  43. upgini/version_validator.py +2 -2
  44. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
  45. upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
  46. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
  47. upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
  48. upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
  49. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0
upgini/features_enricher.py CHANGED
@@ -1,4 +1,5 @@
 import dataclasses
+import datetime
 import gc
 import hashlib
 import itertools
@@ -20,6 +21,7 @@ from pandas.api.types import (
     is_bool,
     is_datetime64_any_dtype,
     is_numeric_dtype,
+    is_object_dtype,
     is_period_dtype,
     is_string_dtype,
 )
@@ -70,6 +72,7 @@ from upgini.utils.datetime_utils import (
     DateTimeSearchKeyConverter,
     is_blocked_time_series,
     is_time_series,
+    validate_dates_distribution,
 )
 from upgini.utils.deduplicate_utils import (
     clean_full_duplicates,
@@ -93,7 +96,7 @@ try:
 except Exception:
     from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar

-from upgini.utils.target_utils import define_task
+from upgini.utils.target_utils import calculate_psi, define_task
 from upgini.utils.warning_counter import WarningCounter
 from upgini.version_validator import validate_version

@@ -145,6 +148,7 @@ class FeaturesEnricher(TransformerMixin):
     """

     TARGET_NAME = "target"
+    CURRENT_DATE = "current_date"
     RANDOM_STATE = 42
     CALCULATE_METRICS_THRESHOLD = 50_000_000
     CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -206,6 +210,7 @@ class FeaturesEnricher(TransformerMixin):
         client_ip: Optional[str] = None,
         client_visitorid: Optional[str] = None,
         custom_bundle_config: Optional[str] = None,
+        add_date_if_missing: bool = True,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
@@ -316,6 +321,7 @@ class FeaturesEnricher(TransformerMixin):
         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
         self.baseline_score_column = baseline_score_column
+        self.add_date_if_missing = add_date_if_missing

     def _get_api_key(self):
         return self._api_key
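The new `add_date_if_missing` flag (default `True`) lets callers opt out of the current-date search key that fit and transform now inject when the input has no date column. A minimal usage sketch, assuming the public `FeaturesEnricher`/`SearchKey` API is otherwise unchanged (the phone data below is made up):

```python
import pandas as pd
from upgini import FeaturesEnricher, SearchKey

# Hypothetical training data: a phone search key and no date column
X = pd.DataFrame({"phone": ["+14155550123", "+14155550124"]})
y = pd.Series([0, 1])

enricher = FeaturesEnricher(
    search_keys={"phone": SearchKey.PHONE},
    add_date_if_missing=False,  # new flag; the default True injects a current_date key
)
# enricher.fit(X, y) would then proceed without the synthetic date key
```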
@@ -419,11 +425,14 @@ class FeaturesEnricher(TransformerMixin):

         self.__validate_search_keys(self.search_keys, self.search_id)

+        # Validate client estimator params
+        self._get_client_cat_features(estimator, X, self.search_keys)
+
         try:
             self.X = X
             self.y = y
             self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
-            self.dump_input(trace_id, X, y, eval_set)
+            self.dump_input(trace_id, X, y, self.eval_set)
             self.__inner_fit(
                 trace_id,
                 X,
@@ -562,7 +571,7 @@ class FeaturesEnricher(TransformerMixin):
             self.X = X
             self.y = y
             self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
-            self.dump_input(trace_id, X, y, eval_set)
+            self.dump_input(trace_id, X, y, self.eval_set)

             if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
                 raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
@@ -812,6 +821,7 @@ class FeaturesEnricher(TransformerMixin):
         trace_id = trace_id or str(uuid.uuid4())
         start_time = time.time()
         with MDC(trace_id=trace_id):
+            self.logger.info("Start calculate metrics")
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
                 self.logger.warning(msg)
@@ -822,12 +832,16 @@
                 print(msg)

             self.__validate_search_keys(self.search_keys, self.search_id)
+            effective_X = X if X is not None else self.X
+            effective_y = y if y is not None else self.y
+            effective_eval_set = eval_set if eval_set is not None else self.eval_set
+            effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)

             try:
                 self.__log_debug_information(
-                    X if X is not None else self.X,
-                    y if y is not None else self.y,
-                    eval_set if eval_set is not None else self.eval_set,
+                    effective_X,
+                    effective_y,
+                    effective_eval_set,
                     exclude_features_sources=exclude_features_sources,
                     cv=cv if cv is not None else self.cv,
                     importance_threshold=importance_threshold,
@@ -841,17 +855,14 @@
                     self._search_task is None
                     or self._search_task.provider_metadata_v2 is None
                     or len(self._search_task.provider_metadata_v2) == 0
-                    or (self.X is None and X is None)
-                    or (self.y is None and y is None)
+                    or effective_X is None
+                    or effective_y is None
                 ):
                     raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

                 if X is not None and y is None:
                     raise ValidationError("X passed without y")

-                effective_X = X if X is not None else self.X
-                effective_eval_set = eval_set if eval_set is not None else self.eval_set
-
                 validate_scoring_argument(scoring)

                 self._validate_baseline_score(effective_X, effective_eval_set)
@@ -862,29 +873,15 @@
                     self.__display_support_link(msg)
                     return None

-                cat_features = None
-                search_keys_for_metrics = []
-                if (
-                    estimator is not None
-                    and hasattr(estimator, "get_param")
-                    and estimator.get_param("cat_features") is not None
-                ):
-                    cat_features = estimator.get_param("cat_features")
-                    if len(cat_features) > 0 and isinstance(cat_features[0], int):
-                        effectiveX = X or self.X
-                        cat_features = [effectiveX.columns[i] for i in cat_features]
-                    for cat_feature in cat_features:
-                        if cat_feature in self.search_keys:
-                            if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
-                                search_keys_for_metrics.append(cat_feature)
-                            else:
-                                raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
+                cat_features, search_keys_for_metrics = self._get_client_cat_features(
+                    estimator, effective_X, self.search_keys
+                )

                 prepared_data = self._prepare_data_for_metrics(
                     trace_id=trace_id,
-                    X=X,
-                    y=y,
-                    eval_set=eval_set,
+                    X=effective_X,
+                    y=effective_y,
+                    eval_set=effective_eval_set,
                     exclude_features_sources=exclude_features_sources,
                     importance_threshold=importance_threshold,
                     max_features=max_features,
@@ -893,6 +890,7 @@
                     search_keys_for_metrics=search_keys_for_metrics,
                     progress_bar=progress_bar,
                     progress_callback=progress_callback,
+                    cat_features=cat_features,
                 )
                 if prepared_data is None:
                     return None
@@ -994,8 +992,6 @@
                 enriched_metric = None
                 uplift = None

-                effective_X = X if X is not None else self.X
-                effective_y = y if y is not None else self.y
                 train_metrics = {
                     self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
                         "quality_metrics_train_segment"
@@ -1256,6 +1252,7 @@
             ).get_cv_and_groups(X)
         else:
             from sklearn import __version__ as sklearn_version
+
             try:
                 from sklearn.model_selection._split import GroupsConsumerMixin

@@ -1269,6 +1266,29 @@

         return _cv, groups

+    def _get_client_cat_features(
+        self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
+    ) -> Optional[List[str]]:
+        cat_features = None
+        search_keys_for_metrics = []
+        if (
+            estimator is not None
+            and hasattr(estimator, "get_param")
+            and estimator.get_param("cat_features") is not None
+        ):
+            cat_features = estimator.get_param("cat_features")
+            if len(cat_features) > 0:
+                if all([isinstance(f, int) for f in cat_features]):
+                    cat_features = [X.columns[i] for i in cat_features]
+                self.logger.info(f"Collected categorical features {cat_features} from user estimator")
+                for cat_feature in cat_features:
+                    if cat_feature in search_keys:
+                        if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+                            search_keys_for_metrics.append(cat_feature)
+                        else:
+                            raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
+        return cat_features, search_keys_for_metrics
+
     def _prepare_data_for_metrics(
         self,
         trace_id: str,
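The inline block from `calculate_metrics` is now the reusable `_get_client_cat_features` helper, which `fit` also calls up front to fail fast on bad estimator params. It relies on a CatBoost-style `get_param("cat_features")`, and note that the declared return type (`Optional[List[str]]`) understates the actual `(cat_features, search_keys_for_metrics)` tuple it returns. A standalone sketch of the index-to-name mapping it performs (the columns below are hypothetical example data):

```python
import pandas as pd
from catboost import CatBoostClassifier

X = pd.DataFrame({"country": ["US"], "city": ["NYC"], "amount": [10.5]})
estimator = CatBoostClassifier(cat_features=[0, 1])

cat_features = estimator.get_param("cat_features")  # [0, 1]
if cat_features and all(isinstance(f, int) for f in cat_features):
    # Positional indices are resolved against the client DataFrame's columns
    cat_features = [X.columns[i] for i in cat_features]  # ["country", "city"]
```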
@@ -1283,6 +1303,7 @@
         search_keys_for_metrics: Optional[List[str]] = None,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+        cat_features: Optional[List[str]] = None,
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
@@ -1340,9 +1361,8 @@

         # Detect and drop high cardinality columns in train
         columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
-        columns_with_high_cardinality = [
-            c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
-        ]
+        non_excluding_columns = (self.generate_features or []) + (cat_features or [])
+        columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
         if len(columns_with_high_cardinality) > 0:
             self.logger.warning(
                 f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
@@ -1684,6 +1704,9 @@
         df = validated_X.copy()

         df[TARGET] = validated_y
+
+        df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
+
         num_samples = _num_samples(df)
         if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
             self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
@@ -1801,10 +1824,11 @@
         else:
             features_section = ""

-        api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
+        search_id = self._search_task.search_task_id
+        api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
 -H 'Authorization: {self.api_key}' \\
 -H 'Content-Type: application/json' \\
--d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
+-d '{{"search_keys": {keys}{features_section}}}'"""
         return api_example

     def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
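The generated snippet now targets `search.upgini.com` instead of the Azure host, and `search_id` moves from the JSON body to a query parameter. Rendered with placeholder values (the key, id, and search-key payload below are illustrative, not real), a request would look like:

```
curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id=<SEARCH_ID>' \
  -H 'Authorization: <API_KEY>' \
  -H 'Content-Type: application/json' \
  -d '{"search_keys": {"phone": "+14155550123"}}'
```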
@@ -1899,6 +1923,8 @@
             generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
+            if self.add_date_if_missing:
+                df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
         email_column = self._get_email_column(search_keys)
         hem_column = self._get_hem_column(search_keys)
         email_converted_to_hem = False
@@ -1918,6 +1944,7 @@

         meaning_types = {col: key.value for col, key in search_keys.items()}
         non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
+
         if email_converted_to_hem:
             non_keys_columns.append(email_column)

@@ -1939,6 +1966,7 @@
         if add_fit_system_record_id:
             df = self.__add_fit_system_record_id(df, dict(), search_keys)
             df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
+            non_keys_columns.append(SORT_ID)

         columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))

@@ -2215,14 +2243,13 @@
         self.fit_search_keys = self.search_keys.copy()
         self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)

-        has_date = self._get_date_column(self.fit_search_keys) is not None
+        validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
+
+        maybe_date_column = self._get_date_column(self.fit_search_keys)
+        has_date = maybe_date_column is not None
         model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
         self._validate_binary_observations(validated_y, model_task_type)

-        df = self.__handle_index_search_keys(df, self.fit_search_keys)
-
-        df = self.__correct_target(df)
-
         self.runtime_parameters = get_runtime_params_custom_loss(
             self.loss, model_task_type, self.runtime_parameters, self.logger
         )
@@ -2234,6 +2261,13 @@
                 eval_df[EVAL_SET_INDEX] = idx + 1
                 df = pd.concat([df, eval_df])

+        df = self.__correct_target(df)
+
+        df = self.__handle_index_search_keys(df, self.fit_search_keys)
+
+        if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
+            self._validate_PSI(df.sort_values(by=maybe_date_column))
+
         if DEFAULT_INDEX in df.columns:
             msg = self.bundle.get("unsupported_index_column")
             self.logger.info(msg)
@@ -2260,6 +2294,8 @@
             self.fit_generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
+            if self.add_date_if_missing:
+                df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
         email_column = self._get_email_column(self.fit_search_keys)
         hem_column = self._get_hem_column(self.fit_search_keys)
         email_converted_to_hem = False
@@ -2512,7 +2548,7 @@
             validated_X = X.copy()
         elif isinstance(X, pd.Series):
             validated_X = X.to_frame()
-        elif isinstance(X, np.ndarray) or isinstance(X, list):
+        elif isinstance(X, (list, np.ndarray)):
             validated_X = pd.DataFrame(X)
             renaming = {c: str(c) for c in validated_X.columns}
             validated_X = validated_X.rename(columns=renaming)
@@ -2601,7 +2637,7 @@
             validated_eval_X = eval_X.copy()
         elif isinstance(eval_X, pd.Series):
             validated_eval_X = eval_X.to_frame()
-        elif isinstance(eval_X, np.ndarray) or isinstance(eval_X, list):
+        elif isinstance(eval_X, (list, np.ndarray)):
             validated_eval_X = pd.DataFrame(eval_X)
             renaming = {c: str(c) for c in validated_eval_X.columns}
             validated_eval_X = validated_eval_X.rename(columns=renaming)
@@ -2783,7 +2819,7 @@
         )

         def sample(df):
-            if isinstance(df, pd.Series) or isinstance(df, pd.DataFrame):
+            if isinstance(df, (pd.DataFrame, pd.Series)):
                 return df.head(10)
             else:
                 return df[:10]
@@ -2808,6 +2844,7 @@

         maybe_date_col = self._get_date_column(self.search_keys)
         if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
+            # TODO cast date column to single dtype
             min_date = X[maybe_date_col].min()
             max_date = X[maybe_date_col].max()
             self.logger.info(f"Dates interval is ({min_date}, {max_date})")
@@ -2839,6 +2876,25 @@
             if t in [SearchKey.DATE, SearchKey.DATETIME]:
                 return col

+    @staticmethod
+    def _add_current_date_as_key(
+        df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
+    ) -> pd.DataFrame:
+        if (
+            set(search_keys.values()) == {SearchKey.PHONE}
+            or set(search_keys.values()) == {SearchKey.EMAIL}
+            or set(search_keys.values()) == {SearchKey.HEM}
+            or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
+        ):
+            msg = bundle.get("current_date_added")
+            print(msg)
+            logger.warning(msg)
+            df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
+            search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
+            converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
+            df = converter.convert(df)
+        return df
+
     @staticmethod
     def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
         return [
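`_add_current_date_as_key` only fires when the key set is exactly {PHONE}, {EMAIL}, {HEM}, or {COUNTRY, POSTAL_CODE}, and it mutates the caller's `search_keys` dict as well as the frame. A reduced sketch of the effect, assuming `DateTimeSearchKeyConverter` then normalizes the column like any other date key (the stand-in strings below replace the real `SearchKey` enum values):

```python
import datetime
import pandas as pd

df = pd.DataFrame({"phone": ["+14155550123"]})
search_keys = {"phone": "PHONE"}  # stand-in for {"phone": SearchKey.PHONE}

df["current_date"] = datetime.date.today()  # same constant date on every row
search_keys["current_date"] = "DATE"        # registered as an extra DATE key
```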
@@ -2877,26 +2933,33 @@

        # order by date and idempotent order by other keys
        if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
+            sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
            if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
                date_column = DateTimeSearchKeyConverter.DATETIME_COL
+                sort_exclude_columns.append(self._get_date_column(search_keys))
            else:
                date_column = self._get_date_column(search_keys)
            sort_columns = [date_column] if date_column is not None else []

-            other_search_keys = sorted(
+            other_columns = sorted(
                [
-                    sk
-                    for sk, key_type in search_keys.items()
-                    if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
-                    and sk in df.columns
-                    and df[sk].nunique() > 1  # don't use constant keys for hash
+                    c
+                    for c in df.columns
+                    if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
                ]
+                # [
+                #     sk
+                #     for sk, key_type in search_keys.items()
+                #     if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
+                #     and sk in df.columns
+                #     and df[sk].nunique() > 1  # don't use constant keys for hash
+                # ]
            )

            search_keys_hash = "search_keys_hash"
-            if len(other_search_keys) > 0:
+            if len(other_columns) > 0:
                sort_columns.append(search_keys_hash)
-                df[search_keys_hash] = pd.util.hash_pandas_object(df[sorted(other_search_keys)], index=False)
+                df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)

            df = df.sort_values(by=sort_columns)

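The idempotent ordering now hashes every non-constant column that is not explicitly excluded, rather than only the non-date search keys (the old comprehension is kept commented out for reference). The core trick, shown standalone with made-up data:

```python
import pandas as pd

df = pd.DataFrame({"date": ["2024-01-02", "2024-01-01", "2024-01-01"],
                   "phone": ["b", "c", "a"]})
# Stable per-row hash over the tie-breaking columns, independent of the index
df["search_keys_hash"] = pd.util.hash_pandas_object(df[["phone"]], index=False)
df = df.sort_values(by=["date", "search_keys_hash"])
```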
@@ -2920,7 +2983,7 @@

     def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
         target = df[self.TARGET_NAME]
-        if is_string_dtype(target):
+        if is_string_dtype(target) or is_object_dtype(target):
             maybe_numeric_target = pd.to_numeric(target, errors="coerce")
             # If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
             if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
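With `is_object_dtype` added to the check, object-typed targets (common after reading mixed CSV columns) now get the same numeric coercion as string dtypes. The rule, sketched on toy data:

```python
import pandas as pd

target = pd.Series(["1", "2", "oops", "4"], dtype="object")
maybe_numeric = pd.to_numeric(target, errors="coerce")  # 1.0, 2.0, NaN, 4.0
if maybe_numeric.isna().sum() <= len(target) * 0.05:
    target = maybe_numeric  # not taken here: 25% of values failed to parse
```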
@@ -3185,22 +3248,23 @@
             return None
         features_meta = self._search_task.get_all_features_metadata_v2()

-        def get_feature_by_display_index(idx, op):
+        def get_feature_by_name(name: str):
             for m in features_meta:
-                if m.name.endswith(f"_{op}_{idx}"):
+                if m.name == name:
                     return m

         descriptions = []
         for m in autofe_meta:
             autofe_feature = Feature.from_formula(m.formula)
+            orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
+            autofe_feature.rename_columns(orig_to_hashed)
+            autofe_feature.set_display_index(m.display_index)
             if autofe_feature.op.is_vector:
                 continue

             description = dict()

-            feature_meta = get_feature_by_display_index(
-                m.display_index, autofe_feature.op.alias or autofe_feature.op.name
-            )
+            feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
             if feature_meta is None:
                 self.logger.warning(f"Feature meta for display index {m.display_index} not found")
                 continue
@@ -3321,7 +3385,8 @@
                 valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
             else:
                 if x[column_name].isnull().all() or (
-                    is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
+                    (is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
+                    and (x[column_name].astype("string").str.strip() == "").all()
                 ):
                     raise ValidationError(self.bundle.get("empty_search_key").format(column_name))

@@ -3547,6 +3612,34 @@
         self.logger.warning(msg)
         print(msg)

+    def _validate_PSI(self, df: pd.DataFrame):
+        if EVAL_SET_INDEX in df.columns:
+            train = df.query(f"{EVAL_SET_INDEX} == 0")
+            eval1 = df.query(f"{EVAL_SET_INDEX} == 1")
+        else:
+            train = df
+            eval1 = None
+
+        # 1. Check train PSI
+        half_train = round(len(train) / 2)
+        part1 = train[:half_train]
+        part2 = train[half_train:]
+        train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
+        if train_psi > 0.2:
+            self.warning_counter.increment()
+            msg = self.bundle.get("train_unstable_target").format(train_psi)
+            print(msg)
+            self.logger.warning(msg)
+
+        # 2. Check train-test PSI
+        if eval1 is not None:
+            train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
+            if train_test_psi > 0.2:
+                self.warning_counter.increment()
+                msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
+                print(msg)
+                self.logger.warning(msg)
+
     def _dump_python_libs(self):
         try:
             from pip._internal.operations.freeze import freeze
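`_validate_PSI` compares the target distribution between the two halves of the date-sorted train set, and between train and the first eval set, warning above the conventional 0.2 threshold. The diff does not show the binning inside `upgini.utils.target_utils.calculate_psi`, so the following is a sketch of a common decile-based PSI, not the package's exact implementation:

```python
import numpy as np

def psi_sketch(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    # Bin edges from the expected sample's quantiles; extremes catch outliers
    edges = np.quantile(expected, np.linspace(0, 1, bins + 1))
    edges[0], edges[-1] = -np.inf, np.inf
    e = np.histogram(expected, edges)[0] / len(expected)
    a = np.histogram(actual, edges)[0] / len(actual)
    e, a = np.clip(e, 1e-6, None), np.clip(a, 1e-6, None)  # avoid log(0)
    return float(np.sum((a - e) * np.log(a / e)))
```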
@@ -3600,7 +3693,7 @@
         def sample(inp, sample_index):
             if _num_samples(inp) <= 1000:
                 return inp
-            if isinstance(inp, pd.DataFrame) or isinstance(inp, pd.Series):
+            if isinstance(inp, (pd.DataFrame, pd.Series)):
                 return inp.sample(n=1000, random_state=random_state)
             if isinstance(inp, np.ndarray):
                 return inp[sample_index]
@@ -3613,7 +3706,7 @@
         if y is not None:
             with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
                 pickle.dump(sample(y, xy_sample_index), y_file)
-        if eval_set is not None:
+        if eval_set:
             eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
             with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
                 pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
upgini/http.py CHANGED
@@ -22,6 +22,7 @@ from pydantic import BaseModel
 from pythonjsonlogger import jsonlogger
 from requests.exceptions import RequestException

+from upgini.__about__ import __version__
 from upgini.errors import (
     HttpError,
     UnauthorizedError,
@@ -38,17 +39,17 @@ from upgini.metadata import (
 from upgini.resource_bundle import bundle
 from upgini.utils.track_info import get_track_metrics

-try:
-    from importlib_metadata import version  # type: ignore
+# try:
+#     from importlib.metadata import version  # type: ignore

-    __version__ = version("upgini")
-except ImportError:
-    try:
-        from importlib.metadata import version  # type: ignore
+#     __version__ = version("upgini")
+# except ImportError:
+#     try:
+#         from importlib_metadata import version  # type: ignore

-        __version__ = version("upgini")
-    except ImportError:
-        __version__ = "Upgini wasn't installed"
+#         __version__ = version("upgini")
+#     except ImportError:
+#         __version__ = "Upgini wasn't installed"

 UPGINI_URL: str = "UPGINI_URL"
 UPGINI_API_KEY: str = "UPGINI_API_KEY"
@@ -925,7 +926,7 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:
     return api_token is None or api_token == "" or api_token == DEMO_API_KEY


-@lru_cache()
+@lru_cache
 def _get_rest_client(
     backend_url: str, api_token: str, client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
 ) -> _RestClient:
upgini/mdc/__init__.py CHANGED
@@ -1,15 +1,13 @@
-# -*- coding: utf-8 -*-
 """
 .. module: mdc
 .. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
 """
-from __future__ import absolute_import, division, print_function, unicode_literals

 import logging

-from upgini.mdc.context import new_log_context, get_mdc_fields
 from pythonjsonlogger import jsonlogger

+from upgini.mdc.context import get_mdc_fields, new_log_context

 MDContext = new_log_context
 MDC = new_log_context
upgini/mdc/context.py CHANGED
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 .. module: TODO
     :platform: TODO
@@ -7,12 +6,11 @@
 .. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
 """

-import time
-import uuid
+import collections
 import logging
 import threading
-import collections
-
+import time
+import uuid
 from contextlib import contextmanager

 LOGGER = logging.getLogger(__name__)
@@ -32,7 +30,7 @@ def get_mdc_fields():

 @contextmanager
 def new_log_context(**kwargs):
-    context_id = "mdc-{thread}-{context}".format(thread=threading.current_thread().ident, context=uuid.uuid4())
+    context_id = f"mdc-{threading.current_thread().ident}-{uuid.uuid4()}"

     LOGGER.debug("creating context %s", context_id)

upgini/metadata.py CHANGED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from enum import Enum
 from typing import Dict, List, Optional, Set

@@ -201,6 +203,7 @@ class FileMetadata(BaseModel):
         for c in self.columns:
             if c.name == name:
                 return c
+        return None

     def search_types(self) -> Set[SearchKey]:
         search_keys = set()