upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.274a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  from typing import Dict
2
- from upgini.autofe.date import DateDiff, DateDiffFuture
2
+ from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
3
3
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
4
4
  from upgini.autofe.operand import Operand
5
5
  from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -37,7 +37,17 @@ ALL_OPERANDS: Dict[str, Operand] = {
37
37
  Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
38
38
  Sim(),
39
39
  DateDiff(),
40
- DateDiffFuture(),
40
+ DateDiffType2(),
41
+ DateListDiff(aggregation="min"),
42
+ DateListDiff(aggregation="max"),
43
+ DateListDiff(aggregation="mean"),
44
+ DateListDiff(aggregation="nunique"),
45
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
46
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
47
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
48
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
49
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
50
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
41
51
  ]
42
52
  }
43
53
 
upgini/autofe/date.py CHANGED
@@ -1,11 +1,12 @@
1
- from typing import Optional, Union
1
+ from typing import Any, Optional, Union
2
2
  import numpy as np
3
3
  import pandas as pd
4
+ from pydantic import BaseModel
4
5
 
5
6
  from upgini.autofe.operand import PandasOperand
6
7
 
7
8
 
8
- class DateDiffMixin:
9
+ class DateDiffMixin(BaseModel):
9
10
  diff_unit: str = "D"
10
11
  left_unit: Optional[str] = None
11
12
  right_unit: Optional[str] = None
@@ -34,18 +35,77 @@ class DateDiff(PandasOperand, DateDiffMixin):
34
35
  return x
35
36
 
36
37
 
37
- class DateDiffFuture(PandasOperand, DateDiffMixin):
38
- name = "date_diff_future"
38
+ class DateDiffType2(PandasOperand, DateDiffMixin):
39
+ name = "date_diff_type2"
39
40
  is_binary = True
40
41
  has_symmetry_importance = True
41
- is_vectorizable = False
42
42
 
43
43
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
44
44
  left = self._convert_to_date(left, self.left_unit)
45
45
  right = self._convert_to_date(right, self.right_unit)
46
- future = pd.to_datetime(dict(day=right.dt.day, month=right.dt.month, year=left.dt.year))
46
+ future = right + (left.dt.year - right.dt.year).apply(
47
+ lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
48
+ )
49
+ future = pd.to_datetime(future)
47
50
  before = future[future < left]
48
- future[future < left] = pd.to_datetime(dict(day=before.dt.day, month=before.dt.month, year=before.dt.year + 1))
51
+ future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
49
52
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
50
53
 
51
54
  return diff
55
+
56
+
57
def _count_unique(values):
    """Return the number of distinct values in ``values``."""
    return len(np.unique(values))


# Aggregations that are looked up by name when numpy has no attribute of that name.
# Each entry maps the aggregation name to ``(callable, default)``; the default is the
# value DateListDiff._agg returns when there is nothing to aggregate.
_ext_aggregations = {
    "nunique": (_count_unique, 0),
    "count": (len, 0),
}
58
+
59
+
60
class DateListDiff(PandasOperand, DateDiffMixin):
    """Binary operand: aggregate the differences between a date and a list of dates.

    For every row the left value is converted to a date, each element of the right
    value is converted to an array of dates, and the positive differences
    ``left - right`` (expressed in ``diff_unit``) are reduced with ``aggregation``.
    """

    is_binary = True
    has_symmetry_importance = True
    aggregation: str

    def __init__(self, **data: Any) -> None:
        # Derive the operand name from the aggregation unless the caller supplied one.
        data.setdefault("name", f"date_diff_{data.get('aggregation')}")
        super().__init__(**data)

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Return one aggregated date difference per row of ``left`` / ``right``."""
        left_dates = self._convert_to_date(left, self.left_unit)
        right_dates = right.apply(
            lambda dates: pd.arrays.DatetimeArray(self._convert_to_date(dates, self.right_unit))
        )

        deltas = pd.Series(left_dates - right_dates.values)
        return deltas.apply(lambda delta: self._agg(self._diff(delta)))

    def _diff(self, x):
        """Scale timedeltas to ``diff_unit`` and keep only strictly positive values."""
        scaled = x / np.timedelta64(1, self.diff_unit)
        return scaled[scaled > 0]

    def _agg(self, x):
        """Reduce ``x`` with the configured aggregation.

        The aggregation is resolved as a numpy attribute first and then in
        ``_ext_aggregations``. Returns the aggregation's default (NaN for numpy
        functions) when ``x`` is empty.

        Raises:
            ValueError: if the aggregation name does not resolve to a callable.
        """
        agg_name = self.aggregation
        method = getattr(np, agg_name, None)
        default = np.nan
        if method is None and agg_name in _ext_aggregations:
            method, default = _ext_aggregations[agg_name]
        elif not callable(method):
            raise ValueError(f"Unsupported aggregation: {self.aggregation}")

        if len(x) == 0:
            return default
        return method(x)
89
+
90
+
91
class DateListDiffBounded(DateListDiff):
    """DateListDiff that aggregates only differences inside ``[lower_bound, upper_bound)``.

    A bound of ``None`` means "unbounded" on that side. The lower bound is inclusive
    and the upper bound is exclusive, both expressed in ``diff_unit``.
    """

    lower_bound: Optional[int]
    upper_bound: Optional[int]

    def __init__(self, **data: Any) -> None:
        # Build a default name such as "date_diff_Y_0_18_count" unless one was given.
        if "name" not in data:
            lower_bound = data.get("lower_bound")
            upper_bound = data.get("upper_bound")
            components = [
                "date_diff",
                # "D" mirrors the declared default of DateDiffMixin.diff_unit, so an
                # omitted diff_unit no longer breaks the join below with a TypeError.
                str(data.get("diff_unit", "D")),
                str(lower_bound if lower_bound is not None else "minusinf"),
                str(upper_bound if upper_bound is not None else "plusinf"),
                # str() keeps the join safe when aggregation is missing; the model
                # validation in super().__init__ then reports the missing field.
                str(data.get("aggregation")),
            ]
            data["name"] = "_".join(components)
        super().__init__(**data)

    def _agg(self, x):
        """Drop values outside ``[lower_bound, upper_bound)`` and aggregate the rest."""
        # Compare against None explicitly: a bound of 0 is a real bound, not "unbounded".
        lower = -np.inf if self.lower_bound is None else self.lower_bound
        upper = np.inf if self.upper_bound is None else self.upper_bound
        x = x[(x >= lower) & (x < upper)]
        return super()._agg(x)
@@ -48,6 +48,7 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
+ exclude_from_autofe_generation: Optional[List[str]],
51
52
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
52
53
  sort_column: Optional[str] = None,
53
54
  date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
57
58
  join_date_abs_limit_days: Optional[int] = None,
58
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
59
60
  data_table_id_to_replace: Optional[str] = None,
60
- exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
@@ -72,8 +72,8 @@ class DataSourcePublisher:
72
72
  )
73
73
  if search_keys is None or len(search_keys) == 0:
74
74
  raise ValidationError("Empty search keys")
75
- if SearchKey.DATE in search_keys.values() and date_format is None:
76
- raise ValidationError("date_format is required for DATE search key")
75
+ # if SearchKey.DATE in search_keys.values() and date_format is None:
76
+ # raise ValidationError("date_format is required for DATE search key")
77
77
  if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
78
78
  raise ValidationError(
79
79
  f"Invalid update frequency: {update_frequency}. "
@@ -85,11 +85,19 @@ class DataSourcePublisher:
85
85
  or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
86
86
  ) and sort_column is None:
87
87
  raise ValidationError("Sort column is required for passed search keys")
88
+ if (
89
+ set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
90
+ and snapshot_frequency_days is None
91
+ and join_date_abs_limit_days is None
92
+ ):
93
+ raise ValidationError(
94
+ "With MSISDN and DATE keys one of the snapshot_frequency_days or"
95
+ " join_date_abs_limit_days parameters is required"
96
+ )
88
97
 
89
98
  request = {
90
99
  "dataTableUri": data_table_uri,
91
100
  "searchKeys": {k: v.value.value for k, v in search_keys.items()},
92
- "dateFormat": date_format,
93
101
  "excludeColumns": exclude_columns,
94
102
  "hashFeatureNames": str(hash_feature_names).lower(),
95
103
  "snapshotFrequencyDays": snapshot_frequency_days,
@@ -98,6 +106,8 @@ class DataSourcePublisher:
98
106
  "featuresForEmbeddings": features_for_embeddings,
99
107
  "forceGeneration": str(_force_generation).lower(),
100
108
  }
109
+ if date_format is not None:
110
+ request["dateFormat"] = date_format
101
111
  if secondary_search_keys is not None:
102
112
  request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
103
113
  if sort_column is not None:
upgini/dataset.py CHANGED
@@ -60,7 +60,7 @@ class Dataset: # (pd.DataFrame):
60
60
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
61
61
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
62
62
  MIN_SAMPLE_THRESHOLD = 5_000
63
- IMBALANCE_THESHOLD = 0.4
63
+ IMBALANCE_THESHOLD = 0.6
64
64
  BINARY_BOOTSTRAP_LOOPS = 5
65
65
  MULTICLASS_BOOTSTRAP_LOOPS = 2
66
66
  MIN_TARGET_CLASS_ROWS = 100
@@ -1,4 +1,5 @@
1
1
  import dataclasses
2
+ import datetime
2
3
  import gc
3
4
  import hashlib
4
5
  import itertools
@@ -70,6 +71,7 @@ from upgini.utils.datetime_utils import (
70
71
  DateTimeSearchKeyConverter,
71
72
  is_blocked_time_series,
72
73
  is_time_series,
74
+ validate_dates_distribution,
73
75
  )
74
76
  from upgini.utils.deduplicate_utils import (
75
77
  clean_full_duplicates,
@@ -93,7 +95,7 @@ try:
93
95
  except Exception:
94
96
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
95
97
 
96
- from upgini.utils.target_utils import define_task
98
+ from upgini.utils.target_utils import calculate_psi, define_task
97
99
  from upgini.utils.warning_counter import WarningCounter
98
100
  from upgini.version_validator import validate_version
99
101
 
@@ -145,6 +147,7 @@ class FeaturesEnricher(TransformerMixin):
145
147
  """
146
148
 
147
149
  TARGET_NAME = "target"
150
+ CURRENT_DATE = "current_date"
148
151
  RANDOM_STATE = 42
149
152
  CALCULATE_METRICS_THRESHOLD = 50_000_000
150
153
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -206,6 +209,7 @@ class FeaturesEnricher(TransformerMixin):
206
209
  client_ip: Optional[str] = None,
207
210
  client_visitorid: Optional[str] = None,
208
211
  custom_bundle_config: Optional[str] = None,
212
+ add_date_if_missing: bool = True,
209
213
  **kwargs,
210
214
  ):
211
215
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -316,6 +320,7 @@ class FeaturesEnricher(TransformerMixin):
316
320
  self.raise_validation_error = raise_validation_error
317
321
  self.exclude_columns = exclude_columns
318
322
  self.baseline_score_column = baseline_score_column
323
+ self.add_date_if_missing = add_date_if_missing
319
324
 
320
325
  def _get_api_key(self):
321
326
  return self._api_key
@@ -423,7 +428,7 @@ class FeaturesEnricher(TransformerMixin):
423
428
  self.X = X
424
429
  self.y = y
425
430
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
426
- self.dump_input(trace_id, X, y, eval_set)
431
+ self.dump_input(trace_id, X, y, self.eval_set)
427
432
  self.__inner_fit(
428
433
  trace_id,
429
434
  X,
@@ -562,7 +567,7 @@ class FeaturesEnricher(TransformerMixin):
562
567
  self.X = X
563
568
  self.y = y
564
569
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
565
- self.dump_input(trace_id, X, y, eval_set)
570
+ self.dump_input(trace_id, X, y, self.eval_set)
566
571
 
567
572
  if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
568
573
  raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
@@ -822,12 +827,16 @@ class FeaturesEnricher(TransformerMixin):
822
827
  print(msg)
823
828
 
824
829
  self.__validate_search_keys(self.search_keys, self.search_id)
830
+ effective_X = X if X is not None else self.X
831
+ effective_y = y if y is not None else self.y
832
+ effective_eval_set = eval_set if eval_set is not None else self.eval_set
833
+ effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
825
834
 
826
835
  try:
827
836
  self.__log_debug_information(
828
- X if X is not None else self.X,
829
- y if y is not None else self.y,
830
- eval_set if eval_set is not None else self.eval_set,
837
+ effective_X,
838
+ effective_y,
839
+ effective_eval_set,
831
840
  exclude_features_sources=exclude_features_sources,
832
841
  cv=cv if cv is not None else self.cv,
833
842
  importance_threshold=importance_threshold,
@@ -841,17 +850,14 @@ class FeaturesEnricher(TransformerMixin):
841
850
  self._search_task is None
842
851
  or self._search_task.provider_metadata_v2 is None
843
852
  or len(self._search_task.provider_metadata_v2) == 0
844
- or (self.X is None and X is None)
845
- or (self.y is None and y is None)
853
+ or effective_X is None
854
+ or effective_y is None
846
855
  ):
847
856
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
848
857
 
849
858
  if X is not None and y is None:
850
859
  raise ValidationError("X passed without y")
851
860
 
852
- effective_X = X if X is not None else self.X
853
- effective_eval_set = eval_set if eval_set is not None else self.eval_set
854
-
855
861
  validate_scoring_argument(scoring)
856
862
 
857
863
  self._validate_baseline_score(effective_X, effective_eval_set)
@@ -871,8 +877,7 @@ class FeaturesEnricher(TransformerMixin):
871
877
  ):
872
878
  cat_features = estimator.get_param("cat_features")
873
879
  if len(cat_features) > 0 and isinstance(cat_features[0], int):
874
- effectiveX = X or self.X
875
- cat_features = [effectiveX.columns[i] for i in cat_features]
880
+ cat_features = [effective_X.columns[i] for i in cat_features]
876
881
  for cat_feature in cat_features:
877
882
  if cat_feature in self.search_keys:
878
883
  if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
@@ -882,9 +887,9 @@ class FeaturesEnricher(TransformerMixin):
882
887
 
883
888
  prepared_data = self._prepare_data_for_metrics(
884
889
  trace_id=trace_id,
885
- X=X,
886
- y=y,
887
- eval_set=eval_set,
890
+ X=effective_X,
891
+ y=effective_y,
892
+ eval_set=effective_eval_set,
888
893
  exclude_features_sources=exclude_features_sources,
889
894
  importance_threshold=importance_threshold,
890
895
  max_features=max_features,
@@ -994,8 +999,6 @@ class FeaturesEnricher(TransformerMixin):
994
999
  enriched_metric = None
995
1000
  uplift = None
996
1001
 
997
- effective_X = X if X is not None else self.X
998
- effective_y = y if y is not None else self.y
999
1002
  train_metrics = {
1000
1003
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
1001
1004
  "quality_metrics_train_segment"
@@ -1256,6 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
1256
1259
  ).get_cv_and_groups(X)
1257
1260
  else:
1258
1261
  from sklearn import __version__ as sklearn_version
1262
+
1259
1263
  try:
1260
1264
  from sklearn.model_selection._split import GroupsConsumerMixin
1261
1265
 
@@ -1684,6 +1688,9 @@ class FeaturesEnricher(TransformerMixin):
1684
1688
  df = validated_X.copy()
1685
1689
 
1686
1690
  df[TARGET] = validated_y
1691
+
1692
+ df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1693
+
1687
1694
  num_samples = _num_samples(df)
1688
1695
  if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1689
1696
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
@@ -1801,10 +1808,11 @@ class FeaturesEnricher(TransformerMixin):
1801
1808
  else:
1802
1809
  features_section = ""
1803
1810
 
1804
- api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
1811
+ search_id = self._search_task.search_task_id
1812
+ api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
1805
1813
  -H 'Authorization: {self.api_key}' \\
1806
1814
  -H 'Content-Type: application/json' \\
1807
- -d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
1815
+ -d '{{"search_keys": {keys}{features_section}}}'"""
1808
1816
  return api_example
1809
1817
 
1810
1818
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -1899,6 +1907,8 @@ class FeaturesEnricher(TransformerMixin):
1899
1907
  generated_features.extend(converter.generated_features)
1900
1908
  else:
1901
1909
  self.logger.info("Input dataset hasn't date column")
1910
+ if self.add_date_if_missing:
1911
+ df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1902
1912
  email_column = self._get_email_column(search_keys)
1903
1913
  hem_column = self._get_hem_column(search_keys)
1904
1914
  email_converted_to_hem = False
@@ -1918,6 +1928,7 @@ class FeaturesEnricher(TransformerMixin):
1918
1928
 
1919
1929
  meaning_types = {col: key.value for col, key in search_keys.items()}
1920
1930
  non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1931
+
1921
1932
  if email_converted_to_hem:
1922
1933
  non_keys_columns.append(email_column)
1923
1934
 
@@ -1939,6 +1950,7 @@ class FeaturesEnricher(TransformerMixin):
1939
1950
  if add_fit_system_record_id:
1940
1951
  df = self.__add_fit_system_record_id(df, dict(), search_keys)
1941
1952
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1953
+ non_keys_columns.append(SORT_ID)
1942
1954
 
1943
1955
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1944
1956
 
@@ -2215,14 +2227,13 @@ class FeaturesEnricher(TransformerMixin):
2215
2227
  self.fit_search_keys = self.search_keys.copy()
2216
2228
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2217
2229
 
2218
- has_date = self._get_date_column(self.fit_search_keys) is not None
2230
+ validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2231
+
2232
+ maybe_date_column = self._get_date_column(self.fit_search_keys)
2233
+ has_date = maybe_date_column is not None
2219
2234
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2220
2235
  self._validate_binary_observations(validated_y, model_task_type)
2221
2236
 
2222
- df = self.__handle_index_search_keys(df, self.fit_search_keys)
2223
-
2224
- df = self.__correct_target(df)
2225
-
2226
2237
  self.runtime_parameters = get_runtime_params_custom_loss(
2227
2238
  self.loss, model_task_type, self.runtime_parameters, self.logger
2228
2239
  )
@@ -2234,6 +2245,13 @@ class FeaturesEnricher(TransformerMixin):
2234
2245
  eval_df[EVAL_SET_INDEX] = idx + 1
2235
2246
  df = pd.concat([df, eval_df])
2236
2247
 
2248
+ df = self.__correct_target(df)
2249
+
2250
+ df = self.__handle_index_search_keys(df, self.fit_search_keys)
2251
+
2252
+ if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
2253
+ self._validate_PSI(df.sort_values(by=maybe_date_column))
2254
+
2237
2255
  if DEFAULT_INDEX in df.columns:
2238
2256
  msg = self.bundle.get("unsupported_index_column")
2239
2257
  self.logger.info(msg)
@@ -2260,6 +2278,8 @@ class FeaturesEnricher(TransformerMixin):
2260
2278
  self.fit_generated_features.extend(converter.generated_features)
2261
2279
  else:
2262
2280
  self.logger.info("Input dataset hasn't date column")
2281
+ if self.add_date_if_missing:
2282
+ df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2263
2283
  email_column = self._get_email_column(self.fit_search_keys)
2264
2284
  hem_column = self._get_hem_column(self.fit_search_keys)
2265
2285
  email_converted_to_hem = False
@@ -2808,6 +2828,7 @@ class FeaturesEnricher(TransformerMixin):
2808
2828
 
2809
2829
  maybe_date_col = self._get_date_column(self.search_keys)
2810
2830
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
2831
+ # TODO cast date column to single dtype
2811
2832
  min_date = X[maybe_date_col].min()
2812
2833
  max_date = X[maybe_date_col].max()
2813
2834
  self.logger.info(f"Dates interval is ({min_date}, {max_date})")
@@ -2839,6 +2860,25 @@ class FeaturesEnricher(TransformerMixin):
2839
2860
  if t in [SearchKey.DATE, SearchKey.DATETIME]:
2840
2861
  return col
2841
2862
 
2863
@staticmethod
def _add_current_date_as_key(
    df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
) -> pd.DataFrame:
    """Add today's date as a DATE search key for certain key combinations.

    The date column is added only when the search key types are exactly one of:
    PHONE, EMAIL, HEM, or COUNTRY together with POSTAL_CODE. In that case a message
    is printed and logged, the ``CURRENT_DATE`` column is written into ``df``,
    registered in ``search_keys`` (both mutated in place) and passed through
    ``DateTimeSearchKeyConverter``. Otherwise ``df`` is returned untouched.
    """
    key_types = set(search_keys.values())
    combinations_needing_date = (
        {SearchKey.PHONE},
        {SearchKey.EMAIL},
        {SearchKey.HEM},
        {SearchKey.COUNTRY, SearchKey.POSTAL_CODE},
    )
    if key_types not in combinations_needing_date:
        return df

    msg = bundle.get("current_date_added")
    print(msg)
    logger.warning(msg)

    date_column = FeaturesEnricher.CURRENT_DATE
    df[date_column] = datetime.date.today()
    search_keys[date_column] = SearchKey.DATE
    return DateTimeSearchKeyConverter(date_column, None, logger, bundle).convert(df)
2881
+
2842
2882
  @staticmethod
2843
2883
  def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
2844
2884
  return [
@@ -2877,26 +2917,33 @@ class FeaturesEnricher(TransformerMixin):
2877
2917
 
2878
2918
  # order by date and idempotent order by other keys
2879
2919
  if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
2920
+ sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
2880
2921
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2881
2922
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2923
+ sort_exclude_columns.append(self._get_date_column(search_keys))
2882
2924
  else:
2883
2925
  date_column = self._get_date_column(search_keys)
2884
2926
  sort_columns = [date_column] if date_column is not None else []
2885
2927
 
2886
- other_search_keys = sorted(
2928
+ other_columns = sorted(
2887
2929
  [
2888
- sk
2889
- for sk, key_type in search_keys.items()
2890
- if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2891
- and sk in df.columns
2892
- and df[sk].nunique() > 1 # don't use constant keys for hash
2930
+ c
2931
+ for c in df.columns
2932
+ if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
2893
2933
  ]
2934
+ # [
2935
+ # sk
2936
+ # for sk, key_type in search_keys.items()
2937
+ # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
2938
+ # and sk in df.columns
2939
+ # and df[sk].nunique() > 1 # don't use constant keys for hash
2940
+ # ]
2894
2941
  )
2895
2942
 
2896
2943
  search_keys_hash = "search_keys_hash"
2897
- if len(other_search_keys) > 0:
2944
+ if len(other_columns) > 0:
2898
2945
  sort_columns.append(search_keys_hash)
2899
- df[search_keys_hash] = pd.util.hash_pandas_object(df[sorted(other_search_keys)], index=False)
2946
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
2900
2947
 
2901
2948
  df = df.sort_values(by=sort_columns)
2902
2949
 
@@ -3185,22 +3232,21 @@ class FeaturesEnricher(TransformerMixin):
3185
3232
  return None
3186
3233
  features_meta = self._search_task.get_all_features_metadata_v2()
3187
3234
 
3188
- def get_feature_by_display_index(idx, op):
3235
+ def get_feature_by_name(name: str):
3189
3236
  for m in features_meta:
3190
- if m.name.endswith(f"_{op}_{idx}"):
3237
+ if m.name == name:
3191
3238
  return m
3192
3239
 
3193
3240
  descriptions = []
3194
3241
  for m in autofe_meta:
3195
3242
  autofe_feature = Feature.from_formula(m.formula)
3243
+ autofe_feature.set_display_index(m.display_index)
3196
3244
  if autofe_feature.op.is_vector:
3197
3245
  continue
3198
3246
 
3199
3247
  description = dict()
3200
3248
 
3201
- feature_meta = get_feature_by_display_index(
3202
- m.display_index, autofe_feature.op.alias or autofe_feature.op.name
3203
- )
3249
+ feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
3204
3250
  if feature_meta is None:
3205
3251
  self.logger.warning(f"Feature meta for display index {m.display_index} not found")
3206
3252
  continue
@@ -3547,6 +3593,34 @@ class FeaturesEnricher(TransformerMixin):
3547
3593
  self.logger.warning(msg)
3548
3594
  print(msg)
3549
3595
 
3596
def _validate_PSI(self, df: pd.DataFrame):
    """Warn when the target distribution looks unstable according to PSI.

    Two checks are made, each warning when the PSI exceeds 0.2:
    1. first half of the train rows vs. second half (in the row order of ``df``);
    2. train rows vs. the first eval set, when ``df`` has an eval-set index column.
    """
    has_eval_index = EVAL_SET_INDEX in df.columns
    train = df.query(f"{EVAL_SET_INDEX} == 0") if has_eval_index else df
    eval1 = df.query(f"{EVAL_SET_INDEX} == 1") if has_eval_index else None

    def warn_if_unstable(psi, message_key):
        # Count, print and log a warning once the PSI crosses the 0.2 threshold.
        if psi > 0.2:
            self.warning_counter.increment()
            msg = self.bundle.get(message_key).format(psi)
            print(msg)
            self.logger.warning(msg)

    # 1. Check train PSI
    midpoint = round(len(train) / 2)
    first_half = train[:midpoint]
    second_half = train[midpoint:]
    warn_if_unstable(
        calculate_psi(first_half[self.TARGET_NAME], second_half[self.TARGET_NAME]),
        "train_unstable_target",
    )

    # 2. Check train-test PSI
    if eval1 is not None:
        warn_if_unstable(
            calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME]),
            "eval_unstable_target",
        )
3623
+
3550
3624
  def _dump_python_libs(self):
3551
3625
  try:
3552
3626
  from pip._internal.operations.freeze import freeze
@@ -3613,7 +3687,7 @@ class FeaturesEnricher(TransformerMixin):
3613
3687
  if y is not None:
3614
3688
  with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
3615
3689
  pickle.dump(sample(y, xy_sample_index), y_file)
3616
- if eval_set is not None:
3690
+ if eval_set:
3617
3691
  eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
3618
3692
  with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
3619
3693
  pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
upgini/fingerprint.js ADDED
@@ -0,0 +1,8 @@
1
+ /**
2
+ * FingerprintJS v3.4.2 - Copyright (c) FingerprintJS, Inc, 2023 (https://fingerprint.com)
3
+ * Licensed under the MIT (http://www.opensource.org/licenses/mit-license.php) license.
4
+ *
5
+ * This software contains code from open-source projects:
6
+ * MurmurHash3 by Karan Lyons (https://github.com/karanlyons/murmurHash3.js)
7
+ */
8
+ var e=function(){return e=Object.assign||function(e){for(var n,t=1,r=arguments.length;t<r;t++)for(var o in n=arguments[t])Object.prototype.hasOwnProperty.call(n,o)&&(e[o]=n[o]);return e},e.apply(this,arguments)};function n(e,n,t,r){return new(t||(t=Promise))((function(o,a){function i(e){try{u(r.next(e))}catch(n){a(n)}}function c(e){try{u(r.throw(e))}catch(n){a(n)}}function u(e){var n;e.done?o(e.value):(n=e.value,n instanceof t?n:new t((function(e){e(n)}))).then(i,c)}u((r=r.apply(e,n||[])).next())}))}function t(e,n){var t,r,o,a,i={label:0,sent:function(){if(1&o[0])throw o[1];return o[1]},trys:[],ops:[]};return a={next:c(0),throw:c(1),return:c(2)},"function"==typeof Symbol&&(a[Symbol.iterator]=function(){return this}),a;function c(c){return function(u){return function(c){if(t)throw new TypeError("Generator is already executing.");for(;a&&(a=0,c[0]&&(i=0)),i;)try{if(t=1,r&&(o=2&c[0]?r.return:c[0]?r.throw||((o=r.return)&&o.call(r),0):r.next)&&!(o=o.call(r,c[1])).done)return o;switch(r=0,o&&(c=[2&c[0],o.value]),c[0]){case 0:case 1:o=c;break;case 4:return i.label++,{value:c[1],done:!1};case 5:i.label++,r=c[1],c=[0];continue;case 7:c=i.ops.pop(),i.trys.pop();continue;default:if(!(o=i.trys,(o=o.length>0&&o[o.length-1])||6!==c[0]&&2!==c[0])){i=0;continue}if(3===c[0]&&(!o||c[1]>o[0]&&c[1]<o[3])){i.label=c[1];break}if(6===c[0]&&i.label<o[1]){i.label=o[1],o=c;break}if(o&&i.label<o[2]){i.label=o[2],i.ops.push(c);break}o[2]&&i.ops.pop(),i.trys.pop();continue}c=n.call(e,i)}catch(u){c=[6,u],r=0}finally{t=o=0}if(5&c[0])throw c[1];return{value:c[0]?c[1]:void 0,done:!0}}([c,u])}}}function r(e,n,t){if(t||2===arguments.length)for(var r,o=0,a=n.length;o<a;o++)!r&&o in n||(r||(r=Array.prototype.slice.call(n,0,o)),r[o]=n[o]);return e.concat(r||Array.prototype.slice.call(n))}function o(e,n){return new Promise((function(t){return setTimeout(t,e,n)}))}function a(e){return!!e&&"function"==typeof e.then}function i(e,n){try{var t=e();a(t)?t.then((function(e){return 
n(!0,e)}),(function(e){return n(!1,e)})):n(!0,t)}catch(r){n(!1,r)}}function c(e,r,a){return void 0===a&&(a=16),n(this,void 0,void 0,(function(){var n,i,c,u;return t(this,(function(t){switch(t.label){case 0:n=Array(e.length),i=Date.now(),c=0,t.label=1;case 1:return c<e.length?(n[c]=r(e[c],c),(u=Date.now())>=i+a?(i=u,[4,o(0)]):[3,3]):[3,4];case 2:t.sent(),t.label=3;case 3:return++c,[3,1];case 4:return[2,n]}}))}))}function u(e){e.then(void 0,(function(){}))}function l(e,n){e=[e[0]>>>16,65535&e[0],e[1]>>>16,65535&e[1]],n=[n[0]>>>16,65535&n[0],n[1]>>>16,65535&n[1]];var t=[0,0,0,0];return t[3]+=e[3]+n[3],t[2]+=t[3]>>>16,t[3]&=65535,t[2]+=e[2]+n[2],t[1]+=t[2]>>>16,t[2]&=65535,t[1]+=e[1]+n[1],t[0]+=t[1]>>>16,t[1]&=65535,t[0]+=e[0]+n[0],t[0]&=65535,[t[0]<<16|t[1],t[2]<<16|t[3]]}function s(e,n){e=[e[0]>>>16,65535&e[0],e[1]>>>16,65535&e[1]],n=[n[0]>>>16,65535&n[0],n[1]>>>16,65535&n[1]];var t=[0,0,0,0];return t[3]+=e[3]*n[3],t[2]+=t[3]>>>16,t[3]&=65535,t[2]+=e[2]*n[3],t[1]+=t[2]>>>16,t[2]&=65535,t[2]+=e[3]*n[2],t[1]+=t[2]>>>16,t[2]&=65535,t[1]+=e[1]*n[3],t[0]+=t[1]>>>16,t[1]&=65535,t[1]+=e[2]*n[2],t[0]+=t[1]>>>16,t[1]&=65535,t[1]+=e[3]*n[1],t[0]+=t[1]>>>16,t[1]&=65535,t[0]+=e[0]*n[3]+e[1]*n[2]+e[2]*n[1]+e[3]*n[0],t[0]&=65535,[t[0]<<16|t[1],t[2]<<16|t[3]]}function d(e,n){return 32===(n%=64)?[e[1],e[0]]:n<32?[e[0]<<n|e[1]>>>32-n,e[1]<<n|e[0]>>>32-n]:(n-=32,[e[1]<<n|e[0]>>>32-n,e[0]<<n|e[1]>>>32-n])}function m(e,n){return 0===(n%=64)?e:n<32?[e[0]<<n|e[1]>>>32-n,e[1]<<n]:[e[1]<<n-32,0]}function f(e,n){return[e[0]^n[0],e[1]^n[1]]}function v(e){return e=f(e,[0,e[0]>>>1]),e=f(e=s(e,[4283543511,3981806797]),[0,e[0]>>>1]),e=f(e=s(e,[3301882366,444984403]),[0,e[0]>>>1])}function h(e,n){n=n||0;var 
t,r=(e=e||"").length%16,o=e.length-r,a=[0,n],i=[0,n],c=[0,0],u=[0,0],h=[2277735313,289559509],p=[1291169091,658871167];for(t=0;t<o;t+=16)c=[255&e.charCodeAt(t+4)|(255&e.charCodeAt(t+5))<<8|(255&e.charCodeAt(t+6))<<16|(255&e.charCodeAt(t+7))<<24,255&e.charCodeAt(t)|(255&e.charCodeAt(t+1))<<8|(255&e.charCodeAt(t+2))<<16|(255&e.charCodeAt(t+3))<<24],u=[255&e.charCodeAt(t+12)|(255&e.charCodeAt(t+13))<<8|(255&e.charCodeAt(t+14))<<16|(255&e.charCodeAt(t+15))<<24,255&e.charCodeAt(t+8)|(255&e.charCodeAt(t+9))<<8|(255&e.charCodeAt(t+10))<<16|(255&e.charCodeAt(t+11))<<24],c=d(c=s(c,h),31),a=l(a=d(a=f(a,c=s(c,p)),27),i),a=l(s(a,[0,5]),[0,1390208809]),u=d(u=s(u,p),33),i=l(i=d(i=f(i,u=s(u,h)),31),a),i=l(s(i,[0,5]),[0,944331445]);switch(c=[0,0],u=[0,0],r){case 15:u=f(u,m([0,e.charCodeAt(t+14)],48));case 14:u=f(u,m([0,e.charCodeAt(t+13)],40));case 13:u=f(u,m([0,e.charCodeAt(t+12)],32));case 12:u=f(u,m([0,e.charCodeAt(t+11)],24));case 11:u=f(u,m([0,e.charCodeAt(t+10)],16));case 10:u=f(u,m([0,e.charCodeAt(t+9)],8));case 9:u=s(u=f(u,[0,e.charCodeAt(t+8)]),p),i=f(i,u=s(u=d(u,33),h));case 8:c=f(c,m([0,e.charCodeAt(t+7)],56));case 7:c=f(c,m([0,e.charCodeAt(t+6)],48));case 6:c=f(c,m([0,e.charCodeAt(t+5)],40));case 5:c=f(c,m([0,e.charCodeAt(t+4)],32));case 4:c=f(c,m([0,e.charCodeAt(t+3)],24));case 3:c=f(c,m([0,e.charCodeAt(t+2)],16));case 2:c=f(c,m([0,e.charCodeAt(t+1)],8));case 1:c=s(c=f(c,[0,e.charCodeAt(t)]),h),a=f(a,c=s(c=d(c,31),p))}return a=l(a=f(a,[0,e.length]),i=f(i,[0,e.length])),i=l(i,a),a=l(a=v(a),i=v(i)),i=l(i,a),("00000000"+(a[0]>>>0).toString(16)).slice(-8)+("00000000"+(a[1]>>>0).toString(16)).slice(-8)+("00000000"+(i[0]>>>0).toString(16)).slice(-8)+("00000000"+(i[1]>>>0).toString(16)).slice(-8)}function p(e){return parseInt(e)}function b(e){return parseFloat(e)}function y(e,n){return"number"==typeof e&&isNaN(e)?n:e}function g(e){return e.reduce((function(e,n){return e+(n?1:0)}),0)}function w(e,n){if(void 0===n&&(n=1),Math.abs(n)>=1)return Math.round(e/n)*n;var t=1/n;return 
Math.round(e*t)/t}function L(e){return e&&"object"==typeof e&&"message"in e?e:{message:e}}function k(e){return"function"!=typeof e}function V(e,r,o){var a=Object.keys(e).filter((function(e){return!function(e,n){for(var t=0,r=e.length;t<r;++t)if(e[t]===n)return!0;return!1}(o,e)})),l=c(a,(function(n){return function(e,n){var t=new Promise((function(t){var r=Date.now();i(e.bind(null,n),(function(){for(var e=[],n=0;n<arguments.length;n++)e[n]=arguments[n];var o=Date.now()-r;if(!e[0])return t((function(){return{error:L(e[1]),duration:o}}));var a=e[1];if(k(a))return t((function(){return{value:a,duration:o}}));t((function(){return new Promise((function(e){var n=Date.now();i(a,(function(){for(var t=[],r=0;r<arguments.length;r++)t[r]=arguments[r];var a=o+Date.now()-n;if(!t[0])return e({error:L(t[1]),duration:a});e({value:t[1],duration:a})}))}))}))}))}));return u(t),function(){return t.then((function(e){return e()}))}}(e[n],r)}));return u(l),function(){return n(this,void 0,void 0,(function(){var e,n,r,o;return t(this,(function(t){switch(t.label){case 0:return[4,l];case 1:return[4,c(t.sent(),(function(e){var n=e();return u(n),n}))];case 2:return e=t.sent(),[4,Promise.all(e)];case 3:for(n=t.sent(),r={},o=0;o<a.length;++o)r[a[o]]=n[o];return[2,r]}}))}))}}function Z(e,n){var t=function(e){return k(e)?n(e):function(){var t=e();return a(t)?t.then(n):n(t)}};return function(n){var r=e(n);return a(r)?r.then(t):t(r)}}function W(){var e=window,n=navigator;return g(["MSCSSMatrix"in e,"msSetImmediate"in e,"msIndexedDB"in e,"msMaxTouchPoints"in n,"msPointerEnabled"in n])>=4}function C(){var e=window,n=navigator;return g(["msWriteProfilerMark"in e,"MSStream"in e,"msLaunchUri"in n,"msSaveBlob"in n])>=3&&!W()}function S(){var e=window,n=navigator;return g(["webkitPersistentStorage"in n,"webkitTemporaryStorage"in n,0===n.vendor.indexOf("Google"),"webkitResolveLocalFileSystemURL"in e,"BatteryManager"in e,"webkitMediaStream"in e,"webkitSpeechGrammar"in e])>=5}function x(){var 
e=window,n=navigator;return g(["ApplePayError"in e,"CSSPrimitiveValue"in e,"Counter"in e,0===n.vendor.indexOf("Apple"),"getStorageUpdates"in n,"WebKitMediaKeys"in e])>=4}function F(){var e=window;return g(["safari"in e,!("DeviceMotionEvent"in e),!("ongestureend"in e),!("standalone"in navigator)])>=3}function Y(){var e,n,t=window;return g(["buildID"in navigator,"MozAppearance"in(null!==(n=null===(e=document.documentElement)||void 0===e?void 0:e.style)&&void 0!==n?n:{}),"onmozfullscreenchange"in t,"mozInnerScreenX"in t,"CSSMozDocumentRule"in t,"CanvasCaptureMediaStream"in t])>=4}function M(){var e=document;return e.fullscreenElement||e.msFullscreenElement||e.mozFullScreenElement||e.webkitFullscreenElement||null}function G(){var e=S(),n=Y();if(!e&&!n)return!1;var t=window;return g(["onorientationchange"in t,"orientation"in t,e&&!("SharedWorker"in t),n&&/android/i.test(navigator.appVersion)])>=2}function R(e){var n=new Error(e);return n.name=e,n}function X(e,r,a){var i,c,u;return void 0===a&&(a=50),n(this,void 0,void 0,(function(){var n,l;return t(this,(function(t){switch(t.label){case 0:n=document,t.label=1;case 1:return n.body?[3,3]:[4,o(a)];case 2:return t.sent(),[3,1];case 3:l=n.createElement("iframe"),t.label=4;case 4:return t.trys.push([4,,10,11]),[4,new Promise((function(e,t){var o=!1,a=function(){o=!0,e()};l.onload=a,l.onerror=function(e){o=!0,t(e)};var i=l.style;i.setProperty("display","block","important"),i.position="absolute",i.top="0",i.left="0",i.visibility="hidden",r&&"srcdoc"in l?l.srcdoc=r:l.src="about:blank",n.body.appendChild(l);var c=function(){var e,n;o||("complete"===(null===(n=null===(e=l.contentWindow)||void 0===e?void 0:e.document)||void 0===n?void 0:n.readyState)?a():setTimeout(c,10))};c()}))];case 5:t.sent(),t.label=6;case 6:return(null===(c=null===(i=l.contentWindow)||void 0===i?void 0:i.document)||void 0===c?void 0:c.body)?[3,8]:[4,o(a)];case 7:return t.sent(),[3,6];case 8:return[4,e(l,l.contentWindow)];case 9:return[2,t.sent()];case 
10:return null===(u=l.parentNode)||void 0===u||u.removeChild(l),[7];case 11:return[2]}}))}))}function A(e){for(var n=function(e){for(var n,t,r="Unexpected syntax '".concat(e,"'"),o=/^\s*([a-z-]*)(.*)$/i.exec(e),a=o[1]||void 0,i={},c=/([.:#][\w-]+|\[.+?\])/gi,u=function(e,n){i[e]=i[e]||[],i[e].push(n)};;){var l=c.exec(o[2]);if(!l)break;var s=l[0];switch(s[0]){case".":u("class",s.slice(1));break;case"#":u("id",s.slice(1));break;case"[":var d=/^\[([\w-]+)([~|^$*]?=("(.*?)"|([\w-]+)))?(\s+[is])?\]$/.exec(s);if(!d)throw new Error(r);u(d[1],null!==(t=null!==(n=d[4])&&void 0!==n?n:d[5])&&void 0!==t?t:"");break;default:throw new Error(r)}}return[a,i]}(e),t=n[0],r=n[1],o=document.createElement(null!=t?t:"div"),a=0,i=Object.keys(r);a<i.length;a++){var c=i[a],u=r[c].join(" ");"style"===c?j(o.style,u):o.setAttribute(c,u)}return o}function j(e,n){for(var t=0,r=n.split(";");t<r.length;t++){var o=r[t],a=/^\s*([\w-]+)\s*:\s*(.+?)(\s*!([\w-]+))?\s*$/.exec(o);if(a){var i=a[1],c=a[2],u=a[4];e.setProperty(i,c,u||"")}}}var I=["monospace","sans-serif","serif"],J=["sans-serif-thin","ARNO PRO","Agency FB","Arabic Typesetting","Arial Unicode MS","AvantGarde Bk BT","BankGothic Md BT","Batang","Bitstream Vera Sans Mono","Calibri","Century","Century Gothic","Clarendon","EUROSTILE","Franklin Gothic","Futura Bk BT","Futura Md BT","GOTHAM","Gill Sans","HELV","Haettenschweiler","Helvetica Neue","Humanst521 BT","Leelawadee","Letter Gothic","Levenim MT","Lucida Bright","Lucida Sans","Menlo","MS Mincho","MS Outlook","MS Reference Specialty","MS UI Gothic","MT Extra","MYRIAD PRO","Marlett","Meiryo UI","Microsoft Uighur","Minion Pro","Monotype Corsiva","PMingLiU","Pristina","SCRIPTINA","Segoe UI Light","Serifa","SimHei","Small Fonts","Staccato222 BT","TRAJAN PRO","Univers CE 55 Medium","Vrinda","ZWAdobeF"];function H(e){return e.toDataURL()}var P,N;function z(){var e=this;return function(){if(void 0===N){var e=function(){var n=D();E(n)?N=setTimeout(e,2500):(P=n,N=void 0)};e()}}(),function(){return 
n(e,void 0,void 0,(function(){var e;return t(this,(function(n){switch(n.label){case 0:return E(e=D())?P?[2,r([],P,!0)]:M()?[4,(t=document,(t.exitFullscreen||t.msExitFullscreen||t.mozCancelFullScreen||t.webkitExitFullscreen).call(t))]:[3,2]:[3,2];case 1:n.sent(),e=D(),n.label=2;case 2:return E(e)||(P=e),[2,e]}var t}))}))}}function D(){var e=screen;return[y(b(e.availTop),null),y(b(e.width)-b(e.availWidth)-y(b(e.availLeft),0),null),y(b(e.height)-b(e.availHeight)-y(b(e.availTop),0),null),y(b(e.availLeft),null)]}function E(e){for(var n=0;n<4;++n)if(e[n])return!1;return!0}function T(e){var r;return n(this,void 0,void 0,(function(){var n,a,i,c,u,l,s;return t(this,(function(t){switch(t.label){case 0:for(n=document,a=n.createElement("div"),i=new Array(e.length),c={},B(a),s=0;s<e.length;++s)"DIALOG"===(u=A(e[s])).tagName&&u.show(),B(l=n.createElement("div")),l.appendChild(u),a.appendChild(l),i[s]=u;t.label=1;case 1:return n.body?[3,3]:[4,o(50)];case 2:return t.sent(),[3,1];case 3:n.body.appendChild(a);try{for(s=0;s<e.length;++s)i[s].offsetParent||(c[e[s]]=!0)}finally{null===(r=a.parentNode)||void 0===r||r.removeChild(a)}return[2,c]}}))}))}function B(e){e.style.setProperty("display","block","important")}function _(e){return matchMedia("(inverted-colors: ".concat(e,")")).matches}function O(e){return matchMedia("(forced-colors: ".concat(e,")")).matches}function U(e){return matchMedia("(prefers-contrast: ".concat(e,")")).matches}function Q(e){return matchMedia("(prefers-reduced-motion: ".concat(e,")")).matches}function K(e){return matchMedia("(dynamic-range: ".concat(e,")")).matches}var q=Math,$=function(){return 0};var ee={default:[],apple:[{font:"-apple-system-body"}],serif:[{fontFamily:"serif"}],sans:[{fontFamily:"sans-serif"}],mono:[{fontFamily:"monospace"}],min:[{fontSize:"1px"}],system:[{fontFamily:"system-ui"}]};var ne={fonts:function(){return X((function(e,n){var t=n.document,r=t.body;r.style.fontSize="48px";var o=t.createElement("div"),a={},i={},c=function(e){var 
n=t.createElement("span"),r=n.style;return r.position="absolute",r.top="0",r.left="0",r.fontFamily=e,n.textContent="mmMwWLliI0O&1",o.appendChild(n),n},u=I.map(c),l=function(){for(var e={},n=function(n){e[n]=I.map((function(e){return function(e,n){return c("'".concat(e,"',").concat(n))}(n,e)}))},t=0,r=J;t<r.length;t++){n(r[t])}return e}();r.appendChild(o);for(var s=0;s<I.length;s++)a[I[s]]=u[s].offsetWidth,i[I[s]]=u[s].offsetHeight;return J.filter((function(e){return n=l[e],I.some((function(e,t){return n[t].offsetWidth!==a[e]||n[t].offsetHeight!==i[e]}));var n}))}))},domBlockers:function(e){var r=(void 0===e?{}:e).debug;return n(this,void 0,void 0,(function(){var e,n,o,a,i;return t(this,(function(t){switch(t.label){case 0:return x()||G()?(c=atob,e={abpIndo:["#Iklan-Melayang","#Kolom-Iklan-728","#SidebarIklan-wrapper",'[title="ALIENBOLA" i]',c("I0JveC1CYW5uZXItYWRz")],abpvn:[".quangcao","#mobileCatfish",c("LmNsb3NlLWFkcw=="),'[id^="bn_bottom_fixed_"]',"#pmadv"],adBlockFinland:[".mainostila",c("LnNwb25zb3JpdA=="),".ylamainos",c("YVtocmVmKj0iL2NsaWNrdGhyZ2guYXNwPyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9hcHAucmVhZHBlYWsuY29tL2FkcyJd")],adBlockPersian:["#navbar_notice_50",".kadr",'TABLE[width="140px"]',"#divAgahi",c("YVtocmVmXj0iaHR0cDovL2cxLnYuZndtcm0ubmV0L2FkLyJd")],adBlockWarningRemoval:["#adblock-honeypot",".adblocker-root",".wp_adblock_detect",c("LmhlYWRlci1ibG9ja2VkLWFk"),c("I2FkX2Jsb2NrZXI=")],adGuardAnnoyances:[".hs-sosyal","#cookieconsentdiv",'div[class^="app_gdpr"]',".as-oil",'[data-cypress="soft-push-notification-modal"]'],adGuardBase:[".BetterJsPopOverlay",c("I2FkXzMwMFgyNTA="),c("I2Jhbm5lcmZsb2F0MjI="),c("I2NhbXBhaWduLWJhbm5lcg=="),c("I0FkLUNvbnRlbnQ=")],adGuardChinese:[c("LlppX2FkX2FfSA=="),c("YVtocmVmKj0iLmh0aGJldDM0LmNvbSJd"),"#widget-quan",c("YVtocmVmKj0iLzg0OTkyMDIwLnh5eiJd"),c("YVtocmVmKj0iLjE5NTZobC5jb20vIl0=")],adGuardFrench:["#pavePub",c("LmFkLWRlc2t0b3AtcmVjdGFuZ2xl"),".mobile_adhesion",".widgetadv",c("LmFkc19iYW4=")],adGuardGerman:['aside[data-portal-id="le
aderboard"]'],adGuardJapanese:["#kauli_yad_1",c("YVtocmVmXj0iaHR0cDovL2FkMi50cmFmZmljZ2F0ZS5uZXQvIl0="),c("Ll9wb3BJbl9pbmZpbml0ZV9hZA=="),c("LmFkZ29vZ2xl"),c("Ll9faXNib29zdFJldHVybkFk")],adGuardMobile:[c("YW1wLWF1dG8tYWRz"),c("LmFtcF9hZA=="),'amp-embed[type="24smi"]',"#mgid_iframe1",c("I2FkX2ludmlld19hcmVh")],adGuardRussian:[c("YVtocmVmXj0iaHR0cHM6Ly9hZC5sZXRtZWFkcy5jb20vIl0="),c("LnJlY2xhbWE="),'div[id^="smi2adblock"]',c("ZGl2W2lkXj0iQWRGb3hfYmFubmVyXyJd"),"#psyduckpockeball"],adGuardSocial:[c("YVtocmVmXj0iLy93d3cuc3R1bWJsZXVwb24uY29tL3N1Ym1pdD91cmw9Il0="),c("YVtocmVmXj0iLy90ZWxlZ3JhbS5tZS9zaGFyZS91cmw/Il0="),".etsy-tweet","#inlineShare",".popup-social"],adGuardSpanishPortuguese:["#barraPublicidade","#Publicidade","#publiEspecial","#queTooltip",".cnt-publi"],adGuardTrackingProtection:["#qoo-counter",c("YVtocmVmXj0iaHR0cDovL2NsaWNrLmhvdGxvZy5ydS8iXQ=="),c("YVtocmVmXj0iaHR0cDovL2hpdGNvdW50ZXIucnUvdG9wL3N0YXQucGhwIl0="),c("YVtocmVmXj0iaHR0cDovL3RvcC5tYWlsLnJ1L2p1bXAiXQ=="),"#top100counter"],adGuardTurkish:["#backkapat",c("I3Jla2xhbWk="),c("YVtocmVmXj0iaHR0cDovL2Fkc2Vydi5vbnRlay5jb20udHIvIl0="),c("YVtocmVmXj0iaHR0cDovL2l6bGVuemkuY29tL2NhbXBhaWduLyJd"),c("YVtocmVmXj0iaHR0cDovL3d3dy5pbnN0YWxsYWRzLm5ldC8iXQ==")],bulgarian:[c("dGQjZnJlZW5ldF90YWJsZV9hZHM="),"#ea_intext_div",".lapni-pop-over","#xenium_hot_offers"],easyList:[".yb-floorad",c("LndpZGdldF9wb19hZHNfd2lkZ2V0"),c("LnRyYWZmaWNqdW5reS1hZA=="),".textad_headline",c("LnNwb25zb3JlZC10ZXh0LWxpbmtz")],easyListChina:[c("LmFwcGd1aWRlLXdyYXBbb25jbGljayo9ImJjZWJvcy5jb20iXQ=="),c("LmZyb250cGFnZUFkdk0="),"#taotaole","#aafoot.top_box",".cfa_popup"],easyListCookie:[".ezmob-footer",".cc-CookieWarning","[data-cookie-number]",c("LmF3LWNvb2tpZS1iYW5uZXI="),".sygnal24-gdpr-modal-wrap"],easyListCzechSlovak:["#onlajny-stickers",c("I3Jla2xhbW5pLWJveA=="),c("LnJla2xhbWEtbWVnYWJvYXJk"),".sklik",c("W2lkXj0ic2tsaWtSZWtsYW1hIl0=")],easyListDutch:[c("I2FkdmVydGVudGll"),c("I3ZpcEFkbWFya3RCYW5uZXJCbG9jaw=="),".adstekst",c("YVtocmVmXj0iaHR0cHM6Ly
94bHR1YmUubmwvY2xpY2svIl0="),"#semilo-lrectangle"],easyListGermany:["#SSpotIMPopSlider",c("LnNwb25zb3JsaW5rZ3J1ZW4="),c("I3dlcmJ1bmdza3k="),c("I3Jla2xhbWUtcmVjaHRzLW1pdHRl"),c("YVtocmVmXj0iaHR0cHM6Ly9iZDc0Mi5jb20vIl0=")],easyListItaly:[c("LmJveF9hZHZfYW5udW5jaQ=="),".sb-box-pubbliredazionale",c("YVtocmVmXj0iaHR0cDovL2FmZmlsaWF6aW9uaWFkcy5zbmFpLml0LyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9hZHNlcnZlci5odG1sLml0LyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9hZmZpbGlhemlvbmlhZHMuc25haS5pdC8iXQ==")],easyListLithuania:[c("LnJla2xhbW9zX3RhcnBhcw=="),c("LnJla2xhbW9zX251b3JvZG9z"),c("aW1nW2FsdD0iUmVrbGFtaW5pcyBza3lkZWxpcyJd"),c("aW1nW2FsdD0iRGVkaWt1b3RpLmx0IHNlcnZlcmlhaSJd"),c("aW1nW2FsdD0iSG9zdGluZ2FzIFNlcnZlcmlhaS5sdCJd")],estonian:[c("QVtocmVmKj0iaHR0cDovL3BheTRyZXN1bHRzMjQuZXUiXQ==")],fanboyAnnoyances:["#ac-lre-player",".navigate-to-top","#subscribe_popup",".newsletter_holder","#back-top"],fanboyAntiFacebook:[".util-bar-module-firefly-visible"],fanboyEnhancedTrackers:[".open.pushModal","#issuem-leaky-paywall-articles-zero-remaining-nag","#sovrn_container",'div[class$="-hide"][zoompage-fontsize][style="display: 
block;"]',".BlockNag__Card"],fanboySocial:["#FollowUs","#meteored_share","#social_follow",".article-sharer",".community__social-desc"],frellwitSwedish:[c("YVtocmVmKj0iY2FzaW5vcHJvLnNlIl1bdGFyZ2V0PSJfYmxhbmsiXQ=="),c("YVtocmVmKj0iZG9rdG9yLXNlLm9uZWxpbmsubWUiXQ=="),"article.category-samarbete",c("ZGl2LmhvbGlkQWRz"),"ul.adsmodern"],greekAdBlock:[c("QVtocmVmKj0iYWRtYW4ub3RlbmV0LmdyL2NsaWNrPyJd"),c("QVtocmVmKj0iaHR0cDovL2F4aWFiYW5uZXJzLmV4b2R1cy5nci8iXQ=="),c("QVtocmVmKj0iaHR0cDovL2ludGVyYWN0aXZlLmZvcnRobmV0LmdyL2NsaWNrPyJd"),"DIV.agores300","TABLE.advright"],hungarian:["#cemp_doboz",".optimonk-iframe-container",c("LmFkX19tYWlu"),c("W2NsYXNzKj0iR29vZ2xlQWRzIl0="),"#hirdetesek_box"],iDontCareAboutCookies:['.alert-info[data-block-track*="CookieNotice"]',".ModuleTemplateCookieIndicator",".o--cookies--container","#cookies-policy-sticky","#stickyCookieBar"],icelandicAbp:[c("QVtocmVmXj0iL2ZyYW1ld29yay9yZXNvdXJjZXMvZm9ybXMvYWRzLmFzcHgiXQ==")],latvian:[c("YVtocmVmPSJodHRwOi8vd3d3LnNhbGlkemluaS5sdi8iXVtzdHlsZT0iZGlzcGxheTogYmxvY2s7IHdpZHRoOiAxMjBweDsgaGVpZ2h0OiA0MHB4OyBvdmVyZmxvdzogaGlkZGVuOyBwb3NpdGlvbjogcmVsYXRpdmU7Il0="),c("YVtocmVmPSJodHRwOi8vd3d3LnNhbGlkemluaS5sdi8iXVtzdHlsZT0iZGlzcGxheTogYmxvY2s7IHdpZHRoOiA4OHB4OyBoZWlnaHQ6IDMxcHg7IG92ZXJmbG93OiBoaWRkZW47IHBvc2l0aW9uOiByZWxhdGl2ZTsiXQ==")],listKr:[c("YVtocmVmKj0iLy9hZC5wbGFuYnBsdXMuY28ua3IvIl0="),c("I2xpdmVyZUFkV3JhcHBlcg=="),c("YVtocmVmKj0iLy9hZHYuaW1hZHJlcC5jby5rci8iXQ=="),c("aW5zLmZhc3R2aWV3LWFk"),".revenue_unit_item.dable"],listeAr:[c("LmdlbWluaUxCMUFk"),".right-and-left-sponsers",c("YVtocmVmKj0iLmFmbGFtLmluZm8iXQ=="),c("YVtocmVmKj0iYm9vcmFxLm9yZyJd"),c("YVtocmVmKj0iZHViaXp6bGUuY29tL2FyLz91dG1fc291cmNlPSJd")],listeFr:[c("YVtocmVmXj0iaHR0cDovL3Byb21vLnZhZG9yLmNvbS8iXQ=="),c("I2FkY29udGFpbmVyX3JlY2hlcmNoZQ=="),c("YVtocmVmKj0id2Vib3JhbWEuZnIvZmNnaS1iaW4vIl0="),".site-pub-interstitiel",'div[id^="crt-"][data-criteo-id]'],officialPolish:["#ceneo-placeholder-ceneo-12",c("W2hyZWZePSJodHRwczovL2FmZi5zZW5kaHViLnBsLyJd"),c("YVtocm
VmXj0iaHR0cDovL2Fkdm1hbmFnZXIudGVjaGZ1bi5wbC9yZWRpcmVjdC8iXQ=="),c("YVtocmVmXj0iaHR0cDovL3d3dy50cml6ZXIucGwvP3V0bV9zb3VyY2UiXQ=="),c("ZGl2I3NrYXBpZWNfYWQ=")],ro:[c("YVtocmVmXj0iLy9hZmZ0cmsuYWx0ZXgucm8vQ291bnRlci9DbGljayJd"),c("YVtocmVmXj0iaHR0cHM6Ly9ibGFja2ZyaWRheXNhbGVzLnJvL3Ryay9zaG9wLyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9ldmVudC4ycGVyZm9ybWFudC5jb20vZXZlbnRzL2NsaWNrIl0="),c("YVtocmVmXj0iaHR0cHM6Ly9sLnByb2ZpdHNoYXJlLnJvLyJd"),'a[href^="/url/"]'],ruAd:[c("YVtocmVmKj0iLy9mZWJyYXJlLnJ1LyJd"),c("YVtocmVmKj0iLy91dGltZy5ydS8iXQ=="),c("YVtocmVmKj0iOi8vY2hpa2lkaWtpLnJ1Il0="),"#pgeldiz",".yandex-rtb-block"],thaiAds:["a[href*=macau-uta-popup]",c("I2Fkcy1nb29nbGUtbWlkZGxlX3JlY3RhbmdsZS1ncm91cA=="),c("LmFkczMwMHM="),".bumq",".img-kosana"],webAnnoyancesUltralist:["#mod-social-share-2","#social-tools",c("LmN0cGwtZnVsbGJhbm5lcg=="),".zergnet-recommend",".yt.btn-link.btn-md.btn"]},n=Object.keys(e),[4,T((i=[]).concat.apply(i,n.map((function(n){return e[n]}))))]):[2,void 0];case 1:return o=t.sent(),r&&function(e,n){for(var t="DOM blockers debug:\n```",r=0,o=Object.keys(e);r<o.length;r++){var a=o[r];t+="\n".concat(a,":");for(var i=0,c=e[a];i<c.length;i++){var u=c[i];t+="\n ".concat(n[u]?"🚫":"➡️"," ").concat(u)}}console.log("".concat(t,"\n```"))}(e,o),(a=n.filter((function(n){var t=e[n];return g(t.map((function(e){return o[e]})))>.6*t.length}))).sort(),[2,a]}var c}))}))},fontPreferences:function(){return function(e,n){void 0===n&&(n=4e3);return X((function(t,o){var a=o.document,i=a.body,c=i.style;c.width="".concat(n,"px"),c.webkitTextSizeAdjust=c.textSizeAdjust="none",S()?i.style.zoom="".concat(1/o.devicePixelRatio):x()&&(i.style.zoom="reset");var u=a.createElement("div");return u.textContent=r([],Array(n/20<<0),!0).map((function(){return"word"})).join(" "),i.appendChild(u),e(a,i)}),'<!doctype html><html><head><meta name="viewport" content="width=device-width, initial-scale=1">')}((function(e,n){for(var t={},r={},o=0,a=Object.keys(ee);o<a.length;o++){var i=a[o],c=ee[i],u=c[0],l=void 
0===u?{}:u,s=c[1],d=void 0===s?"mmMwWLliI0fiflO&1":s,m=e.createElement("span");m.textContent=d,m.style.whiteSpace="nowrap";for(var f=0,v=Object.keys(l);f<v.length;f++){var h=v[f],p=l[h];void 0!==p&&(m.style[h]=p)}t[i]=m,n.appendChild(e.createElement("br")),n.appendChild(m)}for(var b=0,y=Object.keys(ee);b<y.length;b++){r[i=y[b]]=t[i].getBoundingClientRect().width}return r}))},audio:function(){var e=window,n=e.OfflineAudioContext||e.webkitOfflineAudioContext;if(!n)return-2;if(x()&&!F()&&!function(){var e=window;return g(["DOMRectList"in e,"RTCPeerConnectionIceEvent"in e,"SVGGeometryElement"in e,"ontransitioncancel"in e])>=3}())return-1;var t=new n(1,5e3,44100),r=t.createOscillator();r.type="triangle",r.frequency.value=1e4;var o=t.createDynamicsCompressor();o.threshold.value=-50,o.knee.value=40,o.ratio.value=12,o.attack.value=0,o.release.value=.25,r.connect(o),o.connect(t.destination),r.start(0);var i=function(e){var n=3,t=500,r=500,o=5e3,i=function(){};return[new Promise((function(c,l){var s=!1,d=0,m=0;e.oncomplete=function(e){return c(e.renderedBuffer)};var f=function(){setTimeout((function(){return l(R("timeout"))}),Math.min(r,m+o-Date.now()))},v=function(){try{var r=e.startRendering();switch(a(r)&&u(r),e.state){case"running":m=Date.now(),s&&f();break;case"suspended":document.hidden||d++,s&&d>=n?l(R("suspended")):setTimeout(v,t)}}catch(o){l(o)}};v(),i=function(){s||(s=!0,m>0&&f())}})),i]}(t),c=i[0],l=i[1],s=c.then((function(e){return function(e){for(var n=0,t=0;t<e.length;++t)n+=Math.abs(e[t]);return n}(e.getChannelData(0).subarray(4500))}),(function(e){if("timeout"===e.name||"suspended"===e.name)return-3;throw e}));return u(s),function(){return l(),s}},screenFrame:function(){var e=this,r=z();return function(){return n(e,void 0,void 0,(function(){var e,n;return t(this,(function(t){switch(t.label){case 0:return[4,r()];case 1:return e=t.sent(),[2,[(n=function(e){return null===e?null:w(e,10)})(e[0]),n(e[1]),n(e[2]),n(e[3])]]}}))}))}},osCpu:function(){return 
navigator.oscpu},languages:function(){var e,n=navigator,t=[],r=n.language||n.userLanguage||n.browserLanguage||n.systemLanguage;if(void 0!==r&&t.push([r]),Array.isArray(n.languages))S()&&g([!("MediaSettingsRange"in(e=window)),"RTCEncodedAudioFrame"in e,""+e.Intl=="[object Intl]",""+e.Reflect=="[object Reflect]"])>=3||t.push(n.languages);else if("string"==typeof n.languages){var o=n.languages;o&&t.push(o.split(","))}return t},colorDepth:function(){return window.screen.colorDepth},deviceMemory:function(){return y(b(navigator.deviceMemory),void 0)},screenResolution:function(){var e=screen,n=function(e){return y(p(e),null)},t=[n(e.width),n(e.height)];return t.sort().reverse(),t},hardwareConcurrency:function(){return y(p(navigator.hardwareConcurrency),void 0)},timezone:function(){var e,n=null===(e=window.Intl)||void 0===e?void 0:e.DateTimeFormat;if(n){var t=(new n).resolvedOptions().timeZone;if(t)return t}var r,o=(r=(new Date).getFullYear(),-Math.max(b(new Date(r,0,1).getTimezoneOffset()),b(new Date(r,6,1).getTimezoneOffset())));return"UTC".concat(o>=0?"+":"").concat(Math.abs(o))},sessionStorage:function(){try{return!!window.sessionStorage}catch(e){return!0}},localStorage:function(){try{return!!window.localStorage}catch(e){return!0}},indexedDB:function(){if(!W()&&!C())try{return!!window.indexedDB}catch(e){return!0}},openDatabase:function(){return!!window.openDatabase},cpuClass:function(){return navigator.cpuClass},platform:function(){var e=navigator.platform;return"MacIntel"===e&&x()&&!F()?function(){if("iPad"===navigator.platform)return!0;var e=screen,n=e.width/e.height;return g(["MediaSource"in window,!!Element.prototype.webkitRequestFullscreen,n>.65&&n<1.53])>=2}()?"iPad":"iPhone":e},plugins:function(){var e=navigator.plugins;if(e){for(var n=[],t=0;t<e.length;++t){var r=e[t];if(r){for(var o=[],a=0;a<r.length;++a){var i=r[a];o.push({type:i.type,suffixes:i.suffixes})}n.push({name:r.name,description:r.description,mimeTypes:o})}}return n}},canvas:function(){var 
e,n,t=!1,r=function(){var e=document.createElement("canvas");return e.width=1,e.height=1,[e,e.getContext("2d")]}(),o=r[0],a=r[1];if(function(e,n){return!(!n||!e.toDataURL)}(o,a)){t=function(e){return e.rect(0,0,10,10),e.rect(2,2,6,6),!e.isPointInPath(5,5,"evenodd")}(a),function(e,n){e.width=240,e.height=60,n.textBaseline="alphabetic",n.fillStyle="#f60",n.fillRect(100,1,62,20),n.fillStyle="#069",n.font='11pt "Times New Roman"';var t="Cwm fjordbank gly ".concat(String.fromCharCode(55357,56835));n.fillText(t,2,15),n.fillStyle="rgba(102, 204, 0, 0.2)",n.font="18pt Arial",n.fillText(t,4,45)}(o,a);var i=H(o);i!==H(o)?e=n="unstable":(n=i,function(e,n){e.width=122,e.height=110,n.globalCompositeOperation="multiply";for(var t=0,r=[["#f2f",40,40],["#2ff",80,40],["#ff2",60,80]];t<r.length;t++){var o=r[t],a=o[0],i=o[1],c=o[2];n.fillStyle=a,n.beginPath(),n.arc(i,c,40,0,2*Math.PI,!0),n.closePath(),n.fill()}n.fillStyle="#f9c",n.arc(60,60,60,0,2*Math.PI,!0),n.arc(60,60,20,0,2*Math.PI,!0),n.fill("evenodd")}(o,a),e=H(o))}else e=n="";return{winding:t,geometry:e,text:n}},touchSupport:function(){var e,n=navigator,t=0;void 0!==n.maxTouchPoints?t=p(n.maxTouchPoints):void 0!==n.msMaxTouchPoints&&(t=n.msMaxTouchPoints);try{document.createEvent("TouchEvent"),e=!0}catch(r){e=!1}return{maxTouchPoints:t,touchEvent:e,touchStart:"ontouchstart"in window}},vendor:function(){return navigator.vendor||""},vendorFlavors:function(){for(var e=[],n=0,t=["chrome","safari","__crWeb","__gCrWeb","yandex","__yb","__ybro","__firefox__","__edgeTrackingPreventionStatistics","webkit","oprt","samsungAr","ucweb","UCShellJava","puffinDevice"];n<t.length;n++){var r=t[n],o=window[r];o&&"object"==typeof o&&e.push(r)}return e.sort()},cookiesEnabled:function(){var e=document;try{e.cookie="cookietest=1; SameSite=Strict;";var n=-1!==e.cookie.indexOf("cookietest=");return e.cookie="cookietest=1; SameSite=Strict; expires=Thu, 01-Jan-1970 00:00:01 GMT",n}catch(t){return!1}},colorGamut:function(){for(var 
e=0,n=["rec2020","p3","srgb"];e<n.length;e++){var t=n[e];if(matchMedia("(color-gamut: ".concat(t,")")).matches)return t}},invertedColors:function(){return!!_("inverted")||!_("none")&&void 0},forcedColors:function(){return!!O("active")||!O("none")&&void 0},monochrome:function(){if(matchMedia("(min-monochrome: 0)").matches){for(var e=0;e<=100;++e)if(matchMedia("(max-monochrome: ".concat(e,")")).matches)return e;throw new Error("Too high value")}},contrast:function(){return U("no-preference")?0:U("high")||U("more")?1:U("low")||U("less")?-1:U("forced")?10:void 0},reducedMotion:function(){return!!Q("reduce")||!Q("no-preference")&&void 0},hdr:function(){return!!K("high")||!K("standard")&&void 0},math:function(){var e,n=q.acos||$,t=q.acosh||$,r=q.asin||$,o=q.asinh||$,a=q.atanh||$,i=q.atan||$,c=q.sin||$,u=q.sinh||$,l=q.cos||$,s=q.cosh||$,d=q.tan||$,m=q.tanh||$,f=q.exp||$,v=q.expm1||$,h=q.log1p||$;return{acos:n(.12312423423423424),acosh:t(1e308),acoshPf:(e=1e154,q.log(e+q.sqrt(e*e-1))),asin:r(.12312423423423424),asinh:o(1),asinhPf:function(e){return q.log(e+q.sqrt(e*e+1))}(1),atanh:a(.5),atanhPf:function(e){return q.log((1+e)/(1-e))/2}(.5),atan:i(.5),sin:c(-1e300),sinh:u(1),sinhPf:function(e){return q.exp(e)-1/q.exp(e)/2}(1),cos:l(10.000000000123),cosh:s(1),coshPf:function(e){return(q.exp(e)+1/q.exp(e))/2}(1),tan:d(-1e300),tanh:m(1),tanhPf:function(e){return(q.exp(2*e)-1)/(q.exp(2*e)+1)}(1),exp:f(1),expm1:v(1),expm1Pf:function(e){return q.exp(e)-1}(1),log1p:h(10),log1pPf:function(e){return q.log(1+e)}(10),powPI:function(e){return q.pow(q.PI,e)}(-100)}},videoCard:function(){var e,n=document.createElement("canvas"),t=null!==(e=n.getContext("webgl"))&&void 0!==e?e:n.getContext("experimental-webgl");if(t&&"getExtension"in t){var r=t.getExtension("WEBGL_debug_renderer_info");if(r)return{vendor:(t.getParameter(r.UNMASKED_VENDOR_WEBGL)||"").toString(),renderer:(t.getParameter(r.UNMASKED_RENDERER_WEBGL)||"").toString()}}},pdfViewerEnabled:function(){return 
navigator.pdfViewerEnabled},architecture:function(){var e=new Float32Array(1),n=new Uint8Array(e.buffer);return e[0]=1/0,e[0]=e[0]-e[0],n[3]}};function te(e){var n=function(e){if(G())return.4;if(x())return F()?.5:.3;var n=e.platform.value||"";if(/^Win/.test(n))return.6;if(/^Mac/.test(n))return.5;return.7}(e),t=function(e){return w(.99+.01*e,1e-4)}(n);return{score:n,comment:"$ if upgrade to Pro: https://fpjs.dev/pro".replace(/\$/g,"".concat(t))}}function re(n){return JSON.stringify(n,(function(n,t){return t instanceof Error?e({name:(r=t).name,message:r.message,stack:null===(o=r.stack)||void 0===o?void 0:o.split("\n")},r):t;var r,o}),2)}function oe(e){return h(function(e){for(var n="",t=0,r=Object.keys(e).sort();t<r.length;t++){var o=r[t],a=e[o],i=a.error?"error":JSON.stringify(a.value);n+="".concat(n?"|":"").concat(o.replace(/([:|\\])/g,"\\$1"),":").concat(i)}return n}(e))}function ae(e){return void 0===e&&(e=50),function(e,n){void 0===n&&(n=1/0);var t=window.requestIdleCallback;return t?new Promise((function(e){return t.call(window,(function(){return e()}),{timeout:n})})):o(Math.min(e,n))}(e,2*e)}function ie(e,r){var o=Date.now();return{get:function(a){return n(this,void 0,void 0,(function(){var n,i,c;return t(this,(function(t){switch(t.label){case 0:return n=Date.now(),[4,e()];case 1:return i=t.sent(),c=function(e){var n;return{get visitorId(){return void 0===n&&(n=oe(this.components)),n},set visitorId(e){n=e},confidence:te(e),components:e,version:"3.4.2"}}(i),(r||(null==a?void 0:a.debug))&&console.log("Copy the text below to get the debug data:\n\n```\nversion: ".concat(c.version,"\nuserAgent: ").concat(navigator.userAgent,"\ntimeBetweenLoadAndGet: ").concat(n-o,"\nvisitorId: ").concat(c.visitorId,"\ncomponents: ").concat(re(i),"\n```")),[2,c]}}))}))}}}function ce(e){var r=void 0===e?{}:e,o=r.delayFallback,a=r.debug;return r.monitoring,n(this,void 0,void 0,(function(){return t(this,(function(e){switch(e.label){case 0:return[4,ae(o)];case 1:return 
e.sent(),[2,ie(V(ne,{debug:a},[]),a)]}}))}))}var ue={load:ce,hashComponents:oe,componentsToDebugString:re},le=h;export{re as componentsToDebugString,ue as default,M as getFullscreenElement,z as getScreenFrame,oe as hashComponents,G as isAndroid,S as isChromium,F as isDesktopSafari,C as isEdgeHTML,Y as isGecko,W as isTrident,x as isWebKit,ce as load,V as loadSources,le as murmurX64Hash128,ae as prepareForSources,ne as sources,Z as transformSource,X as withIframe};
upgini/metrics.py CHANGED
@@ -3,15 +3,16 @@ import re
3
3
  from copy import deepcopy
4
4
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
5
 
6
+ import catboost
6
7
  import numpy as np
7
8
  import pandas as pd
8
9
  from catboost import CatBoostClassifier, CatBoostRegressor
9
- import catboost
10
10
  from lightgbm import LGBMClassifier, LGBMRegressor
11
11
  from numpy import log1p
12
12
  from pandas.api.types import is_numeric_dtype
13
13
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
14
14
 
15
+ from upgini.utils.features_validator import FeaturesValidator
15
16
  from upgini.utils.sklearn_ext import cross_validate
16
17
 
17
18
  try:
@@ -352,6 +353,7 @@ class EstimatorWrapper:
352
353
  "target_type": target_type,
353
354
  "groups": groups,
354
355
  "text_features": text_features,
356
+ "logger": logger,
355
357
  }
356
358
  if estimator is None:
357
359
  params = dict()
@@ -414,12 +416,22 @@ class CatBoostWrapper(EstimatorWrapper):
414
416
  target_type: ModelTaskType,
415
417
  groups: Optional[List[str]] = None,
416
418
  text_features: Optional[List[str]] = None,
419
+ logger: Optional[logging.Logger] = None,
417
420
  ):
418
421
  super(CatBoostWrapper, self).__init__(
419
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
422
+ estimator,
423
+ scorer,
424
+ metric_name,
425
+ multiplier,
426
+ cv,
427
+ target_type,
428
+ groups=groups,
429
+ text_features=text_features,
430
+ logger=logger,
420
431
  )
421
432
  self.cat_features = None
422
433
  self.emb_features = None
434
+ self.exclude_features = []
423
435
 
424
436
  def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
425
437
  X, y, groups, params = super()._prepare_to_fit(X, y)
@@ -437,9 +449,7 @@ class CatBoostWrapper(EstimatorWrapper):
437
449
  X, embedding_features = self.group_embeddings(X)
438
450
  params["embedding_features"] = embedding_features
439
451
  else:
440
- self.logger.info(
441
- f"Embedding features count less than 3, so use them separately: {self.emb_features}"
442
- )
452
+ self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
443
453
  self.emb_features = []
444
454
  else:
445
455
  self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
@@ -498,6 +508,8 @@ class CatBoostWrapper(EstimatorWrapper):
498
508
  return df, [emb_name]
499
509
 
500
510
  def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
511
+ if self.exclude_features:
512
+ X = X.drop(columns=self.exclude_features)
501
513
  X, y, params = super()._prepare_to_calculate(X, y)
502
514
  if self.text_features:
503
515
  params["text_features"] = self.text_features
@@ -510,6 +522,26 @@ class CatBoostWrapper(EstimatorWrapper):
510
522
 
511
523
  return X, y, params
512
524
 
525
+ def cross_val_predict(
526
+ self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
527
+ ) -> Optional[float]:
528
+ try:
529
+ return super().cross_val_predict(X, y, baseline_score_column)
530
+ except Exception as e:
531
+ if "Dictionary size is 0" in e.args[0] and self.text_features:
532
+ high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
533
+ self.logger.warning(
534
+ "Failed to calculate metrics. Try to remove high cardinality"
535
+ f" text features {high_cardinality_features} and retry"
536
+ )
537
+ for f in high_cardinality_features:
538
+ self.text_features.remove(f)
539
+ self.exclude_features.append(f)
540
+ X = X.drop(columns=f)
541
+ return super().cross_val_predict(X, y, baseline_score_column)
542
+ else:
543
+ raise e
544
+
513
545
 
514
546
  class LightGBMWrapper(EstimatorWrapper):
515
547
  def __init__(
@@ -522,9 +554,18 @@ class LightGBMWrapper(EstimatorWrapper):
522
554
  target_type: ModelTaskType,
523
555
  groups: Optional[List[str]] = None,
524
556
  text_features: Optional[List[str]] = None,
557
+ logger: Optional[logging.Logger] = None,
525
558
  ):
526
559
  super(LightGBMWrapper, self).__init__(
527
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
560
+ estimator,
561
+ scorer,
562
+ metric_name,
563
+ multiplier,
564
+ cv,
565
+ target_type,
566
+ groups=groups,
567
+ text_features=text_features,
568
+ logger=logger,
528
569
  )
529
570
  self.cat_features = None
530
571
 
@@ -561,9 +602,18 @@ class OtherEstimatorWrapper(EstimatorWrapper):
561
602
  target_type: ModelTaskType,
562
603
  groups: Optional[List[str]] = None,
563
604
  text_features: Optional[List[str]] = None,
605
+ logger: Optional[logging.Logger] = None,
564
606
  ):
565
607
  super(OtherEstimatorWrapper, self).__init__(
566
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
608
+ estimator,
609
+ scorer,
610
+ metric_name,
611
+ multiplier,
612
+ cv,
613
+ target_type,
614
+ groups=groups,
615
+ text_features=text_features,
616
+ logger=logger,
567
617
  )
568
618
  self.cat_features = None
569
619
 
@@ -595,6 +645,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
595
645
 
596
646
 
597
647
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
648
+ # TODO validate that if it is Callable then it accepts 3 arguments
598
649
  if isinstance(scoring, str) and scoring is not None:
599
650
  _get_scorer_by_name(scoring)
600
651
 
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
@@ -44,7 +44,7 @@ class PhoneNormalizer:
44
44
  Method will remove all non numeric chars from string and convert it to int.
45
45
  None will be set for phone numbers that couldn't be converted to int
46
46
  """
47
- if is_string_dtype(self.df[self.phone_column_name]):
47
+ if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
48
48
  convert_func = self.phone_str_to_int_safe
49
49
  elif is_float_dtype(self.df[self.phone_column_name]):
50
50
  convert_func = self.phone_float_to_int_safe
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
38
38
  loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
39
39
  multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
40
40
  group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
41
+ current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
41
42
 
42
43
  # Errors
43
44
  failed_search_by_task_id=Failed to retrieve the specified search results
@@ -111,6 +112,9 @@ x_is_empty=X is empty
111
112
  y_is_empty=y is empty
112
113
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
114
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
115
+ x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
116
+ train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
117
+ eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
114
118
  # eval set validation
115
119
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
116
120
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
145
149
  dataset_empty_column_names=Some column names are empty. Add names please
146
150
  dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
147
151
  dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
148
- dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
152
+ dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
153
+ dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
149
154
  dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
150
155
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
151
156
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
196
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
197
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
198
203
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
199
- target_type_detected=Detected task type: {}\n
204
+ target_type_detected=\nDetected task type: {}\n
200
205
  # all_ok_community_invite=Chat with us in Slack community:
201
206
  all_ok_community_invite=❓ Support request
202
- too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
207
+ too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
203
208
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
204
209
  loss_selection_info=Using loss `{}` for feature selection
205
210
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
@@ -1,7 +1,7 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- from typing import List, Optional
4
+ from typing import Dict, List, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
9
9
  from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
+ from upgini.metadata import SearchKey
12
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
+ from upgini.utils.warning_counter import WarningCounter
13
15
 
14
16
  DATE_FORMATS = [
15
17
  "%Y-%m-%d",
@@ -98,6 +100,9 @@ class DateTimeSearchKeyConverter:
98
100
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
99
101
  self.logger.warning(msg)
100
102
  raise ValidationError(msg)
103
+ else:
104
+ df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
105
+ df[self.date_column] = self.parse_date(df)
101
106
 
102
107
  # If column with date is datetime then extract seconds of the day and minute of the hour
103
108
  # as additional features
@@ -225,3 +230,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
225
230
 
226
231
  is_diff_less_than_two_columns = grouped.apply(check_differences)
227
232
  return is_diff_less_than_two_columns.all()
233
+
234
+
235
+ def validate_dates_distribution(
236
+ X: pd.DataFrame,
237
+ search_keys: Dict[str, SearchKey],
238
+ logger: Optional[logging.Logger] = None,
239
+ bundle: Optional[ResourceBundle] = None,
240
+ warning_counter: Optional[WarningCounter] = None,
241
+ ):
242
+ maybe_date_col = None
243
+ for key, key_type in search_keys.items():
244
+ if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
245
+ maybe_date_col = key
246
+
247
+ if maybe_date_col is None:
248
+ for col in X.columns:
249
+ if col in search_keys:
250
+ continue
251
+ try:
252
+ pd.to_datetime(X[col])
253
+ maybe_date_col = col
254
+ break
255
+ except Exception:
256
+ pass
257
+
258
+ if maybe_date_col is None:
259
+ return
260
+
261
+ dates = pd.to_datetime(X[maybe_date_col]).dt.date
262
+
263
+ date_counts = dates.value_counts().sort_index()
264
+
265
+ date_counts_1 = date_counts[: round(len(date_counts) / 2)]
266
+ date_counts_2 = date_counts[round(len(date_counts) / 2) :]
267
+ ratio = date_counts_2.mean() / date_counts_1.mean()
268
+
269
+ if ratio > 1.2 or ratio < 0.8:
270
+ if warning_counter is not None:
271
+ warning_counter.increment()
272
+ if logger is None:
273
+ logger = logging.getLogger("muted_logger")
274
+ logger.setLevel("FATAL")
275
+ bundle = bundle or get_custom_bundle()
276
+ msg = bundle.get("x_unstable_by_date")
277
+ print(msg)
278
+ logger.warning(msg)
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
6
+ from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
7
7
  from upgini.resource_bundle import ResourceBundle
8
8
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
9
  from upgini.utils.target_utils import define_task
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
78
78
  rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
79
79
  if len(rows_with_diff_target) > 0:
80
80
  unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
81
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
82
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
83
- perc = len(rows_to_remove) * 100 / len(df)
84
- msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
85
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
86
- )
87
- if not silent:
88
- print(msg)
89
- if logger:
90
- logger.warning(msg)
91
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
92
- df = df[~df.index.isin(rows_to_remove.index)]
93
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
94
-
81
+ if EVAL_SET_INDEX not in df.columns:
82
+ rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
83
+ rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
84
+ perc = len(rows_to_remove) * 100 / len(df)
85
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
86
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
87
+ )
88
+ if not silent:
89
+ print(msg)
90
+ if logger:
91
+ logger.warning(msg)
92
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
93
+ df = df[~df.index.isin(rows_to_remove.index)]
94
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
+ else:
96
+ # Indices in train and eval_set can be the same so we remove rows from them separately
97
+ train = df.query(f"{EVAL_SET_INDEX} == 0")
98
+ train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
99
+ train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
100
+ train_perc = len(train_rows_to_remove) * 100 / len(train)
101
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
102
+ train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
103
+ )
104
+ if not silent:
105
+ print(msg)
106
+ if logger:
107
+ logger.warning(msg)
108
+ logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
109
+ train = train[~train.index.isin(train_rows_to_remove.index)]
110
+ logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
111
+
112
+ evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
113
+ new_evals = []
114
+ for i, eval in enumerate(evals):
115
+ eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
116
+ eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
117
+ eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
118
+ msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
119
+ eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
120
+ )
121
+ if not silent:
122
+ print(msg)
123
+ if logger:
124
+ logger.warning(msg)
125
+ logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
126
+ eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
127
+ logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
128
+ new_evals.append(eval)
129
+
130
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
131
+ df = pd.concat([train] + new_evals)
132
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
133
  return df
96
134
 
97
135
 
@@ -101,14 +139,18 @@ def clean_full_duplicates(
101
139
  nrows = len(df)
102
140
  if nrows == 0:
103
141
  return df
104
- # Remove absolute duplicates (exclude system_record_id)
142
+ # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
105
143
  unique_columns = df.columns.tolist()
106
144
  if SYSTEM_RECORD_ID in unique_columns:
107
145
  unique_columns.remove(SYSTEM_RECORD_ID)
108
146
  if SORT_ID in unique_columns:
109
147
  unique_columns.remove(SORT_ID)
148
+ if EVAL_SET_INDEX in unique_columns:
149
+ unique_columns.remove(EVAL_SET_INDEX)
110
150
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
111
- df = df.drop_duplicates(subset=unique_columns)
151
+ # Train segment goes first so if duplicates are found in train and eval set
152
+ # then we keep unique rows in train segment
153
+ df = df.drop_duplicates(subset=unique_columns, keep="first")
112
154
  logger.info(f"Dataset shape after clean duplicates: {df.shape}")
113
155
  nrows_after_full_dedup = len(df)
114
156
  share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
@@ -123,7 +165,7 @@ def clean_full_duplicates(
123
165
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
124
166
  if marked_duplicates.sum() > 0:
125
167
  dups_indices = df[marked_duplicates].index.to_list()
126
- nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
168
+ nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
127
169
  num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
128
170
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
129
171
 
@@ -133,6 +175,7 @@ def clean_full_duplicates(
133
175
  print(msg)
134
176
  df = df.drop_duplicates(subset=unique_columns, keep=False)
135
177
  logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
178
+
136
179
  return df
137
180
 
138
181
 
@@ -1,5 +1,4 @@
1
1
  import functools
2
- import logging
3
2
  import numbers
4
3
  import time
5
4
  import warnings
@@ -313,7 +312,7 @@ def cross_validate(
313
312
 
314
313
  return ret
315
314
  except Exception:
316
- logging.exception("Failed to execute overriden cross_validate. Fallback to original")
315
+ # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
317
316
  raise
318
317
  # fit_params["use_best_model"] = False
319
318
  # return original_cross_validate(
@@ -132,9 +132,7 @@ def balance_undersample(
132
132
  class_value = classes[class_idx]
133
133
  class_count = vc[class_value]
134
134
  sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
135
- sampler = RandomUnderSampler(
136
- sampling_strategy=sample_strategy, random_state=random_state
137
- )
135
+ sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
138
136
  X = df[SYSTEM_RECORD_ID]
139
137
  X = X.to_frame(SYSTEM_RECORD_ID)
140
138
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
@@ -153,9 +151,7 @@ def balance_undersample(
153
151
  minority_class = df[df[target_column] == min_class_value]
154
152
  majority_class = df[df[target_column] != min_class_value]
155
153
  sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
156
- sampled_majority_class = majority_class.sample(
157
- n=sample_size, random_state=random_state
158
- )
154
+ sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
159
155
  resampled_data = df[
160
156
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
161
157
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
@@ -181,3 +177,21 @@ def balance_undersample(
181
177
 
182
178
  logger.info(f"Shape after rebalance resampling: {resampled_data}")
183
179
  return resampled_data
180
+
181
+
182
+ def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
183
+ df = pd.concat([expected, actual])
184
+
185
+ # Define the bins for the target variable
186
+ df_min = df.min()
187
+ df_max = df.max()
188
+ bins = [df_min, (df_min + df_max) / 2, df_max]
189
+
190
+ # Calculate the base distribution
191
+ train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
192
+
193
+ # Calculate the target distribution
194
+ test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
195
+
196
+ # Calculate the PSI
197
+ return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.262a3250.post4
3
+ Version: 1.1.274a4
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
28
28
  License-File: LICENSE
29
29
  Requires-Dist: python-dateutil >=2.8.0
30
30
  Requires-Dist: requests >=2.8.0
31
- Requires-Dist: pandas <2.0.0,>=1.1.0
31
+ Requires-Dist: pandas <2.1.0,>=1.1.0
32
32
  Requires-Dist: numpy >=1.19.0
33
33
  Requires-Dist: scikit-learn >=1.3.0
34
34
  Requires-Dist: pydantic <2.0.0,>=1.8.2
@@ -1,34 +1,35 @@
1
1
  upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
- upgini/dataset.py,sha256=4LfrUwxhd__ZVqZkjPVxbC4SW3YLsk1sMMqnYPUaVpw,45529
3
+ upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=WbwnLvPVqn4m995b6jSamWkXyRVy18fnG7faBeuJbWI,172132
5
+ upgini/features_enricher.py,sha256=WDj4DO5lqANBdihEcRmwox4w1kqWVOorlIKY4dbsqrU,175376
6
+ upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
6
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
7
8
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
8
- upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
9
+ upgini/metrics.py,sha256=U3VJKbKmuWACqI4jTcszXo0WqeXFtV8bWyY9VLBL-rw,29129
9
10
  upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
10
11
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
11
12
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
12
13
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
13
14
  upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0PAduvetU,2646
14
15
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- upgini/autofe/all_operands.py,sha256=KWAdcYv6cToc6NZPcCmz6P3N8Nwjp8UqojKuz-f2BZY,1589
16
+ upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
16
17
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
17
- upgini/autofe/date.py,sha256=AC7Gabc7x2n4-_EmO1Q-7ncfCI_5-kPMQ3r3vFgQ1g4,1788
18
+ upgini/autofe/date.py,sha256=_6RoEJZ5Kf-Q_aMOFucS6YSIZpCcelgpw-edV4qmRIM,3935
18
19
  upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
19
20
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
20
21
  upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
21
22
  upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
22
23
  upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
23
24
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- upgini/data_source/data_source_publisher.py,sha256=QASEDhJ9SxJKcWxoN2vUPxrM_HTlwKQOPa92L7EQneA,15962
25
+ upgini/data_source/data_source_publisher.py,sha256=taRzyGgrPrTTSGw4Y-Ca5k4bf30aiTa68rxqT9zfqeI,16478
25
26
  upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
26
27
  upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
27
28
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
29
+ upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
29
30
  upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
30
31
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
31
- upgini/resource_bundle/strings.properties,sha256=MGU_oBc15VAmbPZdThCpm3B4xERAKwbCIUTIG66dvUo,25228
32
+ upgini/resource_bundle/strings.properties,sha256=x-2fXtGc5Z2n7eUg9b6I4yhok56TTXDvzwU1JUaKcj4,26285
32
33
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
33
34
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
35
  upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -40,8 +41,8 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
40
41
  upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
41
42
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
42
43
  upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
43
- upgini/utils/datetime_utils.py,sha256=ol5Bgh98wU6KBY9z4QskNO0ja-L7HJL70HmTAjl7iRU,8836
44
- upgini/utils/deduplicate_utils.py,sha256=ckJrpU8Ruc_vcwIPTopbUjyJuNiseLHNAbQlLfhUCxo,5888
44
+ upgini/utils/datetime_utils.py,sha256=XciFOIYI4Zi7PqQS8dHxuPDEtdtwXbOrWsiAa04v2J4,10511
45
+ upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
45
46
  upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
46
47
  upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
47
48
  upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -51,12 +52,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
51
52
  upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
52
53
  upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
53
54
  upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
54
- upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
55
- upgini/utils/target_utils.py,sha256=WVhhxpQVvnhsDV7ctlds51VFg7hz59S_MFUSoRZFszw,7204
55
+ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
56
+ upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
56
57
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
57
58
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
58
- upgini-1.1.262a3250.post4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
59
- upgini-1.1.262a3250.post4.dist-info/METADATA,sha256=XfUGTmbya5IYq0uJYXwhUGxBy9DAnrQyWvNsyiZl6gM,48167
60
- upgini-1.1.262a3250.post4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
61
- upgini-1.1.262a3250.post4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
62
- upgini-1.1.262a3250.post4.dist-info/RECORD,,
59
+ upgini-1.1.274a4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
+ upgini-1.1.274a4.dist-info/METADATA,sha256=xng0cJvEGeFT2zSBqLDy-qf9I6ONKxdKtXsFWokPpPs,48158
61
+ upgini-1.1.274a4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
+ upgini-1.1.274a4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
+ upgini-1.1.274a4.dist-info/RECORD,,