upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.274a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/autofe/all_operands.py +12 -2
- upgini/autofe/date.py +67 -7
- upgini/data_source/data_source_publisher.py +14 -4
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +113 -39
- upgini/fingerprint.js +8 -0
- upgini/metrics.py +58 -7
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +8 -3
- upgini/utils/datetime_utils.py +52 -1
- upgini/utils/deduplicate_utils.py +61 -18
- upgini/utils/sklearn_ext.py +1 -2
- upgini/utils/target_utils.py +20 -6
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.274a4.dist-info}/METADATA +2 -2
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.274a4.dist-info}/RECORD +18 -17
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.274a4.dist-info}/LICENSE +0 -0
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.274a4.dist-info}/WHEEL +0 -0
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.274a4.dist-info}/top_level.txt +0 -0
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
|
-
from upgini.autofe.date import DateDiff,
|
|
2
|
+
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
|
|
3
3
|
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
4
4
|
from upgini.autofe.operand import Operand
|
|
5
5
|
from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
|
|
@@ -37,7 +37,17 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
37
37
|
Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
|
|
38
38
|
Sim(),
|
|
39
39
|
DateDiff(),
|
|
40
|
-
|
|
40
|
+
DateDiffType2(),
|
|
41
|
+
DateListDiff(aggregation="min"),
|
|
42
|
+
DateListDiff(aggregation="max"),
|
|
43
|
+
DateListDiff(aggregation="mean"),
|
|
44
|
+
DateListDiff(aggregation="nunique"),
|
|
45
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
|
|
46
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
|
|
47
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
|
|
48
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
49
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
50
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
41
51
|
]
|
|
42
52
|
}
|
|
43
53
|
|
upgini/autofe/date.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
from typing import Optional, Union
|
|
1
|
+
from typing import Any, Optional, Union
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
|
+
from pydantic import BaseModel
|
|
4
5
|
|
|
5
6
|
from upgini.autofe.operand import PandasOperand
|
|
6
7
|
|
|
7
8
|
|
|
8
|
-
class DateDiffMixin:
|
|
9
|
+
class DateDiffMixin(BaseModel):
|
|
9
10
|
diff_unit: str = "D"
|
|
10
11
|
left_unit: Optional[str] = None
|
|
11
12
|
right_unit: Optional[str] = None
|
|
@@ -34,18 +35,77 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
34
35
|
return x
|
|
35
36
|
|
|
36
37
|
|
|
37
|
-
class
|
|
38
|
-
name = "
|
|
38
|
+
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
39
|
+
name = "date_diff_type2"
|
|
39
40
|
is_binary = True
|
|
40
41
|
has_symmetry_importance = True
|
|
41
|
-
is_vectorizable = False
|
|
42
42
|
|
|
43
43
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
44
44
|
left = self._convert_to_date(left, self.left_unit)
|
|
45
45
|
right = self._convert_to_date(right, self.right_unit)
|
|
46
|
-
future =
|
|
46
|
+
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
|
+
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
|
+
)
|
|
49
|
+
future = pd.to_datetime(future)
|
|
47
50
|
before = future[future < left]
|
|
48
|
-
future[future < left] = pd.
|
|
51
|
+
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
49
52
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
50
53
|
|
|
51
54
|
return diff
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
61
|
+
is_binary = True
|
|
62
|
+
has_symmetry_importance = True
|
|
63
|
+
aggregation: str
|
|
64
|
+
|
|
65
|
+
def __init__(self, **data: Any) -> None:
|
|
66
|
+
if "name" not in data:
|
|
67
|
+
data["name"] = f"date_diff_{data.get('aggregation')}"
|
|
68
|
+
super().__init__(**data)
|
|
69
|
+
|
|
70
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
71
|
+
left = self._convert_to_date(left, self.left_unit)
|
|
72
|
+
right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
|
|
73
|
+
|
|
74
|
+
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
75
|
+
|
|
76
|
+
def _diff(self, x):
|
|
77
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
78
|
+
return x[x > 0]
|
|
79
|
+
|
|
80
|
+
def _agg(self, x):
|
|
81
|
+
method = getattr(np, self.aggregation, None)
|
|
82
|
+
default = np.nan
|
|
83
|
+
if method is None and self.aggregation in _ext_aggregations:
|
|
84
|
+
method, default = _ext_aggregations[self.aggregation]
|
|
85
|
+
elif not callable(method):
|
|
86
|
+
raise ValueError(f"Unsupported aggregation: {self.aggregation}")
|
|
87
|
+
|
|
88
|
+
return method(x) if len(x) > 0 else default
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class DateListDiffBounded(DateListDiff):
|
|
92
|
+
lower_bound: Optional[int]
|
|
93
|
+
upper_bound: Optional[int]
|
|
94
|
+
|
|
95
|
+
def __init__(self, **data: Any) -> None:
|
|
96
|
+
if "name" not in data:
|
|
97
|
+
lower_bound = data.get("lower_bound")
|
|
98
|
+
upper_bound = data.get("upper_bound")
|
|
99
|
+
components = [
|
|
100
|
+
"date_diff",
|
|
101
|
+
data.get("diff_unit"),
|
|
102
|
+
str(lower_bound if lower_bound is not None else "minusinf"),
|
|
103
|
+
str(upper_bound if upper_bound is not None else "plusinf"),
|
|
104
|
+
]
|
|
105
|
+
components.append(data.get("aggregation"))
|
|
106
|
+
data["name"] = "_".join(components)
|
|
107
|
+
super().__init__(**data)
|
|
108
|
+
|
|
109
|
+
def _agg(self, x):
|
|
110
|
+
x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
|
|
111
|
+
return super()._agg(x)
|
|
@@ -48,6 +48,7 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
+
exclude_from_autofe_generation: Optional[List[str]],
|
|
51
52
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
52
53
|
sort_column: Optional[str] = None,
|
|
53
54
|
date_format: Optional[str] = None,
|
|
@@ -57,7 +58,6 @@ class DataSourcePublisher:
|
|
|
57
58
|
join_date_abs_limit_days: Optional[int] = None,
|
|
58
59
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
59
60
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
-
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
|
@@ -72,8 +72,8 @@ class DataSourcePublisher:
|
|
|
72
72
|
)
|
|
73
73
|
if search_keys is None or len(search_keys) == 0:
|
|
74
74
|
raise ValidationError("Empty search keys")
|
|
75
|
-
if SearchKey.DATE in search_keys.values() and date_format is None:
|
|
76
|
-
|
|
75
|
+
# if SearchKey.DATE in search_keys.values() and date_format is None:
|
|
76
|
+
# raise ValidationError("date_format is required for DATE search key")
|
|
77
77
|
if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
|
|
78
78
|
raise ValidationError(
|
|
79
79
|
f"Invalid update frequency: {update_frequency}. "
|
|
@@ -85,11 +85,19 @@ class DataSourcePublisher:
|
|
|
85
85
|
or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
|
|
86
86
|
) and sort_column is None:
|
|
87
87
|
raise ValidationError("Sort column is required for passed search keys")
|
|
88
|
+
if (
|
|
89
|
+
set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
|
|
90
|
+
and snapshot_frequency_days is None
|
|
91
|
+
and join_date_abs_limit_days is None
|
|
92
|
+
):
|
|
93
|
+
raise ValidationError(
|
|
94
|
+
"With MSISDN and DATE keys one of the snapshot_frequency_days or"
|
|
95
|
+
" join_date_abs_limit_days parameters is required"
|
|
96
|
+
)
|
|
88
97
|
|
|
89
98
|
request = {
|
|
90
99
|
"dataTableUri": data_table_uri,
|
|
91
100
|
"searchKeys": {k: v.value.value for k, v in search_keys.items()},
|
|
92
|
-
"dateFormat": date_format,
|
|
93
101
|
"excludeColumns": exclude_columns,
|
|
94
102
|
"hashFeatureNames": str(hash_feature_names).lower(),
|
|
95
103
|
"snapshotFrequencyDays": snapshot_frequency_days,
|
|
@@ -98,6 +106,8 @@ class DataSourcePublisher:
|
|
|
98
106
|
"featuresForEmbeddings": features_for_embeddings,
|
|
99
107
|
"forceGeneration": str(_force_generation).lower(),
|
|
100
108
|
}
|
|
109
|
+
if date_format is not None:
|
|
110
|
+
request["dateFormat"] = date_format
|
|
101
111
|
if secondary_search_keys is not None:
|
|
102
112
|
request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
|
|
103
113
|
if sort_column is not None:
|
upgini/dataset.py
CHANGED
|
@@ -60,7 +60,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
60
60
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
61
61
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
62
62
|
MIN_SAMPLE_THRESHOLD = 5_000
|
|
63
|
-
IMBALANCE_THESHOLD = 0.
|
|
63
|
+
IMBALANCE_THESHOLD = 0.6
|
|
64
64
|
BINARY_BOOTSTRAP_LOOPS = 5
|
|
65
65
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
|
66
66
|
MIN_TARGET_CLASS_ROWS = 100
|
upgini/features_enricher.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
+
import datetime
|
|
2
3
|
import gc
|
|
3
4
|
import hashlib
|
|
4
5
|
import itertools
|
|
@@ -70,6 +71,7 @@ from upgini.utils.datetime_utils import (
|
|
|
70
71
|
DateTimeSearchKeyConverter,
|
|
71
72
|
is_blocked_time_series,
|
|
72
73
|
is_time_series,
|
|
74
|
+
validate_dates_distribution,
|
|
73
75
|
)
|
|
74
76
|
from upgini.utils.deduplicate_utils import (
|
|
75
77
|
clean_full_duplicates,
|
|
@@ -93,7 +95,7 @@ try:
|
|
|
93
95
|
except Exception:
|
|
94
96
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
95
97
|
|
|
96
|
-
from upgini.utils.target_utils import define_task
|
|
98
|
+
from upgini.utils.target_utils import calculate_psi, define_task
|
|
97
99
|
from upgini.utils.warning_counter import WarningCounter
|
|
98
100
|
from upgini.version_validator import validate_version
|
|
99
101
|
|
|
@@ -145,6 +147,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
145
147
|
"""
|
|
146
148
|
|
|
147
149
|
TARGET_NAME = "target"
|
|
150
|
+
CURRENT_DATE = "current_date"
|
|
148
151
|
RANDOM_STATE = 42
|
|
149
152
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
150
153
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -206,6 +209,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
206
209
|
client_ip: Optional[str] = None,
|
|
207
210
|
client_visitorid: Optional[str] = None,
|
|
208
211
|
custom_bundle_config: Optional[str] = None,
|
|
212
|
+
add_date_if_missing: bool = True,
|
|
209
213
|
**kwargs,
|
|
210
214
|
):
|
|
211
215
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -316,6 +320,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
316
320
|
self.raise_validation_error = raise_validation_error
|
|
317
321
|
self.exclude_columns = exclude_columns
|
|
318
322
|
self.baseline_score_column = baseline_score_column
|
|
323
|
+
self.add_date_if_missing = add_date_if_missing
|
|
319
324
|
|
|
320
325
|
def _get_api_key(self):
|
|
321
326
|
return self._api_key
|
|
@@ -423,7 +428,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
423
428
|
self.X = X
|
|
424
429
|
self.y = y
|
|
425
430
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
426
|
-
self.dump_input(trace_id, X, y, eval_set)
|
|
431
|
+
self.dump_input(trace_id, X, y, self.eval_set)
|
|
427
432
|
self.__inner_fit(
|
|
428
433
|
trace_id,
|
|
429
434
|
X,
|
|
@@ -562,7 +567,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
562
567
|
self.X = X
|
|
563
568
|
self.y = y
|
|
564
569
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
565
|
-
self.dump_input(trace_id, X, y, eval_set)
|
|
570
|
+
self.dump_input(trace_id, X, y, self.eval_set)
|
|
566
571
|
|
|
567
572
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
568
573
|
raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
|
|
@@ -822,12 +827,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
822
827
|
print(msg)
|
|
823
828
|
|
|
824
829
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
830
|
+
effective_X = X if X is not None else self.X
|
|
831
|
+
effective_y = y if y is not None else self.y
|
|
832
|
+
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
833
|
+
effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
|
|
825
834
|
|
|
826
835
|
try:
|
|
827
836
|
self.__log_debug_information(
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
837
|
+
effective_X,
|
|
838
|
+
effective_y,
|
|
839
|
+
effective_eval_set,
|
|
831
840
|
exclude_features_sources=exclude_features_sources,
|
|
832
841
|
cv=cv if cv is not None else self.cv,
|
|
833
842
|
importance_threshold=importance_threshold,
|
|
@@ -841,17 +850,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
841
850
|
self._search_task is None
|
|
842
851
|
or self._search_task.provider_metadata_v2 is None
|
|
843
852
|
or len(self._search_task.provider_metadata_v2) == 0
|
|
844
|
-
or
|
|
845
|
-
or
|
|
853
|
+
or effective_X is None
|
|
854
|
+
or effective_y is None
|
|
846
855
|
):
|
|
847
856
|
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
848
857
|
|
|
849
858
|
if X is not None and y is None:
|
|
850
859
|
raise ValidationError("X passed without y")
|
|
851
860
|
|
|
852
|
-
effective_X = X if X is not None else self.X
|
|
853
|
-
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
854
|
-
|
|
855
861
|
validate_scoring_argument(scoring)
|
|
856
862
|
|
|
857
863
|
self._validate_baseline_score(effective_X, effective_eval_set)
|
|
@@ -871,8 +877,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
871
877
|
):
|
|
872
878
|
cat_features = estimator.get_param("cat_features")
|
|
873
879
|
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
874
|
-
|
|
875
|
-
cat_features = [effectiveX.columns[i] for i in cat_features]
|
|
880
|
+
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
876
881
|
for cat_feature in cat_features:
|
|
877
882
|
if cat_feature in self.search_keys:
|
|
878
883
|
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
@@ -882,9 +887,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
882
887
|
|
|
883
888
|
prepared_data = self._prepare_data_for_metrics(
|
|
884
889
|
trace_id=trace_id,
|
|
885
|
-
X=
|
|
886
|
-
y=
|
|
887
|
-
eval_set=
|
|
890
|
+
X=effective_X,
|
|
891
|
+
y=effective_y,
|
|
892
|
+
eval_set=effective_eval_set,
|
|
888
893
|
exclude_features_sources=exclude_features_sources,
|
|
889
894
|
importance_threshold=importance_threshold,
|
|
890
895
|
max_features=max_features,
|
|
@@ -994,8 +999,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
994
999
|
enriched_metric = None
|
|
995
1000
|
uplift = None
|
|
996
1001
|
|
|
997
|
-
effective_X = X if X is not None else self.X
|
|
998
|
-
effective_y = y if y is not None else self.y
|
|
999
1002
|
train_metrics = {
|
|
1000
1003
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1001
1004
|
"quality_metrics_train_segment"
|
|
@@ -1256,6 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1256
1259
|
).get_cv_and_groups(X)
|
|
1257
1260
|
else:
|
|
1258
1261
|
from sklearn import __version__ as sklearn_version
|
|
1262
|
+
|
|
1259
1263
|
try:
|
|
1260
1264
|
from sklearn.model_selection._split import GroupsConsumerMixin
|
|
1261
1265
|
|
|
@@ -1684,6 +1688,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1684
1688
|
df = validated_X.copy()
|
|
1685
1689
|
|
|
1686
1690
|
df[TARGET] = validated_y
|
|
1691
|
+
|
|
1692
|
+
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1693
|
+
|
|
1687
1694
|
num_samples = _num_samples(df)
|
|
1688
1695
|
if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1689
1696
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
@@ -1801,10 +1808,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1801
1808
|
else:
|
|
1802
1809
|
features_section = ""
|
|
1803
1810
|
|
|
1804
|
-
|
|
1811
|
+
search_id = self._search_task.search_task_id
|
|
1812
|
+
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
1805
1813
|
-H 'Authorization: {self.api_key}' \\
|
|
1806
1814
|
-H 'Content-Type: application/json' \\
|
|
1807
|
-
-d '{{"
|
|
1815
|
+
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
1808
1816
|
return api_example
|
|
1809
1817
|
|
|
1810
1818
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -1899,6 +1907,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1899
1907
|
generated_features.extend(converter.generated_features)
|
|
1900
1908
|
else:
|
|
1901
1909
|
self.logger.info("Input dataset hasn't date column")
|
|
1910
|
+
if self.add_date_if_missing:
|
|
1911
|
+
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1902
1912
|
email_column = self._get_email_column(search_keys)
|
|
1903
1913
|
hem_column = self._get_hem_column(search_keys)
|
|
1904
1914
|
email_converted_to_hem = False
|
|
@@ -1918,6 +1928,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1918
1928
|
|
|
1919
1929
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1920
1930
|
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1931
|
+
|
|
1921
1932
|
if email_converted_to_hem:
|
|
1922
1933
|
non_keys_columns.append(email_column)
|
|
1923
1934
|
|
|
@@ -1939,6 +1950,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1939
1950
|
if add_fit_system_record_id:
|
|
1940
1951
|
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1941
1952
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1953
|
+
non_keys_columns.append(SORT_ID)
|
|
1942
1954
|
|
|
1943
1955
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1944
1956
|
|
|
@@ -2215,14 +2227,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2215
2227
|
self.fit_search_keys = self.search_keys.copy()
|
|
2216
2228
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2217
2229
|
|
|
2218
|
-
|
|
2230
|
+
validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2231
|
+
|
|
2232
|
+
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2233
|
+
has_date = maybe_date_column is not None
|
|
2219
2234
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2220
2235
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2221
2236
|
|
|
2222
|
-
df = self.__handle_index_search_keys(df, self.fit_search_keys)
|
|
2223
|
-
|
|
2224
|
-
df = self.__correct_target(df)
|
|
2225
|
-
|
|
2226
2237
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
2227
2238
|
self.loss, model_task_type, self.runtime_parameters, self.logger
|
|
2228
2239
|
)
|
|
@@ -2234,6 +2245,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2234
2245
|
eval_df[EVAL_SET_INDEX] = idx + 1
|
|
2235
2246
|
df = pd.concat([df, eval_df])
|
|
2236
2247
|
|
|
2248
|
+
df = self.__correct_target(df)
|
|
2249
|
+
|
|
2250
|
+
df = self.__handle_index_search_keys(df, self.fit_search_keys)
|
|
2251
|
+
|
|
2252
|
+
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2253
|
+
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2254
|
+
|
|
2237
2255
|
if DEFAULT_INDEX in df.columns:
|
|
2238
2256
|
msg = self.bundle.get("unsupported_index_column")
|
|
2239
2257
|
self.logger.info(msg)
|
|
@@ -2260,6 +2278,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2260
2278
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2261
2279
|
else:
|
|
2262
2280
|
self.logger.info("Input dataset hasn't date column")
|
|
2281
|
+
if self.add_date_if_missing:
|
|
2282
|
+
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2263
2283
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2264
2284
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2265
2285
|
email_converted_to_hem = False
|
|
@@ -2808,6 +2828,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2808
2828
|
|
|
2809
2829
|
maybe_date_col = self._get_date_column(self.search_keys)
|
|
2810
2830
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2831
|
+
# TODO cast date column to single dtype
|
|
2811
2832
|
min_date = X[maybe_date_col].min()
|
|
2812
2833
|
max_date = X[maybe_date_col].max()
|
|
2813
2834
|
self.logger.info(f"Dates interval is ({min_date}, {max_date})")
|
|
@@ -2839,6 +2860,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2839
2860
|
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2840
2861
|
return col
|
|
2841
2862
|
|
|
2863
|
+
@staticmethod
|
|
2864
|
+
def _add_current_date_as_key(
|
|
2865
|
+
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
2866
|
+
) -> pd.DataFrame:
|
|
2867
|
+
if (
|
|
2868
|
+
set(search_keys.values()) == {SearchKey.PHONE}
|
|
2869
|
+
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
2870
|
+
or set(search_keys.values()) == {SearchKey.HEM}
|
|
2871
|
+
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
2872
|
+
):
|
|
2873
|
+
msg = bundle.get("current_date_added")
|
|
2874
|
+
print(msg)
|
|
2875
|
+
logger.warning(msg)
|
|
2876
|
+
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2877
|
+
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2878
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
2879
|
+
df = converter.convert(df)
|
|
2880
|
+
return df
|
|
2881
|
+
|
|
2842
2882
|
@staticmethod
|
|
2843
2883
|
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2844
2884
|
return [
|
|
@@ -2877,26 +2917,33 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2877
2917
|
|
|
2878
2918
|
# order by date and idempotent order by other keys
|
|
2879
2919
|
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2920
|
+
sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
|
|
2880
2921
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2881
2922
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2923
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
2882
2924
|
else:
|
|
2883
2925
|
date_column = self._get_date_column(search_keys)
|
|
2884
2926
|
sort_columns = [date_column] if date_column is not None else []
|
|
2885
2927
|
|
|
2886
|
-
|
|
2928
|
+
other_columns = sorted(
|
|
2887
2929
|
[
|
|
2888
|
-
|
|
2889
|
-
for
|
|
2890
|
-
if
|
|
2891
|
-
and sk in df.columns
|
|
2892
|
-
and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
2930
|
+
c
|
|
2931
|
+
for c in df.columns
|
|
2932
|
+
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
2893
2933
|
]
|
|
2934
|
+
# [
|
|
2935
|
+
# sk
|
|
2936
|
+
# for sk, key_type in search_keys.items()
|
|
2937
|
+
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
2938
|
+
# and sk in df.columns
|
|
2939
|
+
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
2940
|
+
# ]
|
|
2894
2941
|
)
|
|
2895
2942
|
|
|
2896
2943
|
search_keys_hash = "search_keys_hash"
|
|
2897
|
-
if len(
|
|
2944
|
+
if len(other_columns) > 0:
|
|
2898
2945
|
sort_columns.append(search_keys_hash)
|
|
2899
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(df[
|
|
2946
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
|
|
2900
2947
|
|
|
2901
2948
|
df = df.sort_values(by=sort_columns)
|
|
2902
2949
|
|
|
@@ -3185,22 +3232,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3185
3232
|
return None
|
|
3186
3233
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3187
3234
|
|
|
3188
|
-
def
|
|
3235
|
+
def get_feature_by_name(name: str):
|
|
3189
3236
|
for m in features_meta:
|
|
3190
|
-
if m.name
|
|
3237
|
+
if m.name == name:
|
|
3191
3238
|
return m
|
|
3192
3239
|
|
|
3193
3240
|
descriptions = []
|
|
3194
3241
|
for m in autofe_meta:
|
|
3195
3242
|
autofe_feature = Feature.from_formula(m.formula)
|
|
3243
|
+
autofe_feature.set_display_index(m.display_index)
|
|
3196
3244
|
if autofe_feature.op.is_vector:
|
|
3197
3245
|
continue
|
|
3198
3246
|
|
|
3199
3247
|
description = dict()
|
|
3200
3248
|
|
|
3201
|
-
feature_meta =
|
|
3202
|
-
m.display_index, autofe_feature.op.alias or autofe_feature.op.name
|
|
3203
|
-
)
|
|
3249
|
+
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3204
3250
|
if feature_meta is None:
|
|
3205
3251
|
self.logger.warning(f"Feature meta for display index {m.display_index} not found")
|
|
3206
3252
|
continue
|
|
@@ -3547,6 +3593,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3547
3593
|
self.logger.warning(msg)
|
|
3548
3594
|
print(msg)
|
|
3549
3595
|
|
|
3596
|
+
def _validate_PSI(self, df: pd.DataFrame):
|
|
3597
|
+
if EVAL_SET_INDEX in df.columns:
|
|
3598
|
+
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
3599
|
+
eval1 = df.query(f"{EVAL_SET_INDEX} == 1")
|
|
3600
|
+
else:
|
|
3601
|
+
train = df
|
|
3602
|
+
eval1 = None
|
|
3603
|
+
|
|
3604
|
+
# 1. Check train PSI
|
|
3605
|
+
half_train = round(len(train) / 2)
|
|
3606
|
+
part1 = train[:half_train]
|
|
3607
|
+
part2 = train[half_train:]
|
|
3608
|
+
train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
|
|
3609
|
+
if train_psi > 0.2:
|
|
3610
|
+
self.warning_counter.increment()
|
|
3611
|
+
msg = self.bundle.get("train_unstable_target").format(train_psi)
|
|
3612
|
+
print(msg)
|
|
3613
|
+
self.logger.warning(msg)
|
|
3614
|
+
|
|
3615
|
+
# 2. Check train-test PSI
|
|
3616
|
+
if eval1 is not None:
|
|
3617
|
+
train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
|
|
3618
|
+
if train_test_psi > 0.2:
|
|
3619
|
+
self.warning_counter.increment()
|
|
3620
|
+
msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
|
|
3621
|
+
print(msg)
|
|
3622
|
+
self.logger.warning(msg)
|
|
3623
|
+
|
|
3550
3624
|
def _dump_python_libs(self):
|
|
3551
3625
|
try:
|
|
3552
3626
|
from pip._internal.operations.freeze import freeze
|
|
@@ -3613,7 +3687,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3613
3687
|
if y is not None:
|
|
3614
3688
|
with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
|
|
3615
3689
|
pickle.dump(sample(y, xy_sample_index), y_file)
|
|
3616
|
-
if eval_set
|
|
3690
|
+
if eval_set:
|
|
3617
3691
|
eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
|
|
3618
3692
|
with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
|
|
3619
3693
|
pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
|
upgini/fingerprint.js
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* FingerprintJS v3.4.2 - Copyright (c) FingerprintJS, Inc, 2023 (https://fingerprint.com)
|
|
3
|
+
* Licensed under the MIT (http://www.opensource.org/licenses/mit-license.php) license.
|
|
4
|
+
*
|
|
5
|
+
* This software contains code from open-source projects:
|
|
6
|
+
* MurmurHash3 by Karan Lyons (https://github.com/karanlyons/murmurHash3.js)
|
|
7
|
+
*/
|
|
8
|
+
var e=function(){return e=Object.assign||function(e){for(var n,t=1,r=arguments.length;t<r;t++)for(var o in n=arguments[t])Object.prototype.hasOwnProperty.call(n,o)&&(e[o]=n[o]);return e},e.apply(this,arguments)};function n(e,n,t,r){return new(t||(t=Promise))((function(o,a){function i(e){try{u(r.next(e))}catch(n){a(n)}}function c(e){try{u(r.throw(e))}catch(n){a(n)}}function u(e){var n;e.done?o(e.value):(n=e.value,n instanceof t?n:new t((function(e){e(n)}))).then(i,c)}u((r=r.apply(e,n||[])).next())}))}function t(e,n){var t,r,o,a,i={label:0,sent:function(){if(1&o[0])throw o[1];return o[1]},trys:[],ops:[]};return a={next:c(0),throw:c(1),return:c(2)},"function"==typeof Symbol&&(a[Symbol.iterator]=function(){return this}),a;function c(c){return function(u){return function(c){if(t)throw new TypeError("Generator is already executing.");for(;a&&(a=0,c[0]&&(i=0)),i;)try{if(t=1,r&&(o=2&c[0]?r.return:c[0]?r.throw||((o=r.return)&&o.call(r),0):r.next)&&!(o=o.call(r,c[1])).done)return o;switch(r=0,o&&(c=[2&c[0],o.value]),c[0]){case 0:case 1:o=c;break;case 4:return i.label++,{value:c[1],done:!1};case 5:i.label++,r=c[1],c=[0];continue;case 7:c=i.ops.pop(),i.trys.pop();continue;default:if(!(o=i.trys,(o=o.length>0&&o[o.length-1])||6!==c[0]&&2!==c[0])){i=0;continue}if(3===c[0]&&(!o||c[1]>o[0]&&c[1]<o[3])){i.label=c[1];break}if(6===c[0]&&i.label<o[1]){i.label=o[1],o=c;break}if(o&&i.label<o[2]){i.label=o[2],i.ops.push(c);break}o[2]&&i.ops.pop(),i.trys.pop();continue}c=n.call(e,i)}catch(u){c=[6,u],r=0}finally{t=o=0}if(5&c[0])throw c[1];return{value:c[0]?c[1]:void 0,done:!0}}([c,u])}}}function r(e,n,t){if(t||2===arguments.length)for(var r,o=0,a=n.length;o<a;o++)!r&&o in n||(r||(r=Array.prototype.slice.call(n,0,o)),r[o]=n[o]);return e.concat(r||Array.prototype.slice.call(n))}function o(e,n){return new Promise((function(t){return setTimeout(t,e,n)}))}function a(e){return!!e&&"function"==typeof e.then}function i(e,n){try{var t=e();a(t)?t.then((function(e){return n(!0,e)}),(function(e){return n(!1,e)})):n(!0,t)}catch(r){n(!1,r)}}function c(e,r,a){return void 0===a&&(a=16),n(this,void 0,void 0,(function(){var n,i,c,u;return t(this,(function(t){switch(t.label){case 0:n=Array(e.length),i=Date.now(),c=0,t.label=1;case 1:return c<e.length?(n[c]=r(e[c],c),(u=Date.now())>=i+a?(i=u,[4,o(0)]):[3,3]):[3,4];case 2:t.sent(),t.label=3;case 3:return++c,[3,1];case 4:return[2,n]}}))}))}function u(e){e.then(void 0,(function(){}))}function l(e,n){e=[e[0]>>>16,65535&e[0],e[1]>>>16,65535&e[1]],n=[n[0]>>>16,65535&n[0],n[1]>>>16,65535&n[1]];var t=[0,0,0,0];return t[3]+=e[3]+n[3],t[2]+=t[3]>>>16,t[3]&=65535,t[2]+=e[2]+n[2],t[1]+=t[2]>>>16,t[2]&=65535,t[1]+=e[1]+n[1],t[0]+=t[1]>>>16,t[1]&=65535,t[0]+=e[0]+n[0],t[0]&=65535,[t[0]<<16|t[1],t[2]<<16|t[3]]}function s(e,n){e=[e[0]>>>16,65535&e[0],e[1]>>>16,65535&e[1]],n=[n[0]>>>16,65535&n[0],n[1]>>>16,65535&n[1]];var t=[0,0,0,0];return t[3]+=e[3]*n[3],t[2]+=t[3]>>>16,t[3]&=65535,t[2]+=e[2]*n[3],t[1]+=t[2]>>>16,t[2]&=65535,t[2]+=e[3]*n[2],t[1]+=t[2]>>>16,t[2]&=65535,t[1]+=e[1]*n[3],t[0]+=t[1]>>>16,t[1]&=65535,t[1]+=e[2]*n[2],t[0]+=t[1]>>>16,t[1]&=65535,t[1]+=e[3]*n[1],t[0]+=t[1]>>>16,t[1]&=65535,t[0]+=e[0]*n[3]+e[1]*n[2]+e[2]*n[1]+e[3]*n[0],t[0]&=65535,[t[0]<<16|t[1],t[2]<<16|t[3]]}function d(e,n){return 32===(n%=64)?[e[1],e[0]]:n<32?[e[0]<<n|e[1]>>>32-n,e[1]<<n|e[0]>>>32-n]:(n-=32,[e[1]<<n|e[0]>>>32-n,e[0]<<n|e[1]>>>32-n])}function m(e,n){return 0===(n%=64)?e:n<32?[e[0]<<n|e[1]>>>32-n,e[1]<<n]:[e[1]<<n-32,0]}function f(e,n){return[e[0]^n[0],e[1]^n[1]]}function v(e){return e=f(e,[0,e[0]>>>1]),e=f(e=s(e,[4283543511,3981806797]),[0,e[0]>>>1]),e=f(e=s(e,[3301882366,444984403]),[0,e[0]>>>1])}function h(e,n){n=n||0;var t,r=(e=e||"").length%16,o=e.length-r,a=[0,n],i=[0,n],c=[0,0],u=[0,0],h=[2277735313,289559509],p=[1291169091,658871167];for(t=0;t<o;t+=16)c=[255&e.charCodeAt(t+4)|(255&e.charCodeAt(t+5))<<8|(255&e.charCodeAt(t+6))<<16|(255&e.charCodeAt(t+7))<<24,255&e.charCodeAt(t)|(255&e.charCodeAt(t+1))<<8|(255&e.charCodeAt(t+2))<<16|(255&e.charCodeAt(t+3))<<24],u=[255&e.charCodeAt(t+12)|(255&e.charCodeAt(t+13))<<8|(255&e.charCodeAt(t+14))<<16|(255&e.charCodeAt(t+15))<<24,255&e.charCodeAt(t+8)|(255&e.charCodeAt(t+9))<<8|(255&e.charCodeAt(t+10))<<16|(255&e.charCodeAt(t+11))<<24],c=d(c=s(c,h),31),a=l(a=d(a=f(a,c=s(c,p)),27),i),a=l(s(a,[0,5]),[0,1390208809]),u=d(u=s(u,p),33),i=l(i=d(i=f(i,u=s(u,h)),31),a),i=l(s(i,[0,5]),[0,944331445]);switch(c=[0,0],u=[0,0],r){case 15:u=f(u,m([0,e.charCodeAt(t+14)],48));case 14:u=f(u,m([0,e.charCodeAt(t+13)],40));case 13:u=f(u,m([0,e.charCodeAt(t+12)],32));case 12:u=f(u,m([0,e.charCodeAt(t+11)],24));case 11:u=f(u,m([0,e.charCodeAt(t+10)],16));case 10:u=f(u,m([0,e.charCodeAt(t+9)],8));case 9:u=s(u=f(u,[0,e.charCodeAt(t+8)]),p),i=f(i,u=s(u=d(u,33),h));case 8:c=f(c,m([0,e.charCodeAt(t+7)],56));case 7:c=f(c,m([0,e.charCodeAt(t+6)],48));case 6:c=f(c,m([0,e.charCodeAt(t+5)],40));case 5:c=f(c,m([0,e.charCodeAt(t+4)],32));case 4:c=f(c,m([0,e.charCodeAt(t+3)],24));case 3:c=f(c,m([0,e.charCodeAt(t+2)],16));case 2:c=f(c,m([0,e.charCodeAt(t+1)],8));case 1:c=s(c=f(c,[0,e.charCodeAt(t)]),h),a=f(a,c=s(c=d(c,31),p))}return a=l(a=f(a,[0,e.length]),i=f(i,[0,e.length])),i=l(i,a),a=l(a=v(a),i=v(i)),i=l(i,a),("00000000"+(a[0]>>>0).toString(16)).slice(-8)+("00000000"+(a[1]>>>0).toString(16)).slice(-8)+("00000000"+(i[0]>>>0).toString(16)).slice(-8)+("00000000"+(i[1]>>>0).toString(16)).slice(-8)}function p(e){return parseInt(e)}function b(e){return parseFloat(e)}function y(e,n){return"number"==typeof e&&isNaN(e)?n:e}function g(e){return e.reduce((function(e,n){return e+(n?1:0)}),0)}function w(e,n){if(void 0===n&&(n=1),Math.abs(n)>=1)return Math.round(e/n)*n;var t=1/n;return Math.round(e*t)/t}function L(e){return e&&"object"==typeof e&&"message"in e?e:{message:e}}function k(e){return"function"!=typeof e}function V(e,r,o){var a=Object.keys(e).filter((function(e){return!function(e,n){for(var t=0,r=e.length;t<r;++t)if(e[t]===n)return!0;return!1}(o,e)})),l=c(a,(function(n){return function(e,n){var t=new Promise((function(t){var r=Date.now();i(e.bind(null,n),(function(){for(var e=[],n=0;n<arguments.length;n++)e[n]=arguments[n];var o=Date.now()-r;if(!e[0])return t((function(){return{error:L(e[1]),duration:o}}));var a=e[1];if(k(a))return t((function(){return{value:a,duration:o}}));t((function(){return new Promise((function(e){var n=Date.now();i(a,(function(){for(var t=[],r=0;r<arguments.length;r++)t[r]=arguments[r];var a=o+Date.now()-n;if(!t[0])return e({error:L(t[1]),duration:a});e({value:t[1],duration:a})}))}))}))}))}));return u(t),function(){return t.then((function(e){return e()}))}}(e[n],r)}));return u(l),function(){return n(this,void 0,void 0,(function(){var e,n,r,o;return t(this,(function(t){switch(t.label){case 0:return[4,l];case 1:return[4,c(t.sent(),(function(e){var n=e();return u(n),n}))];case 2:return e=t.sent(),[4,Promise.all(e)];case 3:for(n=t.sent(),r={},o=0;o<a.length;++o)r[a[o]]=n[o];return[2,r]}}))}))}}function Z(e,n){var t=function(e){return k(e)?n(e):function(){var t=e();return a(t)?t.then(n):n(t)}};return function(n){var r=e(n);return a(r)?r.then(t):t(r)}}function W(){var e=window,n=navigator;return g(["MSCSSMatrix"in e,"msSetImmediate"in e,"msIndexedDB"in e,"msMaxTouchPoints"in n,"msPointerEnabled"in n])>=4}function C(){var e=window,n=navigator;return g(["msWriteProfilerMark"in e,"MSStream"in e,"msLaunchUri"in n,"msSaveBlob"in n])>=3&&!W()}function S(){var e=window,n=navigator;return g(["webkitPersistentStorage"in n,"webkitTemporaryStorage"in n,0===n.vendor.indexOf("Google"),"webkitResolveLocalFileSystemURL"in e,"BatteryManager"in e,"webkitMediaStream"in e,"webkitSpeechGrammar"in e])>=5}function x(){var e=window,n=navigator;return g(["ApplePayError"in e,"CSSPrimitiveValue"in e,"Counter"in e,0===n.vendor.indexOf("Apple"),"getStorageUpdates"in n,"WebKitMediaKeys"in e])>=4}function F(){var e=window;return g(["safari"in e,!("DeviceMotionEvent"in e),!("ongestureend"in e),!("standalone"in navigator)])>=3}function Y(){var e,n,t=window;return g(["buildID"in navigator,"MozAppearance"in(null!==(n=null===(e=document.documentElement)||void 0===e?void 0:e.style)&&void 0!==n?n:{}),"onmozfullscreenchange"in t,"mozInnerScreenX"in t,"CSSMozDocumentRule"in t,"CanvasCaptureMediaStream"in t])>=4}function M(){var e=document;return e.fullscreenElement||e.msFullscreenElement||e.mozFullScreenElement||e.webkitFullscreenElement||null}function G(){var e=S(),n=Y();if(!e&&!n)return!1;var t=window;return g(["onorientationchange"in t,"orientation"in t,e&&!("SharedWorker"in t),n&&/android/i.test(navigator.appVersion)])>=2}function R(e){var n=new Error(e);return n.name=e,n}function X(e,r,a){var i,c,u;return void 0===a&&(a=50),n(this,void 0,void 0,(function(){var n,l;return t(this,(function(t){switch(t.label){case 0:n=document,t.label=1;case 1:return n.body?[3,3]:[4,o(a)];case 2:return t.sent(),[3,1];case 3:l=n.createElement("iframe"),t.label=4;case 4:return t.trys.push([4,,10,11]),[4,new Promise((function(e,t){var o=!1,a=function(){o=!0,e()};l.onload=a,l.onerror=function(e){o=!0,t(e)};var i=l.style;i.setProperty("display","block","important"),i.position="absolute",i.top="0",i.left="0",i.visibility="hidden",r&&"srcdoc"in l?l.srcdoc=r:l.src="about:blank",n.body.appendChild(l);var c=function(){var e,n;o||("complete"===(null===(n=null===(e=l.contentWindow)||void 0===e?void 0:e.document)||void 0===n?void 0:n.readyState)?a():setTimeout(c,10))};c()}))];case 5:t.sent(),t.label=6;case 6:return(null===(c=null===(i=l.contentWindow)||void 0===i?void 0:i.document)||void 0===c?void 0:c.body)?[3,8]:[4,o(a)];case 7:return t.sent(),[3,6];case 8:return[4,e(l,l.contentWindow)];case 9:return[2,t.sent()];case 10:return null===(u=l.parentNode)||void 0===u||u.removeChild(l),[7];case 11:return[2]}}))}))}function A(e){for(var n=function(e){for(var n,t,r="Unexpected syntax '".concat(e,"'"),o=/^\s*([a-z-]*)(.*)$/i.exec(e),a=o[1]||void 0,i={},c=/([.:#][\w-]+|\[.+?\])/gi,u=function(e,n){i[e]=i[e]||[],i[e].push(n)};;){var l=c.exec(o[2]);if(!l)break;var s=l[0];switch(s[0]){case".":u("class",s.slice(1));break;case"#":u("id",s.slice(1));break;case"[":var d=/^\[([\w-]+)([~|^$*]?=("(.*?)"|([\w-]+)))?(\s+[is])?\]$/.exec(s);if(!d)throw new Error(r);u(d[1],null!==(t=null!==(n=d[4])&&void 0!==n?n:d[5])&&void 0!==t?t:"");break;default:throw new Error(r)}}return[a,i]}(e),t=n[0],r=n[1],o=document.createElement(null!=t?t:"div"),a=0,i=Object.keys(r);a<i.length;a++){var c=i[a],u=r[c].join(" ");"style"===c?j(o.style,u):o.setAttribute(c,u)}return o}function j(e,n){for(var t=0,r=n.split(";");t<r.length;t++){var o=r[t],a=/^\s*([\w-]+)\s*:\s*(.+?)(\s*!([\w-]+))?\s*$/.exec(o);if(a){var i=a[1],c=a[2],u=a[4];e.setProperty(i,c,u||"")}}}var I=["monospace","sans-serif","serif"],J=["sans-serif-thin","ARNO PRO","Agency FB","Arabic Typesetting","Arial Unicode MS","AvantGarde Bk BT","BankGothic Md BT","Batang","Bitstream Vera Sans Mono","Calibri","Century","Century Gothic","Clarendon","EUROSTILE","Franklin Gothic","Futura Bk BT","Futura Md BT","GOTHAM","Gill Sans","HELV","Haettenschweiler","Helvetica Neue","Humanst521 BT","Leelawadee","Letter Gothic","Levenim MT","Lucida Bright","Lucida Sans","Menlo","MS Mincho","MS Outlook","MS Reference Specialty","MS UI Gothic","MT Extra","MYRIAD PRO","Marlett","Meiryo UI","Microsoft Uighur","Minion Pro","Monotype Corsiva","PMingLiU","Pristina","SCRIPTINA","Segoe UI Light","Serifa","SimHei","Small Fonts","Staccato222 BT","TRAJAN PRO","Univers CE 55 Medium","Vrinda","ZWAdobeF"];function H(e){return e.toDataURL()}var P,N;function z(){var e=this;return function(){if(void 0===N){var e=function(){var n=D();E(n)?N=setTimeout(e,2500):(P=n,N=void 0)};e()}}(),function(){return n(e,void 0,void 0,(function(){var e;return t(this,(function(n){switch(n.label){case 0:return E(e=D())?P?[2,r([],P,!0)]:M()?[4,(t=document,(t.exitFullscreen||t.msExitFullscreen||t.mozCancelFullScreen||t.webkitExitFullscreen).call(t))]:[3,2]:[3,2];case 1:n.sent(),e=D(),n.label=2;case 2:return E(e)||(P=e),[2,e]}var t}))}))}}function D(){var e=screen;return[y(b(e.availTop),null),y(b(e.width)-b(e.availWidth)-y(b(e.availLeft),0),null),y(b(e.height)-b(e.availHeight)-y(b(e.availTop),0),null),y(b(e.availLeft),null)]}function E(e){for(var n=0;n<4;++n)if(e[n])return!1;return!0}function T(e){var r;return n(this,void 0,void 0,(function(){var n,a,i,c,u,l,s;return t(this,(function(t){switch(t.label){case 0:for(n=document,a=n.createElement("div"),i=new Array(e.length),c={},B(a),s=0;s<e.length;++s)"DIALOG"===(u=A(e[s])).tagName&&u.show(),B(l=n.createElement("div")),l.appendChild(u),a.appendChild(l),i[s]=u;t.label=1;case 1:return n.body?[3,3]:[4,o(50)];case 2:return t.sent(),[3,1];case 3:n.body.appendChild(a);try{for(s=0;s<e.length;++s)i[s].offsetParent||(c[e[s]]=!0)}finally{null===(r=a.parentNode)||void 0===r||r.removeChild(a)}return[2,c]}}))}))}function B(e){e.style.setProperty("display","block","important")}function _(e){return matchMedia("(inverted-colors: ".concat(e,")")).matches}function O(e){return matchMedia("(forced-colors: ".concat(e,")")).matches}function U(e){return matchMedia("(prefers-contrast: ".concat(e,")")).matches}function Q(e){return matchMedia("(prefers-reduced-motion: ".concat(e,")")).matches}function K(e){return matchMedia("(dynamic-range: ".concat(e,")")).matches}var q=Math,$=function(){return 0};var ee={default:[],apple:[{font:"-apple-system-body"}],serif:[{fontFamily:"serif"}],sans:[{fontFamily:"sans-serif"}],mono:[{fontFamily:"monospace"}],min:[{fontSize:"1px"}],system:[{fontFamily:"system-ui"}]};var ne={fonts:function(){return X((function(e,n){var t=n.document,r=t.body;r.style.fontSize="48px";var o=t.createElement("div"),a={},i={},c=function(e){var n=t.createElement("span"),r=n.style;return r.position="absolute",r.top="0",r.left="0",r.fontFamily=e,n.textContent="mmMwWLliI0O&1",o.appendChild(n),n},u=I.map(c),l=function(){for(var e={},n=function(n){e[n]=I.map((function(e){return function(e,n){return c("'".concat(e,"',").concat(n))}(n,e)}))},t=0,r=J;t<r.length;t++){n(r[t])}return e}();r.appendChild(o);for(var s=0;s<I.length;s++)a[I[s]]=u[s].offsetWidth,i[I[s]]=u[s].offsetHeight;return J.filter((function(e){return n=l[e],I.some((function(e,t){return n[t].offsetWidth!==a[e]||n[t].offsetHeight!==i[e]}));var n}))}))},domBlockers:function(e){var r=(void 0===e?{}:e).debug;return n(this,void 0,void 0,(function(){var e,n,o,a,i;return t(this,(function(t){switch(t.label){case 0:return x()||G()?(c=atob,e={abpIndo:["#Iklan-Melayang","#Kolom-Iklan-728","#SidebarIklan-wrapper",'[title="ALIENBOLA" i]',c("I0JveC1CYW5uZXItYWRz")],abpvn:[".quangcao","#mobileCatfish",c("LmNsb3NlLWFkcw=="),'[id^="bn_bottom_fixed_"]',"#pmadv"],adBlockFinland:[".mainostila",c("LnNwb25zb3JpdA=="),".ylamainos",c("YVtocmVmKj0iL2NsaWNrdGhyZ2guYXNwPyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9hcHAucmVhZHBlYWsuY29tL2FkcyJd")],adBlockPersian:["#navbar_notice_50",".kadr",'TABLE[width="140px"]',"#divAgahi",c("YVtocmVmXj0iaHR0cDovL2cxLnYuZndtcm0ubmV0L2FkLyJd")],adBlockWarningRemoval:["#adblock-honeypot",".adblocker-root",".wp_adblock_detect",c("LmhlYWRlci1ibG9ja2VkLWFk"),c("I2FkX2Jsb2NrZXI=")],adGuardAnnoyances:[".hs-sosyal","#cookieconsentdiv",'div[class^="app_gdpr"]',".as-oil",'[data-cypress="soft-push-notification-modal"]'],adGuardBase:[".BetterJsPopOverlay",c("I2FkXzMwMFgyNTA="),c("I2Jhbm5lcmZsb2F0MjI="),c("I2NhbXBhaWduLWJhbm5lcg=="),c("I0FkLUNvbnRlbnQ=")],adGuardChinese:[c("LlppX2FkX2FfSA=="),c("YVtocmVmKj0iLmh0aGJldDM0LmNvbSJd"),"#widget-quan",c("YVtocmVmKj0iLzg0OTkyMDIwLnh5eiJd"),c("YVtocmVmKj0iLjE5NTZobC5jb20vIl0=")],adGuardFrench:["#pavePub",c("LmFkLWRlc2t0b3AtcmVjdGFuZ2xl"),".mobile_adhesion",".widgetadv",c("LmFkc19iYW4=")],adGuardGerman:['aside[data-portal-id="leaderboard"]'],adGuardJapanese:["#kauli_yad_1",c("YVtocmVmXj0iaHR0cDovL2FkMi50cmFmZmljZ2F0ZS5uZXQvIl0="),c("Ll9wb3BJbl9pbmZpbml0ZV9hZA=="),c("LmFkZ29vZ2xl"),c("Ll9faXNib29zdFJldHVybkFk")],adGuardMobile:[c("YW1wLWF1dG8tYWRz"),c("LmFtcF9hZA=="),'amp-embed[type="24smi"]',"#mgid_iframe1",c("I2FkX2ludmlld19hcmVh")],adGuardRussian:[c("YVtocmVmXj0iaHR0cHM6Ly9hZC5sZXRtZWFkcy5jb20vIl0="),c("LnJlY2xhbWE="),'div[id^="smi2adblock"]',c("ZGl2W2lkXj0iQWRGb3hfYmFubmVyXyJd"),"#psyduckpockeball"],adGuardSocial:[c("YVtocmVmXj0iLy93d3cuc3R1bWJsZXVwb24uY29tL3N1Ym1pdD91cmw9Il0="),c("YVtocmVmXj0iLy90ZWxlZ3JhbS5tZS9zaGFyZS91cmw/Il0="),".etsy-tweet","#inlineShare",".popup-social"],adGuardSpanishPortuguese:["#barraPublicidade","#Publicidade","#publiEspecial","#queTooltip",".cnt-publi"],adGuardTrackingProtection:["#qoo-counter",c("YVtocmVmXj0iaHR0cDovL2NsaWNrLmhvdGxvZy5ydS8iXQ=="),c("YVtocmVmXj0iaHR0cDovL2hpdGNvdW50ZXIucnUvdG9wL3N0YXQucGhwIl0="),c("YVtocmVmXj0iaHR0cDovL3RvcC5tYWlsLnJ1L2p1bXAiXQ=="),"#top100counter"],adGuardTurkish:["#backkapat",c("I3Jla2xhbWk="),c("YVtocmVmXj0iaHR0cDovL2Fkc2Vydi5vbnRlay5jb20udHIvIl0="),c("YVtocmVmXj0iaHR0cDovL2l6bGVuemkuY29tL2NhbXBhaWduLyJd"),c("YVtocmVmXj0iaHR0cDovL3d3dy5pbnN0YWxsYWRzLm5ldC8iXQ==")],bulgarian:[c("dGQjZnJlZW5ldF90YWJsZV9hZHM="),"#ea_intext_div",".lapni-pop-over","#xenium_hot_offers"],easyList:[".yb-floorad",c("LndpZGdldF9wb19hZHNfd2lkZ2V0"),c("LnRyYWZmaWNqdW5reS1hZA=="),".textad_headline",c("LnNwb25zb3JlZC10ZXh0LWxpbmtz")],easyListChina:[c("LmFwcGd1aWRlLXdyYXBbb25jbGljayo9ImJjZWJvcy5jb20iXQ=="),c("LmZyb250cGFnZUFkdk0="),"#taotaole","#aafoot.top_box",".cfa_popup"],easyListCookie:[".ezmob-footer",".cc-CookieWarning","[data-cookie-number]",c("LmF3LWNvb2tpZS1iYW5uZXI="),".sygnal24-gdpr-modal-wrap"],easyListCzechSlovak:["#onlajny-stickers",c("I3Jla2xhbW5pLWJveA=="),c("LnJla2xhbWEtbWVnYWJvYXJk"),".sklik",c("W2lkXj0ic2tsaWtSZWtsYW1hIl0=")],easyListDutch:[c("I2FkdmVydGVudGll"),c("I3ZpcEFkbWFya3RCYW5uZXJCbG9jaw=="),".adstekst",c("YVtocmVmXj0iaHR0cHM6Ly94bHR1YmUubmwvY2xpY2svIl0="),"#semilo-lrectangle"],easyListGermany:["#SSpotIMPopSlider",c("LnNwb25zb3JsaW5rZ3J1ZW4="),c("I3dlcmJ1bmdza3k="),c("I3Jla2xhbWUtcmVjaHRzLW1pdHRl"),c("YVtocmVmXj0iaHR0cHM6Ly9iZDc0Mi5jb20vIl0=")],easyListItaly:[c("LmJveF9hZHZfYW5udW5jaQ=="),".sb-box-pubbliredazionale",c("YVtocmVmXj0iaHR0cDovL2FmZmlsaWF6aW9uaWFkcy5zbmFpLml0LyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9hZHNlcnZlci5odG1sLml0LyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9hZmZpbGlhemlvbmlhZHMuc25haS5pdC8iXQ==")],easyListLithuania:[c("LnJla2xhbW9zX3RhcnBhcw=="),c("LnJla2xhbW9zX251b3JvZG9z"),c("aW1nW2FsdD0iUmVrbGFtaW5pcyBza3lkZWxpcyJd"),c("aW1nW2FsdD0iRGVkaWt1b3RpLmx0IHNlcnZlcmlhaSJd"),c("aW1nW2FsdD0iSG9zdGluZ2FzIFNlcnZlcmlhaS5sdCJd")],estonian:[c("QVtocmVmKj0iaHR0cDovL3BheTRyZXN1bHRzMjQuZXUiXQ==")],fanboyAnnoyances:["#ac-lre-player",".navigate-to-top","#subscribe_popup",".newsletter_holder","#back-top"],fanboyAntiFacebook:[".util-bar-module-firefly-visible"],fanboyEnhancedTrackers:[".open.pushModal","#issuem-leaky-paywall-articles-zero-remaining-nag","#sovrn_container",'div[class$="-hide"][zoompage-fontsize][style="display: block;"]',".BlockNag__Card"],fanboySocial:["#FollowUs","#meteored_share","#social_follow",".article-sharer",".community__social-desc"],frellwitSwedish:[c("YVtocmVmKj0iY2FzaW5vcHJvLnNlIl1bdGFyZ2V0PSJfYmxhbmsiXQ=="),c("YVtocmVmKj0iZG9rdG9yLXNlLm9uZWxpbmsubWUiXQ=="),"article.category-samarbete",c("ZGl2LmhvbGlkQWRz"),"ul.adsmodern"],greekAdBlock:[c("QVtocmVmKj0iYWRtYW4ub3RlbmV0LmdyL2NsaWNrPyJd"),c("QVtocmVmKj0iaHR0cDovL2F4aWFiYW5uZXJzLmV4b2R1cy5nci8iXQ=="),c("QVtocmVmKj0iaHR0cDovL2ludGVyYWN0aXZlLmZvcnRobmV0LmdyL2NsaWNrPyJd"),"DIV.agores300","TABLE.advright"],hungarian:["#cemp_doboz",".optimonk-iframe-container",c("LmFkX19tYWlu"),c("W2NsYXNzKj0iR29vZ2xlQWRzIl0="),"#hirdetesek_box"],iDontCareAboutCookies:['.alert-info[data-block-track*="CookieNotice"]',".ModuleTemplateCookieIndicator",".o--cookies--container","#cookies-policy-sticky","#stickyCookieBar"],icelandicAbp:[c("QVtocmVmXj0iL2ZyYW1ld29yay9yZXNvdXJjZXMvZm9ybXMvYWRzLmFzcHgiXQ==")],latvian:[c("YVtocmVmPSJodHRwOi8vd3d3LnNhbGlkemluaS5sdi8iXVtzdHlsZT0iZGlzcGxheTogYmxvY2s7IHdpZHRoOiAxMjBweDsgaGVpZ2h0OiA0MHB4OyBvdmVyZmxvdzogaGlkZGVuOyBwb3NpdGlvbjogcmVsYXRpdmU7Il0="),c("YVtocmVmPSJodHRwOi8vd3d3LnNhbGlkemluaS5sdi8iXVtzdHlsZT0iZGlzcGxheTogYmxvY2s7IHdpZHRoOiA4OHB4OyBoZWlnaHQ6IDMxcHg7IG92ZXJmbG93OiBoaWRkZW47IHBvc2l0aW9uOiByZWxhdGl2ZTsiXQ==")],listKr:[c("YVtocmVmKj0iLy9hZC5wbGFuYnBsdXMuY28ua3IvIl0="),c("I2xpdmVyZUFkV3JhcHBlcg=="),c("YVtocmVmKj0iLy9hZHYuaW1hZHJlcC5jby5rci8iXQ=="),c("aW5zLmZhc3R2aWV3LWFk"),".revenue_unit_item.dable"],listeAr:[c("LmdlbWluaUxCMUFk"),".right-and-left-sponsers",c("YVtocmVmKj0iLmFmbGFtLmluZm8iXQ=="),c("YVtocmVmKj0iYm9vcmFxLm9yZyJd"),c("YVtocmVmKj0iZHViaXp6bGUuY29tL2FyLz91dG1fc291cmNlPSJd")],listeFr:[c("YVtocmVmXj0iaHR0cDovL3Byb21vLnZhZG9yLmNvbS8iXQ=="),c("I2FkY29udGFpbmVyX3JlY2hlcmNoZQ=="),c("YVtocmVmKj0id2Vib3JhbWEuZnIvZmNnaS1iaW4vIl0="),".site-pub-interstitiel",'div[id^="crt-"][data-criteo-id]'],officialPolish:["#ceneo-placeholder-ceneo-12",c("W2hyZWZePSJodHRwczovL2FmZi5zZW5kaHViLnBsLyJd"),c("YVtocmVmXj0iaHR0cDovL2Fkdm1hbmFnZXIudGVjaGZ1bi5wbC9yZWRpcmVjdC8iXQ=="),c("YVtocmVmXj0iaHR0cDovL3d3dy50cml6ZXIucGwvP3V0bV9zb3VyY2UiXQ=="),c("ZGl2I3NrYXBpZWNfYWQ=")],ro:[c("YVtocmVmXj0iLy9hZmZ0cmsuYWx0ZXgucm8vQ291bnRlci9DbGljayJd"),c("YVtocmVmXj0iaHR0cHM6Ly9ibGFja2ZyaWRheXNhbGVzLnJvL3Ryay9zaG9wLyJd"),c("YVtocmVmXj0iaHR0cHM6Ly9ldmVudC4ycGVyZm9ybWFudC5jb20vZXZlbnRzL2NsaWNrIl0="),c("YVtocmVmXj0iaHR0cHM6Ly9sLnByb2ZpdHNoYXJlLnJvLyJd"),'a[href^="/url/"]'],ruAd:[c("YVtocmVmKj0iLy9mZWJyYXJlLnJ1LyJd"),c("YVtocmVmKj0iLy91dGltZy5ydS8iXQ=="),c("YVtocmVmKj0iOi8vY2hpa2lkaWtpLnJ1Il0="),"#pgeldiz",".yandex-rtb-block"],thaiAds:["a[href*=macau-uta-popup]",c("I2Fkcy1nb29nbGUtbWlkZGxlX3JlY3RhbmdsZS1ncm91cA=="),c("LmFkczMwMHM="),".bumq",".img-kosana"],webAnnoyancesUltralist:["#mod-social-share-2","#social-tools",c("LmN0cGwtZnVsbGJhbm5lcg=="),".zergnet-recommend",".yt.btn-link.btn-md.btn"]},n=Object.keys(e),[4,T((i=[]).concat.apply(i,n.map((function(n){return e[n]}))))]):[2,void 0];case 1:return o=t.sent(),r&&function(e,n){for(var t="DOM blockers debug:\n```",r=0,o=Object.keys(e);r<o.length;r++){var a=o[r];t+="\n".concat(a,":");for(var i=0,c=e[a];i<c.length;i++){var u=c[i];t+="\n ".concat(n[u]?"🚫":"➡️"," ").concat(u)}}console.log("".concat(t,"\n```"))}(e,o),(a=n.filter((function(n){var t=e[n];return g(t.map((function(e){return o[e]})))>.6*t.length}))).sort(),[2,a]}var c}))}))},fontPreferences:function(){return function(e,n){void 0===n&&(n=4e3);return X((function(t,o){var a=o.document,i=a.body,c=i.style;c.width="".concat(n,"px"),c.webkitTextSizeAdjust=c.textSizeAdjust="none",S()?i.style.zoom="".concat(1/o.devicePixelRatio):x()&&(i.style.zoom="reset");var u=a.createElement("div");return u.textContent=r([],Array(n/20<<0),!0).map((function(){return"word"})).join(" "),i.appendChild(u),e(a,i)}),'<!doctype html><html><head><meta name="viewport" content="width=device-width, initial-scale=1">')}((function(e,n){for(var t={},r={},o=0,a=Object.keys(ee);o<a.length;o++){var i=a[o],c=ee[i],u=c[0],l=void 0===u?{}:u,s=c[1],d=void 0===s?"mmMwWLliI0fiflO&1":s,m=e.createElement("span");m.textContent=d,m.style.whiteSpace="nowrap";for(var f=0,v=Object.keys(l);f<v.length;f++){var h=v[f],p=l[h];void 0!==p&&(m.style[h]=p)}t[i]=m,n.appendChild(e.createElement("br")),n.appendChild(m)}for(var b=0,y=Object.keys(ee);b<y.length;b++){r[i=y[b]]=t[i].getBoundingClientRect().width}return r}))},audio:function(){var e=window,n=e.OfflineAudioContext||e.webkitOfflineAudioContext;if(!n)return-2;if(x()&&!F()&&!function(){var e=window;return g(["DOMRectList"in e,"RTCPeerConnectionIceEvent"in e,"SVGGeometryElement"in e,"ontransitioncancel"in e])>=3}())return-1;var t=new n(1,5e3,44100),r=t.createOscillator();r.type="triangle",r.frequency.value=1e4;var o=t.createDynamicsCompressor();o.threshold.value=-50,o.knee.value=40,o.ratio.value=12,o.attack.value=0,o.release.value=.25,r.connect(o),o.connect(t.destination),r.start(0);var i=function(e){var n=3,t=500,r=500,o=5e3,i=function(){};return[new Promise((function(c,l){var s=!1,d=0,m=0;e.oncomplete=function(e){return c(e.renderedBuffer)};var f=function(){setTimeout((function(){return l(R("timeout"))}),Math.min(r,m+o-Date.now()))},v=function(){try{var r=e.startRendering();switch(a(r)&&u(r),e.state){case"running":m=Date.now(),s&&f();break;case"suspended":document.hidden||d++,s&&d>=n?l(R("suspended")):setTimeout(v,t)}}catch(o){l(o)}};v(),i=function(){s||(s=!0,m>0&&f())}})),i]}(t),c=i[0],l=i[1],s=c.then((function(e){return function(e){for(var n=0,t=0;t<e.length;++t)n+=Math.abs(e[t]);return n}(e.getChannelData(0).subarray(4500))}),(function(e){if("timeout"===e.name||"suspended"===e.name)return-3;throw e}));return u(s),function(){return l(),s}},screenFrame:function(){var e=this,r=z();return function(){return n(e,void 0,void 0,(function(){var e,n;return t(this,(function(t){switch(t.label){case 0:return[4,r()];case 1:return e=t.sent(),[2,[(n=function(e){return null===e?null:w(e,10)})(e[0]),n(e[1]),n(e[2]),n(e[3])]]}}))}))}},osCpu:function(){return navigator.oscpu},languages:function(){var e,n=navigator,t=[],r=n.language||n.userLanguage||n.browserLanguage||n.systemLanguage;if(void 0!==r&&t.push([r]),Array.isArray(n.languages))S()&&g([!("MediaSettingsRange"in(e=window)),"RTCEncodedAudioFrame"in e,""+e.Intl=="[object Intl]",""+e.Reflect=="[object Reflect]"])>=3||t.push(n.languages);else if("string"==typeof n.languages){var o=n.languages;o&&t.push(o.split(","))}return t},colorDepth:function(){return window.screen.colorDepth},deviceMemory:function(){return y(b(navigator.deviceMemory),void 0)},screenResolution:function(){var e=screen,n=function(e){return y(p(e),null)},t=[n(e.width),n(e.height)];return t.sort().reverse(),t},hardwareConcurrency:function(){return y(p(navigator.hardwareConcurrency),void 0)},timezone:function(){var e,n=null===(e=window.Intl)||void 0===e?void 0:e.DateTimeFormat;if(n){var t=(new n).resolvedOptions().timeZone;if(t)return t}var r,o=(r=(new Date).getFullYear(),-Math.max(b(new Date(r,0,1).getTimezoneOffset()),b(new Date(r,6,1).getTimezoneOffset())));return"UTC".concat(o>=0?"+":"").concat(Math.abs(o))},sessionStorage:function(){try{return!!window.sessionStorage}catch(e){return!0}},localStorage:function(){try{return!!window.localStorage}catch(e){return!0}},indexedDB:function(){if(!W()&&!C())try{return!!window.indexedDB}catch(e){return!0}},openDatabase:function(){return!!window.openDatabase},cpuClass:function(){return navigator.cpuClass},platform:function(){var e=navigator.platform;return"MacIntel"===e&&x()&&!F()?function(){if("iPad"===navigator.platform)return!0;var e=screen,n=e.width/e.height;return g(["MediaSource"in window,!!Element.prototype.webkitRequestFullscreen,n>.65&&n<1.53])>=2}()?"iPad":"iPhone":e},plugins:function(){var e=navigator.plugins;if(e){for(var n=[],t=0;t<e.length;++t){var r=e[t];if(r){for(var o=[],a=0;a<r.length;++a){var i=r[a];o.push({type:i.type,suffixes:i.suffixes})}n.push({name:r.name,description:r.description,mimeTypes:o})}}return n}},canvas:function(){var e,n,t=!1,r=function(){var e=document.createElement("canvas");return e.width=1,e.height=1,[e,e.getContext("2d")]}(),o=r[0],a=r[1];if(function(e,n){return!(!n||!e.toDataURL)}(o,a)){t=function(e){return e.rect(0,0,10,10),e.rect(2,2,6,6),!e.isPointInPath(5,5,"evenodd")}(a),function(e,n){e.width=240,e.height=60,n.textBaseline="alphabetic",n.fillStyle="#f60",n.fillRect(100,1,62,20),n.fillStyle="#069",n.font='11pt "Times New Roman"';var t="Cwm fjordbank gly ".concat(String.fromCharCode(55357,56835));n.fillText(t,2,15),n.fillStyle="rgba(102, 204, 0, 0.2)",n.font="18pt Arial",n.fillText(t,4,45)}(o,a);var i=H(o);i!==H(o)?e=n="unstable":(n=i,function(e,n){e.width=122,e.height=110,n.globalCompositeOperation="multiply";for(var t=0,r=[["#f2f",40,40],["#2ff",80,40],["#ff2",60,80]];t<r.length;t++){var o=r[t],a=o[0],i=o[1],c=o[2];n.fillStyle=a,n.beginPath(),n.arc(i,c,40,0,2*Math.PI,!0),n.closePath(),n.fill()}n.fillStyle="#f9c",n.arc(60,60,60,0,2*Math.PI,!0),n.arc(60,60,20,0,2*Math.PI,!0),n.fill("evenodd")}(o,a),e=H(o))}else e=n="";return{winding:t,geometry:e,text:n}},touchSupport:function(){var e,n=navigator,t=0;void 0!==n.maxTouchPoints?t=p(n.maxTouchPoints):void 0!==n.msMaxTouchPoints&&(t=n.msMaxTouchPoints);try{document.createEvent("TouchEvent"),e=!0}catch(r){e=!1}return{maxTouchPoints:t,touchEvent:e,touchStart:"ontouchstart"in window}},vendor:function(){return navigator.vendor||""},vendorFlavors:function(){for(var e=[],n=0,t=["chrome","safari","__crWeb","__gCrWeb","yandex","__yb","__ybro","__firefox__","__edgeTrackingPreventionStatistics","webkit","oprt","samsungAr","ucweb","UCShellJava","puffinDevice"];n<t.length;n++){var r=t[n],o=window[r];o&&"object"==typeof o&&e.push(r)}return e.sort()},cookiesEnabled:function(){var e=document;try{e.cookie="cookietest=1; SameSite=Strict;";var n=-1!==e.cookie.indexOf("cookietest=");return e.cookie="cookietest=1; SameSite=Strict; expires=Thu, 01-Jan-1970 00:00:01 GMT",n}catch(t){return!1}},colorGamut:function(){for(var e=0,n=["rec2020","p3","srgb"];e<n.length;e++){var t=n[e];if(matchMedia("(color-gamut: ".concat(t,")")).matches)return t}},invertedColors:function(){return!!_("inverted")||!_("none")&&void 0},forcedColors:function(){return!!O("active")||!O("none")&&void 0},monochrome:function(){if(matchMedia("(min-monochrome: 0)").matches){for(var e=0;e<=100;++e)if(matchMedia("(max-monochrome: ".concat(e,")")).matches)return e;throw new Error("Too high value")}},contrast:function(){return U("no-preference")?0:U("high")||U("more")?1:U("low")||U("less")?-1:U("forced")?10:void 0},reducedMotion:function(){return!!Q("reduce")||!Q("no-preference")&&void 0},hdr:function(){return!!K("high")||!K("standard")&&void 0},math:function(){var e,n=q.acos||$,t=q.acosh||$,r=q.asin||$,o=q.asinh||$,a=q.atanh||$,i=q.atan||$,c=q.sin||$,u=q.sinh||$,l=q.cos||$,s=q.cosh||$,d=q.tan||$,m=q.tanh||$,f=q.exp||$,v=q.expm1||$,h=q.log1p||$;return{acos:n(.12312423423423424),acosh:t(1e308),acoshPf:(e=1e154,q.log(e+q.sqrt(e*e-1))),asin:r(.12312423423423424),asinh:o(1),asinhPf:function(e){return q.log(e+q.sqrt(e*e+1))}(1),atanh:a(.5),atanhPf:function(e){return q.log((1+e)/(1-e))/2}(.5),atan:i(.5),sin:c(-1e300),sinh:u(1),sinhPf:function(e){return q.exp(e)-1/q.exp(e)/2}(1),cos:l(10.000000000123),cosh:s(1),coshPf:function(e){return(q.exp(e)+1/q.exp(e))/2}(1),tan:d(-1e300),tanh:m(1),tanhPf:function(e){return(q.exp(2*e)-1)/(q.exp(2*e)+1)}(1),exp:f(1),expm1:v(1),expm1Pf:function(e){return q.exp(e)-1}(1),log1p:h(10),log1pPf:function(e){return q.log(1+e)}(10),powPI:function(e){return q.pow(q.PI,e)}(-100)}},videoCard:function(){var e,n=document.createElement("canvas"),t=null!==(e=n.getContext("webgl"))&&void 0!==e?e:n.getContext("experimental-webgl");if(t&&"getExtension"in t){var r=t.getExtension("WEBGL_debug_renderer_info");if(r)return{vendor:(t.getParameter(r.UNMASKED_VENDOR_WEBGL)||"").toString(),renderer:(t.getParameter(r.UNMASKED_RENDERER_WEBGL)||"").toString()}}},pdfViewerEnabled:function(){return navigator.pdfViewerEnabled},architecture:function(){var e=new Float32Array(1),n=new Uint8Array(e.buffer);return e[0]=1/0,e[0]=e[0]-e[0],n[3]}};function te(e){var n=function(e){if(G())return.4;if(x())return F()?.5:.3;var n=e.platform.value||"";if(/^Win/.test(n))return.6;if(/^Mac/.test(n))return.5;return.7}(e),t=function(e){return w(.99+.01*e,1e-4)}(n);return{score:n,comment:"$ if upgrade to Pro: https://fpjs.dev/pro".replace(/\$/g,"".concat(t))}}function re(n){return JSON.stringify(n,(function(n,t){return t instanceof Error?e({name:(r=t).name,message:r.message,stack:null===(o=r.stack)||void 0===o?void 0:o.split("\n")},r):t;var r,o}),2)}function oe(e){return h(function(e){for(var n="",t=0,r=Object.keys(e).sort();t<r.length;t++){var o=r[t],a=e[o],i=a.error?"error":JSON.stringify(a.value);n+="".concat(n?"|":"").concat(o.replace(/([:|\\])/g,"\\$1"),":").concat(i)}return n}(e))}function ae(e){return void 0===e&&(e=50),function(e,n){void 0===n&&(n=1/0);var t=window.requestIdleCallback;return t?new Promise((function(e){return t.call(window,(function(){return e()}),{timeout:n})})):o(Math.min(e,n))}(e,2*e)}function ie(e,r){var o=Date.now();return{get:function(a){return n(this,void 0,void 0,(function(){var n,i,c;return t(this,(function(t){switch(t.label){case 0:return n=Date.now(),[4,e()];case 1:return i=t.sent(),c=function(e){var n;return{get visitorId(){return void 0===n&&(n=oe(this.components)),n},set visitorId(e){n=e},confidence:te(e),components:e,version:"3.4.2"}}(i),(r||(null==a?void 0:a.debug))&&console.log("Copy the text below to get the debug data:\n\n```\nversion: ".concat(c.version,"\nuserAgent: ").concat(navigator.userAgent,"\ntimeBetweenLoadAndGet: ").concat(n-o,"\nvisitorId: ").concat(c.visitorId,"\ncomponents: ").concat(re(i),"\n```")),[2,c]}}))}))}}}function ce(e){var r=void 0===e?{}:e,o=r.delayFallback,a=r.debug;return r.monitoring,n(this,void 0,void 0,(function(){return t(this,(function(e){switch(e.label){case 0:return[4,ae(o)];case 1:return e.sent(),[2,ie(V(ne,{debug:a},[]),a)]}}))}))}var ue={load:ce,hashComponents:oe,componentsToDebugString:re},le=h;export{re as componentsToDebugString,ue as default,M as getFullscreenElement,z as getScreenFrame,oe as hashComponents,G as isAndroid,S as isChromium,F as isDesktopSafari,C as isEdgeHTML,Y as isGecko,W as isTrident,x as isWebKit,ce as load,V as loadSources,le as murmurX64Hash128,ae as prepareForSources,ne as sources,Z as transformSource,X as withIframe};
|
upgini/metrics.py
CHANGED
|
@@ -3,15 +3,16 @@ import re
|
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
5
5
|
|
|
6
|
+
import catboost
|
|
6
7
|
import numpy as np
|
|
7
8
|
import pandas as pd
|
|
8
9
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
9
|
-
import catboost
|
|
10
10
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
11
11
|
from numpy import log1p
|
|
12
12
|
from pandas.api.types import is_numeric_dtype
|
|
13
13
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
14
14
|
|
|
15
|
+
from upgini.utils.features_validator import FeaturesValidator
|
|
15
16
|
from upgini.utils.sklearn_ext import cross_validate
|
|
16
17
|
|
|
17
18
|
try:
|
|
@@ -352,6 +353,7 @@ class EstimatorWrapper:
|
|
|
352
353
|
"target_type": target_type,
|
|
353
354
|
"groups": groups,
|
|
354
355
|
"text_features": text_features,
|
|
356
|
+
"logger": logger,
|
|
355
357
|
}
|
|
356
358
|
if estimator is None:
|
|
357
359
|
params = dict()
|
|
@@ -414,12 +416,22 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
414
416
|
target_type: ModelTaskType,
|
|
415
417
|
groups: Optional[List[str]] = None,
|
|
416
418
|
text_features: Optional[List[str]] = None,
|
|
419
|
+
logger: Optional[logging.Logger] = None,
|
|
417
420
|
):
|
|
418
421
|
super(CatBoostWrapper, self).__init__(
|
|
419
|
-
estimator,
|
|
422
|
+
estimator,
|
|
423
|
+
scorer,
|
|
424
|
+
metric_name,
|
|
425
|
+
multiplier,
|
|
426
|
+
cv,
|
|
427
|
+
target_type,
|
|
428
|
+
groups=groups,
|
|
429
|
+
text_features=text_features,
|
|
430
|
+
logger=logger,
|
|
420
431
|
)
|
|
421
432
|
self.cat_features = None
|
|
422
433
|
self.emb_features = None
|
|
434
|
+
self.exclude_features = []
|
|
423
435
|
|
|
424
436
|
def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
425
437
|
X, y, groups, params = super()._prepare_to_fit(X, y)
|
|
@@ -437,9 +449,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
437
449
|
X, embedding_features = self.group_embeddings(X)
|
|
438
450
|
params["embedding_features"] = embedding_features
|
|
439
451
|
else:
|
|
440
|
-
self.logger.info(
|
|
441
|
-
f"Embedding features count less than 3, so use them separately: {self.emb_features}"
|
|
442
|
-
)
|
|
452
|
+
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
443
453
|
self.emb_features = []
|
|
444
454
|
else:
|
|
445
455
|
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
@@ -498,6 +508,8 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
498
508
|
return df, [emb_name]
|
|
499
509
|
|
|
500
510
|
def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
511
|
+
if self.exclude_features:
|
|
512
|
+
X = X.drop(columns=self.exclude_features)
|
|
501
513
|
X, y, params = super()._prepare_to_calculate(X, y)
|
|
502
514
|
if self.text_features:
|
|
503
515
|
params["text_features"] = self.text_features
|
|
@@ -510,6 +522,26 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
510
522
|
|
|
511
523
|
return X, y, params
|
|
512
524
|
|
|
525
|
+
def cross_val_predict(
|
|
526
|
+
self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
527
|
+
) -> Optional[float]:
|
|
528
|
+
try:
|
|
529
|
+
return super().cross_val_predict(X, y, baseline_score_column)
|
|
530
|
+
except Exception as e:
|
|
531
|
+
if "Dictionary size is 0" in e.args[0] and self.text_features:
|
|
532
|
+
high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
|
|
533
|
+
self.logger.warning(
|
|
534
|
+
"Failed to calculate metrics. Try to remove high cardinality"
|
|
535
|
+
f" text features {high_cardinality_features} and retry"
|
|
536
|
+
)
|
|
537
|
+
for f in high_cardinality_features:
|
|
538
|
+
self.text_features.remove(f)
|
|
539
|
+
self.exclude_features.append(f)
|
|
540
|
+
X = X.drop(columns=f)
|
|
541
|
+
return super().cross_val_predict(X, y, baseline_score_column)
|
|
542
|
+
else:
|
|
543
|
+
raise e
|
|
544
|
+
|
|
513
545
|
|
|
514
546
|
class LightGBMWrapper(EstimatorWrapper):
|
|
515
547
|
def __init__(
|
|
@@ -522,9 +554,18 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
522
554
|
target_type: ModelTaskType,
|
|
523
555
|
groups: Optional[List[str]] = None,
|
|
524
556
|
text_features: Optional[List[str]] = None,
|
|
557
|
+
logger: Optional[logging.Logger] = None,
|
|
525
558
|
):
|
|
526
559
|
super(LightGBMWrapper, self).__init__(
|
|
527
|
-
estimator,
|
|
560
|
+
estimator,
|
|
561
|
+
scorer,
|
|
562
|
+
metric_name,
|
|
563
|
+
multiplier,
|
|
564
|
+
cv,
|
|
565
|
+
target_type,
|
|
566
|
+
groups=groups,
|
|
567
|
+
text_features=text_features,
|
|
568
|
+
logger=logger,
|
|
528
569
|
)
|
|
529
570
|
self.cat_features = None
|
|
530
571
|
|
|
@@ -561,9 +602,18 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
561
602
|
target_type: ModelTaskType,
|
|
562
603
|
groups: Optional[List[str]] = None,
|
|
563
604
|
text_features: Optional[List[str]] = None,
|
|
605
|
+
logger: Optional[logging.Logger] = None,
|
|
564
606
|
):
|
|
565
607
|
super(OtherEstimatorWrapper, self).__init__(
|
|
566
|
-
estimator,
|
|
608
|
+
estimator,
|
|
609
|
+
scorer,
|
|
610
|
+
metric_name,
|
|
611
|
+
multiplier,
|
|
612
|
+
cv,
|
|
613
|
+
target_type,
|
|
614
|
+
groups=groups,
|
|
615
|
+
text_features=text_features,
|
|
616
|
+
logger=logger,
|
|
567
617
|
)
|
|
568
618
|
self.cat_features = None
|
|
569
619
|
|
|
@@ -595,6 +645,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
595
645
|
|
|
596
646
|
|
|
597
647
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
648
|
+
# TODO validate that if it is Callable then it accepts 3 arguments
|
|
598
649
|
if isinstance(scoring, str) and scoring is not None:
|
|
599
650
|
_get_scorer_by_name(scoring)
|
|
600
651
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -44,7 +44,7 @@ class PhoneNormalizer:
|
|
|
44
44
|
Method will remove all non numeric chars from string and convert it to int.
|
|
45
45
|
None will be set for phone numbers that couldn"t be converted to int
|
|
46
46
|
"""
|
|
47
|
-
if is_string_dtype(self.df[self.phone_column_name]):
|
|
47
|
+
if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
|
|
48
48
|
convert_func = self.phone_str_to_int_safe
|
|
49
49
|
elif is_float_dtype(self.df[self.phone_column_name]):
|
|
50
50
|
convert_func = self.phone_float_to_int_safe
|
|
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
|
|
|
38
38
|
loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
|
|
39
39
|
multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
40
40
|
group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
|
|
41
|
+
current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
42
|
|
|
42
43
|
# Errors
|
|
43
44
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
@@ -111,6 +112,9 @@ x_is_empty=X is empty
|
|
|
111
112
|
y_is_empty=y is empty
|
|
112
113
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
114
|
missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
|
|
115
|
+
x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
116
|
+
train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
117
|
+
eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
114
118
|
# eval set validation
|
|
115
119
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
116
120
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
|
|
|
145
149
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
146
150
|
dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
|
|
147
151
|
dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
148
|
-
|
|
152
|
+
dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
153
|
+
dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
149
154
|
dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
150
155
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
151
156
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
196
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
197
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
198
203
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
199
|
-
target_type_detected
|
|
204
|
+
target_type_detected=\nDetected task type: {}\n
|
|
200
205
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
201
206
|
all_ok_community_invite=❓ Support request
|
|
202
|
-
too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
207
|
+
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
203
208
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
204
209
|
loss_selection_info=Using loss `{}` for feature selection
|
|
205
210
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
-
from typing import List, Optional
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
|
|
|
9
9
|
from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
10
10
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
|
+
from upgini.metadata import SearchKey
|
|
12
13
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
14
|
+
from upgini.utils.warning_counter import WarningCounter
|
|
13
15
|
|
|
14
16
|
DATE_FORMATS = [
|
|
15
17
|
"%Y-%m-%d",
|
|
@@ -98,6 +100,9 @@ class DateTimeSearchKeyConverter:
|
|
|
98
100
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
99
101
|
self.logger.warning(msg)
|
|
100
102
|
raise ValidationError(msg)
|
|
103
|
+
else:
|
|
104
|
+
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
105
|
+
df[self.date_column] = self.parse_date(df)
|
|
101
106
|
|
|
102
107
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
103
108
|
# as additional features
|
|
@@ -225,3 +230,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
225
230
|
|
|
226
231
|
is_diff_less_than_two_columns = grouped.apply(check_differences)
|
|
227
232
|
return is_diff_less_than_two_columns.all()
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def validate_dates_distribution(
|
|
236
|
+
X: pd.DataFrame,
|
|
237
|
+
search_keys: Dict[str, SearchKey],
|
|
238
|
+
logger: Optional[logging.Logger] = None,
|
|
239
|
+
bundle: Optional[ResourceBundle] = None,
|
|
240
|
+
warning_counter: Optional[WarningCounter] = None,
|
|
241
|
+
):
|
|
242
|
+
maybe_date_col = None
|
|
243
|
+
for key, key_type in search_keys.items():
|
|
244
|
+
if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
245
|
+
maybe_date_col = key
|
|
246
|
+
|
|
247
|
+
if maybe_date_col is None:
|
|
248
|
+
for col in X.columns:
|
|
249
|
+
if col in search_keys:
|
|
250
|
+
continue
|
|
251
|
+
try:
|
|
252
|
+
pd.to_datetime(X[col])
|
|
253
|
+
maybe_date_col = col
|
|
254
|
+
break
|
|
255
|
+
except Exception:
|
|
256
|
+
pass
|
|
257
|
+
|
|
258
|
+
if maybe_date_col is None:
|
|
259
|
+
return
|
|
260
|
+
|
|
261
|
+
dates = pd.to_datetime(X[maybe_date_col]).dt.date
|
|
262
|
+
|
|
263
|
+
date_counts = dates.value_counts().sort_index()
|
|
264
|
+
|
|
265
|
+
date_counts_1 = date_counts[: round(len(date_counts) / 2)]
|
|
266
|
+
date_counts_2 = date_counts[round(len(date_counts) / 2) :]
|
|
267
|
+
ratio = date_counts_2.mean() / date_counts_1.mean()
|
|
268
|
+
|
|
269
|
+
if ratio > 1.2 or ratio < 0.8:
|
|
270
|
+
if warning_counter is not None:
|
|
271
|
+
warning_counter.increment()
|
|
272
|
+
if logger is None:
|
|
273
|
+
logger = logging.getLogger("muted_logger")
|
|
274
|
+
logger.setLevel("FATAL")
|
|
275
|
+
bundle = bundle or get_custom_bundle()
|
|
276
|
+
msg = bundle.get("x_unstable_by_date")
|
|
277
|
+
print(msg)
|
|
278
|
+
logger.warning(msg)
|
|
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
6
|
+
from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
7
7
|
from upgini.resource_bundle import ResourceBundle
|
|
8
8
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
9
|
from upgini.utils.target_utils import define_task
|
|
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
|
|
|
78
78
|
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
79
79
|
if len(rows_with_diff_target) > 0:
|
|
80
80
|
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
logger
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
81
|
+
if EVAL_SET_INDEX not in df.columns:
|
|
82
|
+
rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
83
|
+
rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
|
|
84
|
+
perc = len(rows_to_remove) * 100 / len(df)
|
|
85
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
86
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
87
|
+
)
|
|
88
|
+
if not silent:
|
|
89
|
+
print(msg)
|
|
90
|
+
if logger:
|
|
91
|
+
logger.warning(msg)
|
|
92
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
93
|
+
df = df[~df.index.isin(rows_to_remove.index)]
|
|
94
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
|
+
else:
|
|
96
|
+
# Indices in train and eval_set can be the same so we remove rows from them separately
|
|
97
|
+
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
98
|
+
train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
99
|
+
train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
|
|
100
|
+
train_perc = len(train_rows_to_remove) * 100 / len(train)
|
|
101
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
102
|
+
train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
|
|
103
|
+
)
|
|
104
|
+
if not silent:
|
|
105
|
+
print(msg)
|
|
106
|
+
if logger:
|
|
107
|
+
logger.warning(msg)
|
|
108
|
+
logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
|
|
109
|
+
train = train[~train.index.isin(train_rows_to_remove.index)]
|
|
110
|
+
logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
|
|
111
|
+
|
|
112
|
+
evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
|
|
113
|
+
new_evals = []
|
|
114
|
+
for i, eval in enumerate(evals):
|
|
115
|
+
eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
116
|
+
eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
|
|
117
|
+
eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
|
|
118
|
+
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
119
|
+
eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
|
|
120
|
+
)
|
|
121
|
+
if not silent:
|
|
122
|
+
print(msg)
|
|
123
|
+
if logger:
|
|
124
|
+
logger.warning(msg)
|
|
125
|
+
logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
|
|
126
|
+
eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
|
|
127
|
+
logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
|
|
128
|
+
new_evals.append(eval)
|
|
129
|
+
|
|
130
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
131
|
+
df = pd.concat([train] + new_evals)
|
|
132
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
133
|
return df
|
|
96
134
|
|
|
97
135
|
|
|
@@ -101,14 +139,18 @@ def clean_full_duplicates(
|
|
|
101
139
|
nrows = len(df)
|
|
102
140
|
if nrows == 0:
|
|
103
141
|
return df
|
|
104
|
-
# Remove
|
|
142
|
+
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
105
143
|
unique_columns = df.columns.tolist()
|
|
106
144
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
107
145
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
108
146
|
if SORT_ID in unique_columns:
|
|
109
147
|
unique_columns.remove(SORT_ID)
|
|
148
|
+
if EVAL_SET_INDEX in unique_columns:
|
|
149
|
+
unique_columns.remove(EVAL_SET_INDEX)
|
|
110
150
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
111
|
-
|
|
151
|
+
# Train segment goes first so if duplicates are found in train and eval set
|
|
152
|
+
# then we keep unique rows in train segment
|
|
153
|
+
df = df.drop_duplicates(subset=unique_columns, keep="first")
|
|
112
154
|
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
113
155
|
nrows_after_full_dedup = len(df)
|
|
114
156
|
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
@@ -123,7 +165,7 @@ def clean_full_duplicates(
|
|
|
123
165
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
124
166
|
if marked_duplicates.sum() > 0:
|
|
125
167
|
dups_indices = df[marked_duplicates].index.to_list()
|
|
126
|
-
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
|
|
168
|
+
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
|
|
127
169
|
num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
|
|
128
170
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
129
171
|
|
|
@@ -133,6 +175,7 @@ def clean_full_duplicates(
|
|
|
133
175
|
print(msg)
|
|
134
176
|
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
135
177
|
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
178
|
+
|
|
136
179
|
return df
|
|
137
180
|
|
|
138
181
|
|
upgini/utils/sklearn_ext.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import functools
|
|
2
|
-
import logging
|
|
3
2
|
import numbers
|
|
4
3
|
import time
|
|
5
4
|
import warnings
|
|
@@ -313,7 +312,7 @@ def cross_validate(
|
|
|
313
312
|
|
|
314
313
|
return ret
|
|
315
314
|
except Exception:
|
|
316
|
-
logging.exception("Failed to execute overriden cross_validate. Fallback to original")
|
|
315
|
+
# logging.exception("Failed to execute overriden cross_validate. Fallback to original")
|
|
317
316
|
raise
|
|
318
317
|
# fit_params["use_best_model"] = False
|
|
319
318
|
# return original_cross_validate(
|
upgini/utils/target_utils.py
CHANGED
|
@@ -132,9 +132,7 @@ def balance_undersample(
|
|
|
132
132
|
class_value = classes[class_idx]
|
|
133
133
|
class_count = vc[class_value]
|
|
134
134
|
sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
|
|
135
|
-
sampler = RandomUnderSampler(
|
|
136
|
-
sampling_strategy=sample_strategy, random_state=random_state
|
|
137
|
-
)
|
|
135
|
+
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
138
136
|
X = df[SYSTEM_RECORD_ID]
|
|
139
137
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
140
138
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
@@ -153,9 +151,7 @@ def balance_undersample(
|
|
|
153
151
|
minority_class = df[df[target_column] == min_class_value]
|
|
154
152
|
majority_class = df[df[target_column] != min_class_value]
|
|
155
153
|
sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
156
|
-
sampled_majority_class = majority_class.sample(
|
|
157
|
-
n=sample_size, random_state=random_state
|
|
158
|
-
)
|
|
154
|
+
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
159
155
|
resampled_data = df[
|
|
160
156
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
161
157
|
| (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
@@ -181,3 +177,21 @@ def balance_undersample(
|
|
|
181
177
|
|
|
182
178
|
logger.info(f"Shape after rebalance resampling: {resampled_data}")
|
|
183
179
|
return resampled_data
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
|
|
183
|
+
df = pd.concat([expected, actual])
|
|
184
|
+
|
|
185
|
+
# Define the bins for the target variable
|
|
186
|
+
df_min = df.min()
|
|
187
|
+
df_max = df.max()
|
|
188
|
+
bins = [df_min, (df_min + df_max) / 2, df_max]
|
|
189
|
+
|
|
190
|
+
# Calculate the base distribution
|
|
191
|
+
train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
|
|
192
|
+
|
|
193
|
+
# Calculate the target distribution
|
|
194
|
+
test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
|
|
195
|
+
|
|
196
|
+
# Calculate the PSI
|
|
197
|
+
return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.274a4
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil >=2.8.0
|
|
30
30
|
Requires-Dist: requests >=2.8.0
|
|
31
|
-
Requires-Dist: pandas <2.
|
|
31
|
+
Requires-Dist: pandas <2.1.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy >=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn >=1.3.0
|
|
34
34
|
Requires-Dist: pydantic <2.0.0,>=1.8.2
|
|
@@ -1,34 +1,35 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
3
|
+
upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=WDj4DO5lqANBdihEcRmwox4w1kqWVOorlIKY4dbsqrU,175376
|
|
6
|
+
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
6
7
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
7
8
|
upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
|
|
8
|
-
upgini/metrics.py,sha256=
|
|
9
|
+
upgini/metrics.py,sha256=U3VJKbKmuWACqI4jTcszXo0WqeXFtV8bWyY9VLBL-rw,29129
|
|
9
10
|
upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
|
|
10
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
11
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
12
13
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
13
14
|
upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0PAduvetU,2646
|
|
14
15
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
upgini/autofe/all_operands.py,sha256=
|
|
16
|
+
upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
|
|
16
17
|
upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
|
|
17
|
-
upgini/autofe/date.py,sha256=
|
|
18
|
+
upgini/autofe/date.py,sha256=_6RoEJZ5Kf-Q_aMOFucS6YSIZpCcelgpw-edV4qmRIM,3935
|
|
18
19
|
upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
|
|
19
20
|
upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
|
|
20
21
|
upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
|
|
21
22
|
upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
|
|
22
23
|
upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
|
|
23
24
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
25
|
+
upgini/data_source/data_source_publisher.py,sha256=taRzyGgrPrTTSGw4Y-Ca5k4bf30aiTa68rxqT9zfqeI,16478
|
|
25
26
|
upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
|
|
26
27
|
upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
|
|
27
28
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
-
upgini/normalizer/phone_normalizer.py,sha256=
|
|
29
|
+
upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
|
|
29
30
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
30
31
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
31
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
32
|
+
upgini/resource_bundle/strings.properties,sha256=x-2fXtGc5Z2n7eUg9b6I4yhok56TTXDvzwU1JUaKcj4,26285
|
|
32
33
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
33
34
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
35
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
@@ -40,8 +41,8 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
|
|
|
40
41
|
upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
|
|
41
42
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
42
43
|
upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
|
|
43
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
44
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
44
|
+
upgini/utils/datetime_utils.py,sha256=XciFOIYI4Zi7PqQS8dHxuPDEtdtwXbOrWsiAa04v2J4,10511
|
|
45
|
+
upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
|
|
45
46
|
upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
|
|
46
47
|
upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
|
|
47
48
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
@@ -51,12 +52,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
|
|
|
51
52
|
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
52
53
|
upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
|
|
53
54
|
upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
|
|
54
|
-
upgini/utils/sklearn_ext.py,sha256=
|
|
55
|
-
upgini/utils/target_utils.py,sha256=
|
|
55
|
+
upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
|
|
56
|
+
upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
|
|
56
57
|
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
57
58
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
58
|
-
upgini-1.1.
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
59
|
+
upgini-1.1.274a4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
60
|
+
upgini-1.1.274a4.dist-info/METADATA,sha256=xng0cJvEGeFT2zSBqLDy-qf9I6ONKxdKtXsFWokPpPs,48158
|
|
61
|
+
upgini-1.1.274a4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
62
|
+
upgini-1.1.274a4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
63
|
+
upgini-1.1.274a4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|