upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/autofe/all_operands.py +12 -2
- upgini/autofe/date.py +68 -8
- upgini/autofe/feature.py +1 -1
- upgini/data_source/data_source_publisher.py +24 -5
- upgini/dataset.py +21 -58
- upgini/features_enricher.py +114 -40
- upgini/fingerprint.js +8 -0
- upgini/metrics.py +58 -7
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +8 -3
- upgini/search_task.py +1 -1
- upgini/utils/datetime_utils.py +53 -2
- upgini/utils/deduplicate_utils.py +61 -18
- upgini/utils/sklearn_ext.py +1 -2
- upgini/utils/target_utils.py +125 -2
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/METADATA +2 -2
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/RECORD +20 -19
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/LICENSE +0 -0
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/WHEEL +0 -0
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/top_level.txt +0 -0
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
|
-
from upgini.autofe.date import DateDiff,
|
|
2
|
+
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
|
|
3
3
|
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
4
4
|
from upgini.autofe.operand import Operand
|
|
5
5
|
from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
|
|
@@ -37,7 +37,17 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
37
37
|
Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
|
|
38
38
|
Sim(),
|
|
39
39
|
DateDiff(),
|
|
40
|
-
|
|
40
|
+
DateDiffType2(),
|
|
41
|
+
DateListDiff(aggregation="min"),
|
|
42
|
+
DateListDiff(aggregation="max"),
|
|
43
|
+
DateListDiff(aggregation="mean"),
|
|
44
|
+
DateListDiff(aggregation="nunique"),
|
|
45
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
|
|
46
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
|
|
47
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
|
|
48
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
49
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
50
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
41
51
|
]
|
|
42
52
|
}
|
|
43
53
|
|
upgini/autofe/date.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
from typing import Optional, Union
|
|
1
|
+
from typing import Any, Optional, Union
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
|
+
from pydantic import BaseModel
|
|
4
5
|
|
|
5
|
-
from upgini.autofe.operand import PandasOperand
|
|
6
|
+
from upgini.autofe.operand import PandasOperand
|
|
6
7
|
|
|
7
8
|
|
|
8
|
-
class DateDiffMixin:
|
|
9
|
+
class DateDiffMixin(BaseModel):
|
|
9
10
|
diff_unit: str = "D"
|
|
10
11
|
left_unit: Optional[str] = None
|
|
11
12
|
right_unit: Optional[str] = None
|
|
@@ -34,18 +35,77 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
34
35
|
return x
|
|
35
36
|
|
|
36
37
|
|
|
37
|
-
class
|
|
38
|
-
name = "
|
|
38
|
+
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
39
|
+
name = "date_diff_type2"
|
|
39
40
|
is_binary = True
|
|
40
41
|
has_symmetry_importance = True
|
|
41
|
-
is_vectorizable = False
|
|
42
42
|
|
|
43
43
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
44
44
|
left = self._convert_to_date(left, self.left_unit)
|
|
45
45
|
right = self._convert_to_date(right, self.right_unit)
|
|
46
|
-
future =
|
|
46
|
+
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
|
+
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
|
+
)
|
|
49
|
+
future = pd.to_datetime(future)
|
|
47
50
|
before = future[future < left]
|
|
48
|
-
future[future < left] = pd.
|
|
51
|
+
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
49
52
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
50
53
|
|
|
51
54
|
return diff
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
61
|
+
is_binary = True
|
|
62
|
+
has_symmetry_importance = True
|
|
63
|
+
aggregation: str
|
|
64
|
+
|
|
65
|
+
def __init__(self, **data: Any) -> None:
|
|
66
|
+
if "name" not in data:
|
|
67
|
+
data["name"] = f"date_diff_{data.get('aggregation')}"
|
|
68
|
+
super().__init__(**data)
|
|
69
|
+
|
|
70
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
71
|
+
left = self._convert_to_date(left, self.left_unit)
|
|
72
|
+
right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
|
|
73
|
+
|
|
74
|
+
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
75
|
+
|
|
76
|
+
def _diff(self, x):
|
|
77
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
78
|
+
return x[x > 0]
|
|
79
|
+
|
|
80
|
+
def _agg(self, x):
|
|
81
|
+
method = getattr(np, self.aggregation, None)
|
|
82
|
+
default = np.nan
|
|
83
|
+
if method is None and self.aggregation in _ext_aggregations:
|
|
84
|
+
method, default = _ext_aggregations[self.aggregation]
|
|
85
|
+
elif not callable(method):
|
|
86
|
+
raise ValueError(f"Unsupported aggregation: {self.aggregation}")
|
|
87
|
+
|
|
88
|
+
return method(x) if len(x) > 0 else default
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class DateListDiffBounded(DateListDiff):
|
|
92
|
+
lower_bound: Optional[int]
|
|
93
|
+
upper_bound: Optional[int]
|
|
94
|
+
|
|
95
|
+
def __init__(self, **data: Any) -> None:
|
|
96
|
+
if "name" not in data:
|
|
97
|
+
lower_bound = data.get("lower_bound")
|
|
98
|
+
upper_bound = data.get("upper_bound")
|
|
99
|
+
components = [
|
|
100
|
+
"date_diff",
|
|
101
|
+
data.get("diff_unit"),
|
|
102
|
+
str(lower_bound if lower_bound is not None else "minusinf"),
|
|
103
|
+
str(upper_bound if upper_bound is not None else "plusinf"),
|
|
104
|
+
]
|
|
105
|
+
components.append(data.get("aggregation"))
|
|
106
|
+
data["name"] = "_".join(components)
|
|
107
|
+
super().__init__(**data)
|
|
108
|
+
|
|
109
|
+
def _agg(self, x):
|
|
110
|
+
x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
|
|
111
|
+
return super()._agg(x)
|
upgini/autofe/feature.py
CHANGED
|
@@ -305,7 +305,7 @@ class FeatureGroup:
|
|
|
305
305
|
grouped_features = []
|
|
306
306
|
|
|
307
307
|
def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
|
|
308
|
-
return (f.op, f.children[f.op.group_index])
|
|
308
|
+
return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
|
|
309
309
|
|
|
310
310
|
for op_child, features in itertools.groupby(candidates, groupby_func):
|
|
311
311
|
op, main_child = op_child
|
|
@@ -40,7 +40,7 @@ class DataSourcePublisher:
|
|
|
40
40
|
if logs_enabled:
|
|
41
41
|
self.logger = LoggerFactory().get_logger(endpoint, api_key)
|
|
42
42
|
else:
|
|
43
|
-
self.logger = logging.getLogger()
|
|
43
|
+
self.logger = logging.getLogger("muted_logger")
|
|
44
44
|
self.logger.setLevel("FATAL")
|
|
45
45
|
|
|
46
46
|
def place(
|
|
@@ -48,6 +48,7 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
+
exclude_from_autofe_generation: Optional[List[str]],
|
|
51
52
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
52
53
|
sort_column: Optional[str] = None,
|
|
53
54
|
date_format: Optional[str] = None,
|
|
@@ -57,7 +58,6 @@ class DataSourcePublisher:
|
|
|
57
58
|
join_date_abs_limit_days: Optional[int] = None,
|
|
58
59
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
59
60
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
-
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
|
@@ -72,8 +72,8 @@ class DataSourcePublisher:
|
|
|
72
72
|
)
|
|
73
73
|
if search_keys is None or len(search_keys) == 0:
|
|
74
74
|
raise ValidationError("Empty search keys")
|
|
75
|
-
if SearchKey.DATE in search_keys.values() and date_format is None:
|
|
76
|
-
|
|
75
|
+
# if SearchKey.DATE in search_keys.values() and date_format is None:
|
|
76
|
+
# raise ValidationError("date_format is required for DATE search key")
|
|
77
77
|
if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
|
|
78
78
|
raise ValidationError(
|
|
79
79
|
f"Invalid update frequency: {update_frequency}. "
|
|
@@ -85,11 +85,19 @@ class DataSourcePublisher:
|
|
|
85
85
|
or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
|
|
86
86
|
) and sort_column is None:
|
|
87
87
|
raise ValidationError("Sort column is required for passed search keys")
|
|
88
|
+
if (
|
|
89
|
+
set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
|
|
90
|
+
and snapshot_frequency_days is None
|
|
91
|
+
and join_date_abs_limit_days is None
|
|
92
|
+
):
|
|
93
|
+
raise ValidationError(
|
|
94
|
+
"With MSISDN and DATE keys one of the snapshot_frequency_days or"
|
|
95
|
+
" join_date_abs_limit_days parameters is required"
|
|
96
|
+
)
|
|
88
97
|
|
|
89
98
|
request = {
|
|
90
99
|
"dataTableUri": data_table_uri,
|
|
91
100
|
"searchKeys": {k: v.value.value for k, v in search_keys.items()},
|
|
92
|
-
"dateFormat": date_format,
|
|
93
101
|
"excludeColumns": exclude_columns,
|
|
94
102
|
"hashFeatureNames": str(hash_feature_names).lower(),
|
|
95
103
|
"snapshotFrequencyDays": snapshot_frequency_days,
|
|
@@ -98,6 +106,8 @@ class DataSourcePublisher:
|
|
|
98
106
|
"featuresForEmbeddings": features_for_embeddings,
|
|
99
107
|
"forceGeneration": str(_force_generation).lower(),
|
|
100
108
|
}
|
|
109
|
+
if date_format is not None:
|
|
110
|
+
request["dateFormat"] = date_format
|
|
101
111
|
if secondary_search_keys is not None:
|
|
102
112
|
request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
|
|
103
113
|
if sort_column is not None:
|
|
@@ -170,6 +180,7 @@ class DataSourcePublisher:
|
|
|
170
180
|
print(msg)
|
|
171
181
|
self.logger.info(msg)
|
|
172
182
|
self._rest_client.stop_ads_management_task(task_id, trace_id)
|
|
183
|
+
raise
|
|
173
184
|
except Exception:
|
|
174
185
|
self.logger.exception("Failed to register data table")
|
|
175
186
|
raise
|
|
@@ -289,6 +300,7 @@ class DataSourcePublisher:
|
|
|
289
300
|
raise ValidationError("One of arguments: bq_table_id or search_keys should be presented")
|
|
290
301
|
if bq_table_id is not None and search_keys is not None:
|
|
291
302
|
raise ValidationError("Only one argument could be presented: bq_table_id or search_keys")
|
|
303
|
+
task_id = None
|
|
292
304
|
try:
|
|
293
305
|
search_keys = [k.value.value for k in search_keys] if search_keys else None
|
|
294
306
|
request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
|
|
@@ -303,6 +315,13 @@ class DataSourcePublisher:
|
|
|
303
315
|
raise Exception("Failed to register ADS: " + status_response["errorMessage"])
|
|
304
316
|
|
|
305
317
|
print("Uploading successfully finished")
|
|
318
|
+
except KeyboardInterrupt:
|
|
319
|
+
if task_id is not None:
|
|
320
|
+
msg = f"Stopping AdsManagementTask {task_id}"
|
|
321
|
+
print(msg)
|
|
322
|
+
self.logger.info(msg)
|
|
323
|
+
self._rest_client.stop_ads_management_task(task_id, trace_id)
|
|
324
|
+
raise
|
|
306
325
|
except Exception:
|
|
307
326
|
self.logger.exception(f"Failed to upload table {bq_table_id}")
|
|
308
327
|
raise
|
upgini/dataset.py
CHANGED
|
@@ -39,10 +39,10 @@ from upgini.metadata import (
|
|
|
39
39
|
)
|
|
40
40
|
from upgini.normalizer.phone_normalizer import PhoneNormalizer
|
|
41
41
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
42
|
-
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
43
42
|
from upgini.search_task import SearchTask
|
|
44
43
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
45
44
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
45
|
+
from upgini.utils.target_utils import balance_undersample
|
|
46
46
|
|
|
47
47
|
try:
|
|
48
48
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -60,7 +60,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
60
60
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
61
61
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
62
62
|
MIN_SAMPLE_THRESHOLD = 5_000
|
|
63
|
-
IMBALANCE_THESHOLD = 0.
|
|
63
|
+
IMBALANCE_THESHOLD = 0.6
|
|
64
|
+
BINARY_BOOTSTRAP_LOOPS = 5
|
|
65
|
+
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
|
64
66
|
MIN_TARGET_CLASS_ROWS = 100
|
|
65
67
|
MAX_MULTICLASS_CLASS_COUNT = 100
|
|
66
68
|
MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
|
|
@@ -460,10 +462,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
460
462
|
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
|
|
461
463
|
):
|
|
462
464
|
count = len(train_segment)
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
|
|
466
|
-
target = train_segment[target_column].copy()
|
|
465
|
+
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
|
466
|
+
target = train_segment[target_column]
|
|
467
467
|
target_classes_count = target.nunique()
|
|
468
468
|
|
|
469
469
|
if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
|
|
@@ -473,12 +473,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
473
473
|
self.logger.warning(msg)
|
|
474
474
|
raise ValidationError(msg)
|
|
475
475
|
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
if current_class_count < min_class_count:
|
|
480
|
-
min_class_count = current_class_count
|
|
481
|
-
min_class_value = v
|
|
476
|
+
vc = target.value_counts()
|
|
477
|
+
min_class_value = vc.index[len(vc) - 1]
|
|
478
|
+
min_class_count = vc[min_class_value]
|
|
482
479
|
|
|
483
480
|
if min_class_count < self.MIN_TARGET_CLASS_ROWS:
|
|
484
481
|
msg = self.bundle.get("dataset_rarest_class_less_min").format(
|
|
@@ -491,53 +488,19 @@ class Dataset: # (pd.DataFrame):
|
|
|
491
488
|
min_class_threshold = min_class_percent * count
|
|
492
489
|
|
|
493
490
|
if min_class_count < min_class_threshold:
|
|
494
|
-
msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
|
|
495
|
-
min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
496
|
-
)
|
|
497
|
-
self.logger.warning(msg)
|
|
498
|
-
print(msg)
|
|
499
|
-
self.warning_counter.increment()
|
|
500
|
-
|
|
501
|
-
train_segment = train_segment.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
502
|
-
if self.task_type == ModelTaskType.MULTICLASS:
|
|
503
|
-
# Sort classes by rows count and find 25% quantile class
|
|
504
|
-
classes = target.value_counts().index
|
|
505
|
-
quantile25_idx = int(0.75 * len(classes))
|
|
506
|
-
quantile25_class = classes[quantile25_idx]
|
|
507
|
-
count_of_quantile25_class = len(target[target == quantile25_class])
|
|
508
|
-
msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
|
|
509
|
-
self.logger.warning(msg)
|
|
510
|
-
print(msg)
|
|
511
|
-
# 25% and lower classes will stay as is. Higher classes will be downsampled
|
|
512
|
-
parts = []
|
|
513
|
-
for class_idx in range(quantile25_idx):
|
|
514
|
-
sampled = train_segment[train_segment[target_column] == classes[class_idx]].sample(
|
|
515
|
-
n=count_of_quantile25_class, random_state=self.random_state
|
|
516
|
-
)
|
|
517
|
-
parts.append(sampled)
|
|
518
|
-
for class_idx in range(quantile25_idx, len(classes)):
|
|
519
|
-
parts.append(train_segment[train_segment[target_column] == classes[class_idx]])
|
|
520
|
-
resampled_data = pd.concat(parts)
|
|
521
|
-
elif self.task_type == ModelTaskType.BINARY and min_class_count < self.MIN_SAMPLE_THRESHOLD / 2:
|
|
522
|
-
minority_class = train_segment[train_segment[target_column] == min_class_value]
|
|
523
|
-
majority_class = train_segment[train_segment[target_column] != min_class_value]
|
|
524
|
-
sampled_majority_class = majority_class.sample(
|
|
525
|
-
n=self.MIN_SAMPLE_THRESHOLD - min_class_count, random_state=self.random_state
|
|
526
|
-
)
|
|
527
|
-
resampled_data = train_segment[
|
|
528
|
-
(train_segment[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
529
|
-
| (train_segment[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
530
|
-
]
|
|
531
|
-
else:
|
|
532
|
-
sampler = RandomUnderSampler(random_state=self.random_state)
|
|
533
|
-
X = train_segment[SYSTEM_RECORD_ID]
|
|
534
|
-
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
535
|
-
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
536
|
-
resampled_data = train_segment[train_segment[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
537
|
-
|
|
538
|
-
self.data = resampled_data
|
|
539
|
-
self.logger.info(f"Shape after rebalance resampling: {self.data.shape}")
|
|
540
491
|
self.imbalanced = True
|
|
492
|
+
self.data = balance_undersample(
|
|
493
|
+
df=train_segment,
|
|
494
|
+
target_column=target_column,
|
|
495
|
+
task_type=self.task_type,
|
|
496
|
+
random_state=self.random_state,
|
|
497
|
+
imbalance_threshold=self.IMBALANCE_THESHOLD,
|
|
498
|
+
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
|
499
|
+
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
|
500
|
+
logger=self.logger,
|
|
501
|
+
bundle=self.bundle,
|
|
502
|
+
warning_counter=self.warning_counter,
|
|
503
|
+
)
|
|
541
504
|
|
|
542
505
|
# Resample over fit threshold
|
|
543
506
|
if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|