upgini 1.1.264__tar.gz → 1.1.264a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.264/src/upgini.egg-info → upgini-1.1.264a1}/PKG-INFO +1 -1
- {upgini-1.1.264 → upgini-1.1.264a1}/setup.py +1 -1
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/features_enricher.py +22 -13
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/strings.properties +2 -2
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/datetime_utils.py +1 -49
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/deduplicate_utils.py +61 -18
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/target_utils.py +6 -2
- {upgini-1.1.264 → upgini-1.1.264a1/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_datetime_utils.py +2 -30
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_features_enricher.py +0 -2
- {upgini-1.1.264 → upgini-1.1.264a1}/LICENSE +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/README.md +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/pyproject.toml +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/setup.cfg +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/ads.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/dataset.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/errors.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/http.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/metadata.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/metrics.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/search_task.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/spinner.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_autofe_operands.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_country_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_email_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_metrics.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_target_utils.py +0 -0
- {upgini-1.1.264 → upgini-1.1.264a1}/tests/test_widget.py +0 -0
|
@@ -70,7 +70,6 @@ from upgini.utils.datetime_utils import (
|
|
|
70
70
|
DateTimeSearchKeyConverter,
|
|
71
71
|
is_blocked_time_series,
|
|
72
72
|
is_time_series,
|
|
73
|
-
validate_dates_distribution,
|
|
74
73
|
)
|
|
75
74
|
from upgini.utils.deduplicate_utils import (
|
|
76
75
|
clean_full_duplicates,
|
|
@@ -1686,6 +1685,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1686
1685
|
df = validated_X.copy()
|
|
1687
1686
|
|
|
1688
1687
|
df[TARGET] = validated_y
|
|
1688
|
+
|
|
1689
|
+
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1690
|
+
|
|
1689
1691
|
num_samples = _num_samples(df)
|
|
1690
1692
|
if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1691
1693
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
@@ -1920,6 +1922,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1920
1922
|
|
|
1921
1923
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1922
1924
|
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1925
|
+
# Don't pass
|
|
1923
1926
|
if email_converted_to_hem:
|
|
1924
1927
|
non_keys_columns.append(email_column)
|
|
1925
1928
|
|
|
@@ -1941,6 +1944,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1941
1944
|
if add_fit_system_record_id:
|
|
1942
1945
|
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1943
1946
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1947
|
+
non_keys_columns.append(SORT_ID)
|
|
1944
1948
|
|
|
1945
1949
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1946
1950
|
|
|
@@ -2217,10 +2221,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2217
2221
|
self.fit_search_keys = self.search_keys.copy()
|
|
2218
2222
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2219
2223
|
|
|
2220
|
-
validate_dates_distribution(
|
|
2221
|
-
validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
|
|
2222
|
-
)
|
|
2223
|
-
|
|
2224
2224
|
has_date = self._get_date_column(self.fit_search_keys) is not None
|
|
2225
2225
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2226
2226
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
@@ -2883,26 +2883,35 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2883
2883
|
|
|
2884
2884
|
# order by date and idempotent order by other keys
|
|
2885
2885
|
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2886
|
+
sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
|
|
2886
2887
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2887
2888
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2889
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
2888
2890
|
else:
|
|
2889
2891
|
date_column = self._get_date_column(search_keys)
|
|
2890
2892
|
sort_columns = [date_column] if date_column is not None else []
|
|
2891
2893
|
|
|
2892
|
-
|
|
2894
|
+
other_columns = sorted(
|
|
2893
2895
|
[
|
|
2894
|
-
|
|
2895
|
-
for
|
|
2896
|
-
if
|
|
2897
|
-
and
|
|
2898
|
-
and df[
|
|
2896
|
+
c
|
|
2897
|
+
for c in df.columns
|
|
2898
|
+
if c not in sort_columns
|
|
2899
|
+
and c not in sort_exclude_columns
|
|
2900
|
+
and df[c].nunique() > 1
|
|
2899
2901
|
]
|
|
2902
|
+
# [
|
|
2903
|
+
# sk
|
|
2904
|
+
# for sk, key_type in search_keys.items()
|
|
2905
|
+
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
2906
|
+
# and sk in df.columns
|
|
2907
|
+
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
2908
|
+
# ]
|
|
2900
2909
|
)
|
|
2901
2910
|
|
|
2902
2911
|
search_keys_hash = "search_keys_hash"
|
|
2903
|
-
if len(
|
|
2912
|
+
if len(other_columns) > 0:
|
|
2904
2913
|
sort_columns.append(search_keys_hash)
|
|
2905
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(df[
|
|
2914
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
|
|
2906
2915
|
|
|
2907
2916
|
df = df.sort_values(by=sort_columns)
|
|
2908
2917
|
|
|
@@ -111,7 +111,6 @@ x_is_empty=X is empty
|
|
|
111
111
|
y_is_empty=y is empty
|
|
112
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
113
|
missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
-
x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
|
|
115
114
|
# eval set validation
|
|
116
115
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
117
116
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -146,7 +145,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
|
|
|
146
145
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
147
146
|
dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
|
|
148
147
|
dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
149
|
-
|
|
148
|
+
dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
149
|
+
dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
150
150
|
dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
151
151
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
152
152
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import List, Optional
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
@@ -9,9 +9,7 @@ from dateutil.relativedelta import relativedelta
|
|
|
9
9
|
from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
10
10
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
|
-
from upgini.metadata import SearchKey
|
|
13
12
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
14
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
15
13
|
|
|
16
14
|
DATE_FORMATS = [
|
|
17
15
|
"%Y-%m-%d",
|
|
@@ -227,49 +225,3 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
227
225
|
|
|
228
226
|
is_diff_less_than_two_columns = grouped.apply(check_differences)
|
|
229
227
|
return is_diff_less_than_two_columns.all()
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def validate_dates_distribution(
|
|
233
|
-
X: pd.DataFrame,
|
|
234
|
-
search_keys: Dict[str, SearchKey],
|
|
235
|
-
logger: Optional[logging.Logger] = None,
|
|
236
|
-
bundle: Optional[ResourceBundle] = None,
|
|
237
|
-
warning_counter: Optional[WarningCounter] = None,
|
|
238
|
-
):
|
|
239
|
-
maybe_date_col = None
|
|
240
|
-
for key, key_type in search_keys.items():
|
|
241
|
-
if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
242
|
-
maybe_date_col = key
|
|
243
|
-
|
|
244
|
-
if maybe_date_col is None:
|
|
245
|
-
for col in X.columns:
|
|
246
|
-
if col in search_keys:
|
|
247
|
-
continue
|
|
248
|
-
try:
|
|
249
|
-
pd.to_datetime(X[col])
|
|
250
|
-
maybe_date_col = col
|
|
251
|
-
break
|
|
252
|
-
except Exception:
|
|
253
|
-
pass
|
|
254
|
-
|
|
255
|
-
if maybe_date_col is None:
|
|
256
|
-
return
|
|
257
|
-
|
|
258
|
-
dates = pd.to_datetime(X[maybe_date_col]).dt.date
|
|
259
|
-
|
|
260
|
-
date_counts = dates.value_counts().sort_index()
|
|
261
|
-
|
|
262
|
-
date_counts_1 = date_counts[: round(len(date_counts) / 2)]
|
|
263
|
-
date_counts_2 = date_counts[round(len(date_counts) / 2) :]
|
|
264
|
-
ratio = date_counts_2.mean() / date_counts_1.mean()
|
|
265
|
-
|
|
266
|
-
if ratio > 1.2 or ratio < 0.8:
|
|
267
|
-
if warning_counter is not None:
|
|
268
|
-
warning_counter.increment()
|
|
269
|
-
if logger is None:
|
|
270
|
-
logger = logging.getLogger("muted_logger")
|
|
271
|
-
logger.setLevel("FATAL")
|
|
272
|
-
bundle = bundle or get_custom_bundle()
|
|
273
|
-
msg = bundle.get("x_unstable_by_date")
|
|
274
|
-
print(msg)
|
|
275
|
-
logger.warning(msg)
|
|
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
6
|
+
from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
7
7
|
from upgini.resource_bundle import ResourceBundle
|
|
8
8
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
9
|
from upgini.utils.target_utils import define_task
|
|
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
|
|
|
78
78
|
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
79
79
|
if len(rows_with_diff_target) > 0:
|
|
80
80
|
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
logger
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
81
|
+
if EVAL_SET_INDEX not in df.columns:
|
|
82
|
+
rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
83
|
+
rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
|
|
84
|
+
perc = len(rows_to_remove) * 100 / len(df)
|
|
85
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
86
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
87
|
+
)
|
|
88
|
+
if not silent:
|
|
89
|
+
print(msg)
|
|
90
|
+
if logger:
|
|
91
|
+
logger.warning(msg)
|
|
92
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
93
|
+
df = df[~df.index.isin(rows_to_remove.index)]
|
|
94
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
|
+
else:
|
|
96
|
+
# Indices in train and eval_set can be the same so we remove rows from them separately
|
|
97
|
+
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
98
|
+
train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
99
|
+
train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
|
|
100
|
+
train_perc = len(train_rows_to_remove) * 100 / len(train)
|
|
101
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
102
|
+
train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
|
|
103
|
+
)
|
|
104
|
+
if not silent:
|
|
105
|
+
print(msg)
|
|
106
|
+
if logger:
|
|
107
|
+
logger.warning(msg)
|
|
108
|
+
logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
|
|
109
|
+
train = train[~train.index.isin(train_rows_to_remove.index)]
|
|
110
|
+
logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
|
|
111
|
+
|
|
112
|
+
evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
|
|
113
|
+
new_evals = []
|
|
114
|
+
for i, eval in enumerate(evals):
|
|
115
|
+
eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
116
|
+
eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
|
|
117
|
+
eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
|
|
118
|
+
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
119
|
+
eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
|
|
120
|
+
)
|
|
121
|
+
if not silent:
|
|
122
|
+
print(msg)
|
|
123
|
+
if logger:
|
|
124
|
+
logger.warning(msg)
|
|
125
|
+
logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
|
|
126
|
+
eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
|
|
127
|
+
logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
|
|
128
|
+
new_evals.append(eval)
|
|
129
|
+
|
|
130
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
131
|
+
df = pd.concat([train] + new_evals)
|
|
132
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
133
|
return df
|
|
96
134
|
|
|
97
135
|
|
|
@@ -101,14 +139,18 @@ def clean_full_duplicates(
|
|
|
101
139
|
nrows = len(df)
|
|
102
140
|
if nrows == 0:
|
|
103
141
|
return df
|
|
104
|
-
# Remove
|
|
142
|
+
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
105
143
|
unique_columns = df.columns.tolist()
|
|
106
144
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
107
145
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
108
146
|
if SORT_ID in unique_columns:
|
|
109
147
|
unique_columns.remove(SORT_ID)
|
|
148
|
+
if EVAL_SET_INDEX in unique_columns:
|
|
149
|
+
unique_columns.remove(EVAL_SET_INDEX)
|
|
110
150
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
111
|
-
|
|
151
|
+
# Train segment goes first so if duplicates are found in train and eval set
|
|
152
|
+
# then we keep unique rows in train segment
|
|
153
|
+
df = df.drop_duplicates(subset=unique_columns, keep="first")
|
|
112
154
|
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
113
155
|
nrows_after_full_dedup = len(df)
|
|
114
156
|
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
@@ -123,7 +165,7 @@ def clean_full_duplicates(
|
|
|
123
165
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
124
166
|
if marked_duplicates.sum() > 0:
|
|
125
167
|
dups_indices = df[marked_duplicates].index.to_list()
|
|
126
|
-
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
|
|
168
|
+
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
|
|
127
169
|
num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
|
|
128
170
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
129
171
|
|
|
@@ -133,6 +175,7 @@ def clean_full_duplicates(
|
|
|
133
175
|
print(msg)
|
|
134
176
|
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
135
177
|
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
178
|
+
|
|
136
179
|
return df
|
|
137
180
|
|
|
138
181
|
|
|
@@ -132,7 +132,9 @@ def balance_undersample(
|
|
|
132
132
|
class_value = classes[class_idx]
|
|
133
133
|
class_count = vc[class_value]
|
|
134
134
|
sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
|
|
135
|
-
sampler = RandomUnderSampler(
|
|
135
|
+
sampler = RandomUnderSampler(
|
|
136
|
+
sampling_strategy=sample_strategy, random_state=random_state
|
|
137
|
+
)
|
|
136
138
|
X = df[SYSTEM_RECORD_ID]
|
|
137
139
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
138
140
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
@@ -151,7 +153,9 @@ def balance_undersample(
|
|
|
151
153
|
minority_class = df[df[target_column] == min_class_value]
|
|
152
154
|
majority_class = df[df[target_column] != min_class_value]
|
|
153
155
|
sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
154
|
-
sampled_majority_class = majority_class.sample(
|
|
156
|
+
sampled_majority_class = majority_class.sample(
|
|
157
|
+
n=sample_size, random_state=random_state
|
|
158
|
+
)
|
|
155
159
|
resampled_data = df[
|
|
156
160
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
157
161
|
| (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
@@ -1,13 +1,7 @@
|
|
|
1
|
-
import numpy as np
|
|
2
1
|
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
3
|
|
|
4
|
-
from upgini.
|
|
5
|
-
from upgini.utils.datetime_utils import (
|
|
6
|
-
is_blocked_time_series,
|
|
7
|
-
is_time_series,
|
|
8
|
-
validate_dates_distribution,
|
|
9
|
-
)
|
|
10
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
4
|
+
from upgini.utils.datetime_utils import is_blocked_time_series, is_time_series
|
|
11
5
|
|
|
12
6
|
pd.set_option("mode.chained_assignment", "raise")
|
|
13
7
|
|
|
@@ -189,25 +183,3 @@ def test_multivariate_time_series():
|
|
|
189
183
|
assert not is_blocked_time_series(df, "date", ["date"])
|
|
190
184
|
|
|
191
185
|
assert is_blocked_time_series(df, "date", ["date", "feature3"])
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
def test_validate_dates_distribution():
|
|
195
|
-
df = pd.DataFrame({"date": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
|
|
196
|
-
warning_counter = WarningCounter()
|
|
197
|
-
validate_dates_distribution(df, {}, warning_counter=warning_counter)
|
|
198
|
-
assert warning_counter.has_warnings()
|
|
199
|
-
|
|
200
|
-
df = pd.DataFrame({"date": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
|
|
201
|
-
warning_counter = WarningCounter()
|
|
202
|
-
validate_dates_distribution(df, {}, warning_counter=warning_counter)
|
|
203
|
-
assert not warning_counter.has_warnings()
|
|
204
|
-
|
|
205
|
-
df = pd.DataFrame(
|
|
206
|
-
{
|
|
207
|
-
"date2": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
|
|
208
|
-
"date1": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
|
|
209
|
-
}
|
|
210
|
-
)
|
|
211
|
-
warning_counter = WarningCounter()
|
|
212
|
-
validate_dates_distribution(df, {"date1": SearchKey.DATE}, warning_counter=warning_counter)
|
|
213
|
-
assert warning_counter.has_warnings()
|
|
@@ -2164,8 +2164,6 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
|
|
|
2164
2164
|
|
|
2165
2165
|
actual_result_df = result_wrapper.df.sort_values(by="system_record_id").reset_index(drop=True)
|
|
2166
2166
|
# actual_result_df.to_parquet(expected_result_path)
|
|
2167
|
-
actual_result_df["phone_num_a54a33"] = actual_result_df["phone_num_a54a33"].astype("Int64")
|
|
2168
|
-
actual_result_df["rep_date_f5d6bb"] = actual_result_df["rep_date_f5d6bb"].astype("Int64")
|
|
2169
2167
|
assert_frame_equal(actual_result_df, expected_result_df)
|
|
2170
2168
|
|
|
2171
2169
|
for i in range(5):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|