upgini 1.1.242a3__tar.gz → 1.1.244a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of upgini might be problematic.
- {upgini-1.1.242a3/src/upgini.egg-info → upgini-1.1.244a1}/PKG-INFO +1 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/setup.py +1 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/ads_management/ads_manager.py +0 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/feature.py +12 -5
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/data_source/data_source_publisher.py +1 -6
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/dataset.py +20 -4
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/errors.py +0 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/features_enricher.py +31 -38
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/http.py +24 -14
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/mdc/__init__.py +1 -2
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/mdc/context.py +1 -5
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/normalizer/phone_normalizer.py +3 -4
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/resource_bundle/exceptions.py +0 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/base.py +3 -9
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/random_under_sampler.py +1 -3
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/search_task.py +4 -10
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/spinner.py +1 -7
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/country_utils.py +3 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/datetime_utils.py +16 -3
- upgini-1.1.244a1/src/upgini/utils/deduplicate_utils.py +82 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/email_utils.py +0 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/fallback_progress_bar.py +5 -8
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/warning_counter.py +0 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/SOURCES.txt +1 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_datetime_utils.py +36 -30
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_email_utils.py +1 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_etalon_validation.py +13 -12
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_widget.py +1 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/LICENSE +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/README.md +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/pyproject.toml +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/setup.cfg +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/ads.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/metadata.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/metrics.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/ip_utils.py +1 -1
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_country_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_metrics.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.242a3 → upgini-1.1.244a1}/tests/test_postal_code_utils.py +0 -0
src/upgini/autofe/feature.py

@@ -53,9 +53,15 @@ class Column:


 class Feature:
-    def __init__(
-        …
+    def __init__(
+        self,
+        op: Operand,
+        children: List[Union[Column, "Feature"]],
+        data: Optional[pd.DataFrame] = None,
+        display_index: Optional[str] = None,
+        cached_display_name: Optional[str] = None,
+        alias: Optional[str] = None,
+    ):
         self.op = op
         self.children = children
         self.data = data

@@ -258,8 +264,9 @@ class Feature:


 class FeatureGroup:
-    def __init__(
-        …
+    def __init__(
+        self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
+    ):
         self.op = op
         self.main_column_node = main_column
         self.children = children
src/upgini/data_source/data_source_publisher.py

@@ -31,7 +31,6 @@ class OnlineUploadingType(Enum):


 class DataSourcePublisher:
-
     FINAL_STATUSES = ["COMPLETED", "FAILED", "TIMED_OUT"]
     DEFAULT_GENERATE_EMBEDDINGS = []


@@ -259,11 +258,7 @@ class DataSourcePublisher:
         except Exception:
             self.logger.exception(f"Failed to deactivate data tables {data_table_ids} for clients {client_emails}")

-    def upload_online(
-        self,
-        bq_table_id: Optional[str] = None,
-        search_keys: Optional[List[SearchKey]] = None
-    ):
+    def upload_online(self, bq_table_id: Optional[str] = None, search_keys: Optional[List[SearchKey]] = None):
         trace_id = str(uuid.uuid4())
         with MDC(trace_id=trace_id):
             if bq_table_id is None and search_keys is None:
src/upgini/dataset.py

@@ -36,12 +36,14 @@ from upgini.metadata import (
     NumericInterval,
     RuntimeParameters,
     SearchCustomization,
+    SearchKey,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
+from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.email_utils import EmailSearchKeyConverter

 try:
@@ -346,9 +348,11 @@ class Dataset:  # (pd.DataFrame):

         ipv6 = ip + "_v6"
         self.data[ipv6] = (
-            self.data[ip]
-            …
+            self.data[ip]
+            .apply(self._to_ipv6)
+            .apply(self.__ip_to_int)
+            .astype("string")
+            .str.replace(".0", "", regex=False)
         )
         self.data = self.data.drop(columns=ip)
         self.meaning_types[ipv6] = FileColumnMeaningType.IPV6_ADDRESS
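The rewritten chain makes the IPv6 normalization explicit: convert to an IPv6 representation, take the integer form, cast to pandas' string dtype, and strip the float-style ".0" tail that can appear along the way. A rough standalone equivalent (the to_ipv6 helper here is an assumption; upgini's private _to_ipv6 and __ip_to_int may differ in detail):

    import ipaddress
    import pandas as pd

    def to_ipv6(ip: str) -> ipaddress.IPv6Address:
        # Assumed behaviour: lift IPv4 into the IPv4-mapped IPv6 space, keep IPv6 as-is.
        addr = ipaddress.ip_address(ip)
        return ipaddress.IPv6Address(f"::ffff:{addr}") if addr.version == 4 else addr

    df = pd.DataFrame({"ip": ["192.168.1.1", "::cf:befe:525b"]})
    df["ip_v6"] = (
        df["ip"]
        .apply(to_ipv6)
        .apply(int)                          # integer form of the address
        .astype("string")
        .str.replace(".0", "", regex=False)  # drop a float-style ".0" tail if present
    )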
@@ -811,7 +815,19 @@ class Dataset:  # (pd.DataFrame):

         self.__convert_features_types()

-
+        search_keys = {
+            col: SearchKey.from_meaning_type(key_type)
+            for col, key_type in self.meaning_types.items()
+            if SearchKey.from_meaning_type(key_type) is not None
+        }
+
+        if validate_target:
+            need_full_deduplication, self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
+        else:
+            need_full_deduplication = True
+
+        if need_full_deduplication:
+            self.__clean_duplicates(silent_mode)

         self.__validate_dataset(validate_target, silent_mode)

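The return contract drives the gating above: remove_fintech_duplicates hands back a (need_full_deduplication, df) pair, and True means the frame did not qualify for the fintech-specific treatment (non-binary target, no date key, no personal keys, or mostly distinct dates per person), so the generic __clean_duplicates pass must still run. A minimal caller, with illustrative column names:

    from upgini.metadata import SearchKey
    from upgini.utils.deduplicate_utils import remove_fintech_duplicates

    search_keys = {"phone": SearchKey.PHONE, "app_date": SearchKey.DATE}
    need_full_dedup, data = remove_fintech_duplicates(data, search_keys, logger)
    if need_full_dedup:
        data = data.drop_duplicates()  # stand-in for the private __clean_duplicates pass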
src/upgini/features_enricher.py

@@ -64,6 +64,7 @@ from upgini.utils.datetime_utils import (
     is_blocked_time_series,
     is_time_series,
 )
+from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.display_utils import (
     display_html_dataframe,
     do_without_pandas_limits,

@@ -297,8 +298,9 @@ class FeaturesEnricher(TransformerMixin):
     def _set_api_key(self, api_key: str):
         self._api_key = api_key
         if self.logs_enabled:
-            self.logger = LoggerFactory().get_logger(
-                …
+            self.logger = LoggerFactory().get_logger(
+                self.endpoint, self._api_key, self.client_ip, self.client_visitorid
+            )

     api_key = property(_get_api_key, _set_api_key)


@@ -856,7 +858,7 @@ class FeaturesEnricher(TransformerMixin):

         if X is not None and y is None:
             raise ValidationError("X passed without y")
-
+
         effective_X = X if X is not None else self.X
         effective_eval_set = eval_set if eval_set is not None else self.eval_set


@@ -1200,8 +1202,8 @@ class FeaturesEnricher(TransformerMixin):
         converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
         extended_X = converter.convert(extended_X, keep_time=True)
         generated_features.extend(converter.generated_features)
-        email_column = self.…
-        hem_column = self.…
+        email_column = self._get_email_column(search_keys)
+        hem_column = self._get_hem_column(search_keys)
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
             extended_X = converter.convert(extended_X)

@@ -1469,7 +1471,7 @@ class FeaturesEnricher(TransformerMixin):

         original_df_sampled = self.df_with_original_index[
             self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
-        …
+        ]
         enriched_X = drop_existing_columns(enriched_Xy, TARGET)
         if EVAL_SET_INDEX in original_df_sampled.columns:
             Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")

@@ -1525,6 +1527,10 @@ class FeaturesEnricher(TransformerMixin):
                 eval_df_with_index[EVAL_SET_INDEX] = idx + 1
                 df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])

+            _, df_with_eval_set_index = remove_fintech_duplicates(
+                df_with_eval_set_index, self.search_keys, self.logger, silent=True
+            )
+
             # downsample if need to eval_set threshold
             num_samples = _num_samples(df_with_eval_set_index)
             if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:

@@ -1534,9 +1540,7 @@ class FeaturesEnricher(TransformerMixin):
                 )

                 X_sampled = (
-                    df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0")
-                    .copy()
-                    .drop(columns=[EVAL_SET_INDEX, TARGET])
+                    df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy().drop(columns=[EVAL_SET_INDEX, TARGET])
                 )
                 X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
                 y_sampled = df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy()[TARGET]

@@ -1760,8 +1764,8 @@ class FeaturesEnricher(TransformerMixin):
             generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
-        email_column = self.…
-        hem_column = self.…
+        email_column = self._get_email_column(search_keys)
+        hem_column = self._get_hem_column(search_keys)
         email_converted_to_hem = False
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)

@@ -1883,9 +1887,7 @@ class FeaturesEnricher(TransformerMixin):
                 progress = self.get_progress(trace_id, validation_task)
             except KeyboardInterrupt as e:
                 print(bundle.get("search_stopping"))
-                self.rest_client.stop_search_task_v2(
-                    trace_id, validation_task.search_task_id
-                )
+                self.rest_client.stop_search_task_v2(trace_id, validation_task.search_task_id)
                 self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
                 print(bundle.get("search_stopped"))
                 raise e

@@ -2098,8 +2100,8 @@ class FeaturesEnricher(TransformerMixin):
             self.fit_generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
-        email_column = self.…
-        hem_column = self.…
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
         email_converted_to_hem = False
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)

@@ -2481,21 +2483,6 @@ class FeaturesEnricher(TransformerMixin):
                 raise ValidationError(bundle.get("y_is_constant_eval_set"))

         return validated_eval_X, validated_eval_y
-
-    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
-        if self.baseline_score_column is not None:
-            if self.baseline_score_column not in X.columns:
-                raise ValidationError(bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column))
-            if X[self.baseline_score_column].isna().any():
-                raise ValidationError(bundle.get("baseline_score_column_has_na"))
-            if eval_set is not None:
-                if isinstance(eval_set, tuple):
-                    eval_set = [eval_set]
-                for eval in eval_set:
-                    if self.baseline_score_column not in eval[0].columns:
-                        raise ValidationError(bundle.get("baseline_score_column_not_exists"))
-                    if eval[0][self.baseline_score_column].isna().any():
-                        raise ValidationError(bundle.get("baseline_score_column_has_na"))

     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
         if self.baseline_score_column is not None:

@@ -2660,17 +2647,23 @@ class FeaturesEnricher(TransformerMixin):
         return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]

     @staticmethod
-    def …
+    def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.EMAIL:
                 return col

     @staticmethod
-    def …
+    def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.HEM:
                 return col

+    @staticmethod
+    def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+        for col, t in search_keys.items():
+            if t == SearchKey.PHONE:
+                return col
+
     def __add_fit_system_record_id(
         self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
     ) -> pd.DataFrame:

@@ -2785,9 +2778,9 @@ class FeaturesEnricher(TransformerMixin):
         result_features.index.name = original_index_name

         if rows_to_drop is not None:
-            …
+            self.logger.info(f"Before dropping target outliers size: {len(result_features)}")
             result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
-            …
+            self.logger.info(f"After dropping target outliers size: {len(result_features)}")

         result_eval_sets = dict()
         if not is_transform and EVAL_SET_INDEX in result_features.columns:

@@ -2995,9 +2988,9 @@ class FeaturesEnricher(TransformerMixin):
                 self.logger.warning(f"Feature meta for display index {m.display_index} not found")
                 continue
             description["shap"] = feature_meta.shap_value
-            description["Sources"] = feature_meta.data_source
-            …
+            description["Sources"] = feature_meta.data_source.replace("AutoFE: features from ", "").replace(
+                "AutoFE: feature from ", ""
+            )
             description["Feature name"] = feature_meta.name

             feature_idx = 1
src/upgini/http.py

@@ -308,7 +308,6 @@ class _RestClient:
         # self.silent_mode = silent_mode
         self.client_ip = client_ip
         self.client_visitorid = client_visitorid
-        print(f"Created RestClient with {client_ip} and {client_visitorid}")
         self._access_token = self._refresh_access_token()
         # self._access_token: Optional[str] = None  # self._refresh_access_token()
         self.last_refresh_time = time.time()

@@ -442,9 +441,7 @@ class _RestClient:
     ) -> SearchTaskResponse:
         api_path = self.INITIAL_SEARCH_URI_FMT_V2

-        print(f"Start initial search with {self.client_ip} and {self.client_visitorid}")
         track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
-        print(f"Sending track metrics: {track_metrics}")

         def open_and_send():
             md5_hash = hashlib.md5()

@@ -486,7 +483,7 @@ class _RestClient:
                 api_path, files, trace_id=trace_id, additional_headers=additional_headers
             )

-        response = self._with_unauth_retry(…
+        response = self._with_unauth_retry(open_and_send)
         return SearchTaskResponse(response)

     def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:

@@ -571,7 +568,7 @@ class _RestClient:
                 api_path, files, trace_id=trace_id, additional_headers=additional_headers
             )

-        response = self._with_unauth_retry(…
+        response = self._with_unauth_retry(open_and_send)
         return SearchTaskResponse(response)

     def validation_search_without_upload_v2(

@@ -912,8 +909,12 @@ def resolve_api_token(api_token: Optional[str]) -> str:
     return DEMO_API_KEY


-def get_rest_client(
-    …
+def get_rest_client(
+    backend_url: Optional[str] = None,
+    api_token: Optional[str] = None,
+    client_ip: Optional[str] = None,
+    client_visitorid: Optional[str] = None,
+) -> _RestClient:
     url = _resolve_backend_url(backend_url)
     token = resolve_api_token(api_token)


@@ -925,15 +926,21 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:


 @lru_cache()
-def _get_rest_client(
-    …
+def _get_rest_client(
+    backend_url: str, api_token: str, client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
+) -> _RestClient:
     return _RestClient(backend_url, api_token, client_ip, client_visitorid)


 class BackendLogHandler(logging.Handler):
-    def __init__(
-        …
+    def __init__(
+        self,
+        rest_client: _RestClient,
+        client_ip: Optional[str] = None,
+        client_visitorid: Optional[str] = None,
+        *args,
+        **kwargs,
+    ) -> None:
         super().__init__(*args, **kwargs)
         self.rest_client = rest_client
         self.track_metrics = None
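Because _get_rest_client is wrapped in @lru_cache(), one _RestClient is created and reused per distinct (backend_url, api_token, client_ip, client_visitorid) tuple, so threading the two new client-identification arguments through changes the cache key. A minimal illustration of that caching behaviour (the factory body and URL are stand-ins):

    from functools import lru_cache

    @lru_cache()
    def _get_client(backend_url, api_token, client_ip=None, client_visitorid=None):
        return object()  # stand-in for _RestClient(backend_url, api_token, client_ip, client_visitorid)

    a = _get_client("https://example.backend", "token")
    b = _get_client("https://example.backend", "token")
    c = _get_client("https://example.backend", "token", client_ip="1.2.3.4")
    assert a is b       # identical arguments -> cached instance reused
    assert a is not c   # new client_ip -> new cache entry, fresh client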
@@ -987,8 +994,11 @@ class LoggerFactory:
             root.handlers.clear()

     def get_logger(
-        self,
-        …
+        self,
+        backend_url: Optional[str] = None,
+        api_token: Optional[str] = None,
+        client_ip: Optional[str] = None,
+        client_visitorid: Optional[str] = None,
     ) -> logging.Logger:
         url = _resolve_backend_url(backend_url)
         token = resolve_api_token(api_token)
src/upgini/mdc/__init__.py

@@ -3,8 +3,7 @@
 .. module: mdc
 .. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
 """
-from __future__ import …
-    unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals

 import logging

src/upgini/mdc/context.py

@@ -32,9 +32,7 @@ def get_mdc_fields():

 @contextmanager
 def new_log_context(**kwargs):
-    context_id = "mdc-{thread}-{context}".format(
-        thread=threading.current_thread().ident, context=uuid.uuid4()
-    )
+    context_id = "mdc-{thread}-{context}".format(thread=threading.current_thread().ident, context=uuid.uuid4())

     LOGGER.debug("creating context %s", context_id)


@@ -48,11 +46,9 @@ def new_log_context(**kwargs):
         setattr(context, key, value)

     try:
-
         yield context

     finally:
-
         LOGGER.debug("deleting context %s", context_id)

         try:
src/upgini/normalizer/phone_normalizer.py

@@ -7,7 +7,6 @@ from upgini.errors import ValidationError


 class PhoneNormalizer:
-
     def __init__(self, df: pd.DataFrame, phone_column_name: str, country_column_name: Optional[str] = None):
         self.df = df
         self.phone_column_name = phone_column_name

@@ -78,7 +77,7 @@ class PhoneNormalizer:
         try:
             value = str(value)
             if value.endswith(".0"):
-                value = value[:len(value) - 2]
+                value = value[: len(value) - 2]
             numeric_filter = filter(str.isdigit, value)
             numeric_string = "".join(numeric_filter)
             return PhoneNormalizer.validate_length(int(numeric_string))
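The slice change above is purely stylistic; the surrounding logic first strips a float-style ".0" suffix (phones often arrive as floats after a pandas read) and then keeps only the digits before validating length. A compact standalone equivalent:

    def digits_only(value) -> str:
        value = str(value)
        if value.endswith(".0"):
            value = value[: len(value) - 2]
        return "".join(filter(str.isdigit, value))

    assert digits_only(79991234567.0) == "79991234567"
    assert digits_only("+7 (999) 123-45-67") == "79991234567"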
@@ -337,5 +336,5 @@ class PhoneNormalizer:
     "PF": ("689", 7),
     "TK": ("690", 7),
     "FM": ("691", 7),
-    "MH": ("692", 7)
-    …
+    "MH": ("692", 7),
+}
src/upgini/resource_bundle/strings.properties

@@ -144,6 +144,7 @@ dataset_empty_column_names=Some column names are empty. Add names please
 dataset_too_long_column_name=Column {} is too long: {} characters. Remove this column or trim length to 50 characters
 dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
+dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
 dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
 dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
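The new dataset_diff_target_duplicates_fintech key carries three placeholders (a {:.4f} percentage, a row count, and an index list), matching the .format(...) call in the new deduplicate_utils.py further down. Lookup follows the existing resource-bundle pattern (values illustrative):

    from upgini.resource_bundle import bundle

    msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
        0.1234,        # percentage of affected rows
        5,             # number of removed rows
        [10, 42, 77],  # removed row indexes
    )
    print(msg)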
src/upgini/sampler/base.py

@@ -47,9 +47,7 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
             Return the instance itself.
         """
         X, y, _ = self._check_X_y(X, y)
-        self.sampling_strategy_ = check_sampling_strategy(
-            self.sampling_strategy, y, self._sampling_type
-        )
+        self.sampling_strategy_ = check_sampling_strategy(self.sampling_strategy, y, self._sampling_type)
         return self

     def fit_resample(self, X, y):

@@ -77,15 +75,11 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
         arrays_transformer = ArraysTransformer(X, y)
         X, y, binarize_y = self._check_X_y(X, y)

-        self.sampling_strategy_ = check_sampling_strategy(
-            self.sampling_strategy, y, self._sampling_type
-        )
+        self.sampling_strategy_ = check_sampling_strategy(self.sampling_strategy, y, self._sampling_type)

         output = self._fit_resample(X, y)

-        y_ = (
-            label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1]
-        )
+        y_ = label_binarize(output[1], classes=np.unique(y)) if binarize_y else output[1]

         X_, y_ = arrays_transformer.transform(output[0], y_)
         return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
src/upgini/sampler/random_under_sampler.py

@@ -76,9 +76,7 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
     """

     @_deprecate_positional_args
-    def __init__(
-        self, *, sampling_strategy="auto", random_state=None, replacement=False
-    ):
+    def __init__(self, *, sampling_strategy="auto", random_state=None, replacement=False):
         super().__init__(sampling_strategy=sampling_strategy)
         self.random_state = random_state
         self.replacement = replacement
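The collapsed signature is cosmetic; the vendored sampler keeps its imbalanced-learn style API, where fit_resample downsamples the majority class to the minority size under the default sampling_strategy="auto". A typical call on a toy binary target:

    import numpy as np
    from upgini.sampler.random_under_sampler import RandomUnderSampler

    X = np.arange(20).reshape(10, 2)
    y = np.array([0] * 8 + [1] * 2)            # 8 vs 2: imbalanced binary target
    sampler = RandomUnderSampler(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)  # both classes end up with 2 samples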
src/upgini/search_task.py

@@ -79,16 +79,12 @@ class SearchTask:
         with Spinner():
             if self.PROTECT_FROM_RATE_LIMIT:
                 time.sleep(1)  # this is neccesary to avoid requests rate limit restrictions
-            self.summary = self.rest_client.search_task_summary_v2(
-                trace_id, search_task_id
-            )
+            self.summary = self.rest_client.search_task_summary_v2(trace_id, search_task_id)
             while self.summary.status not in completed_statuses and (
                 not check_fit or "VALIDATION" not in self.summary.status
             ):
                 time.sleep(self.POLLING_DELAY_SECONDS)
-                self.summary = self.rest_client.search_task_summary_v2(
-                    trace_id, search_task_id
-                )
+                self.summary = self.rest_client.search_task_summary_v2(trace_id, search_task_id)
             if self.summary.status in failed_statuses:
                 self.logger.error(f"Search {search_task_id} failed with status {self.summary.status}")
                 raise RuntimeError(bundle.get("search_task_failed_status"))
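Both collapsed calls live inside a poll-until-terminal loop: fetch the summary, sleep POLLING_DELAY_SECONDS, fetch again, and bail out with an error on a failed status. Reduced to its shape (a sketch; get_summary stands in for the rest-client call):

    import time

    def poll_until_done(get_summary, completed_statuses, failed_statuses, delay_seconds=5):
        summary = get_summary()
        while summary.status not in completed_statuses:
            time.sleep(delay_seconds)
            summary = get_summary()
        if summary.status in failed_statuses:
            raise RuntimeError(f"search failed with status {summary.status}")
        return summary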
@@ -130,9 +126,7 @@ class SearchTask:
         for provider_summary in self.summary.initial_important_providers:
             if provider_summary.status == "COMPLETED":
                 self.provider_metadata_v2.append(
-                    self.rest_client.get_provider_search_metadata_v3(
-                        provider_summary.ads_search_task_id, trace_id
-                    )
+                    self.rest_client.get_provider_search_metadata_v3(provider_summary.ads_search_task_id, trace_id)
                 )
                 if provider_summary.unused_features_for_generation is not None:
                     self.unused_features_for_generation.extend(provider_summary.unused_features_for_generation)

@@ -271,7 +265,7 @@ class SearchTask:
             self.rest_client._refresh_token,
             trace_id,
             self.search_task_id,
-            self.PROTECT_FROM_RATE_LIMIT
+            self.PROTECT_FROM_RATE_LIMIT,
         )

     def get_max_initial_eval_set_hit_rate_v2(self) -> Optional[Dict[int, float]]:
src/upgini/utils/country_utils.py

@@ -22,7 +22,9 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
             return df

         df[country_column] = (
-            df[country_column]…
+            df[country_column]
+            .astype("string")
+            .str.upper()
             .map(CountrySearchKeyDetector.COUNTRIES)
             .fillna(df[country_column])
         )
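The inserted .astype("string").str.upper() steps make the country lookup case-insensitive before the name-to-code mapping, while .fillna(...) leaves values without a mapping untouched. The same chain on a toy frame (the dict is an illustrative subset of the detector's COUNTRIES table):

    import pandas as pd

    COUNTRIES = {"UNITED KINGDOM": "GB", "GERMANY": "DE"}  # illustrative subset
    df = pd.DataFrame({"country": ["United Kingdom", "germany", "Narnia"]})
    df["country"] = (
        df["country"]
        .astype("string")
        .str.upper()
        .map(COUNTRIES)
        .fillna(df["country"])  # unmapped values keep their original spelling
    )
    # -> ["GB", "DE", "Narnia"]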
src/upgini/utils/datetime_utils.py

@@ -61,9 +61,22 @@ class DateTimeSearchKeyConverter:
         elif is_period_dtype(df[self.date_column]):
             df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
         elif is_numeric_dtype(df[self.date_column]):
-            …
+            # 315532801 - 2524608001 - seconds
+            # 315532801000 - 2524608001000 - milliseconds
+            # 315532801000000 - 2524608001000000 - microseconds
+            # 315532801000000000 - 2524608001000000000 - nanoseconds
+            if df[self.date_column].apply(lambda x: 10**16 < x).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
+            elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
+            elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
+            elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
+                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
+            else:
+                msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
+                self.logger.warning(msg)
+                raise ValidationError(msg)

         # If column with date is datetime then extract seconds of the day and minute of the hour
         # as additional features
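The new branch infers the epoch unit from the magnitude of the values; the comment block lists the second/millisecond/microsecond/nanosecond ranges that correspond to dates between 1980 and 2050. The same heuristic as a standalone sketch:

    import pandas as pd

    def to_datetime_guess_unit(values: pd.Series) -> pd.Series:
        # Magnitude-based unit detection, mirroring the branch above.
        if (values > 10**16).all():
            return pd.to_datetime(values, unit="ns")
        if values.between(10**14, 10**16, inclusive="neither").all():
            return pd.to_datetime(values, unit="us")
        if values.between(10**11, 10**14, inclusive="neither").all():
            return pd.to_datetime(values, unit="ms")
        if values.between(0, 10**11, inclusive="neither").all():
            return pd.to_datetime(values, unit="s")
        raise ValueError("unsupported numeric date column; convert to datetime first")

    print(to_datetime_guess_unit(pd.Series([1_600_000_000])))      # seconds -> 2020-09-13
    print(to_datetime_guess_unit(pd.Series([1_600_000_000_000])))  # milliseconds, same instant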
src/upgini/utils/deduplicate_utils.py (new file)

@@ -0,0 +1,82 @@
+from logging import Logger
+from typing import Dict, List, Optional, Tuple, Union
+
+import pandas as pd
+
+from upgini.metadata import TARGET, ModelTaskType, SearchKey
+from upgini.resource_bundle import bundle
+from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+from upgini.utils.target_utils import define_task
+
+
+def remove_fintech_duplicates(
+    df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: Optional[Logger] = None, silent=False
+) -> Tuple[bool, pd.DataFrame]:
+    # Base checks
+    need_full_deduplication = True
+
+    if define_task(df[TARGET], silent=True) != ModelTaskType.BINARY:
+        return need_full_deduplication, df
+
+    date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+    if date_col is None:
+        return need_full_deduplication, df
+
+    personal_cols = []
+    phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
+    if phone_col:
+        personal_cols.append(phone_col)
+    email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
+    if email_col:
+        personal_cols.append(email_col)
+    hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
+    if hem_col:
+        personal_cols.append(hem_col)
+    if len(personal_cols) == 0:
+        return need_full_deduplication, df
+
+    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+
+    uniques = grouped_by_personal_cols[date_col].nunique()
+    total = len(uniques)
+    diff_dates = len(uniques[uniques > 1])
+    if diff_dates / total >= 0.6:
+        return need_full_deduplication, df
+
+    # Additional checks
+
+    need_full_deduplication = False
+
+    duplicates = df.duplicated(personal_cols, keep=False)
+    duplicate_rows = df[duplicates]
+    if len(duplicate_rows) == 0:
+        return need_full_deduplication, df
+
+    if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
+        return need_full_deduplication, df
+
+    def has_diff_target_within_60_days(rows):
+        rows = rows.sort_values(by=date_col)
+        return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
+
+    df = DateTimeSearchKeyConverter(date_col).convert(df)
+    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
+    if len(rows_with_diff_target) > 0:
+        perc = len(rows_with_diff_target) * 100 / len(df)
+        msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
+            perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list()
+        )
+        if not silent:
+            print(msg)
+        if logger:
+            logger.warning(msg)
+        df = df[~df.index.isin(rows_with_diff_target.index)]
+
+    return need_full_deduplication, df
+
+
+def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
+    for col, key_type in search_keys.items():
+        if (isinstance(keys, list) and key_type in keys) or key_type == keys:
+            return col
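In short, the helper only prunes rows when the frame looks like a credit-scoring dataset: binary target, a date key, at least one personal key, and mostly repeated dates per person; within such groups it drops rows whose target flips within a 60-day window. A toy run (column names are illustrative; the "target" column name follows the TARGET constant):

    import pandas as pd
    from upgini.metadata import SearchKey
    from upgini.utils.deduplicate_utils import remove_fintech_duplicates

    df = pd.DataFrame({
        "phone": ["79991234567", "79991234567", "79997654321"],
        "app_date": ["2021-01-01", "2021-01-15", "2021-02-01"],
        "target": [0, 1, 0],  # same client, target flips within 60 days
    })
    search_keys = {"phone": SearchKey.PHONE, "app_date": SearchKey.DATE}
    need_full_dedup, cleaned = remove_fintech_duplicates(df, search_keys)
    # need_full_dedup is False; the two conflicting rows are expected to be dropped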
src/upgini/utils/fallback_progress_bar.py

@@ -2,8 +2,8 @@ from typing import Tuple


 class CustomFallbackProgressBar:
-    """Progressbar supports displaying a progressbar like element
-    …
+    """Progressbar supports displaying a progressbar like element"""
+
     def __init__(self, total=100):
         """Creates a new progressbar


@@ -20,12 +20,9 @@ class CustomFallbackProgressBar:

     def __repr__(self):
         fraction = self.progress / self.total
-        filled = …
-        rest = …
-        return …(
-            filled, rest,
-            self.progress, self._stage, self._eta
-        )
+        filled = "=" * int(fraction * self.text_width)
+        rest = " " * (self.text_width - len(filled))
+        return "[{}{}] {}% {} {}".format(filled, rest, self.progress, self._stage, self._eta)

     def display(self):
         print(self)
src/upgini.egg-info/SOURCES.txt

@@ -49,6 +49,7 @@ src/upgini/utils/country_utils.py
 src/upgini/utils/custom_loss_utils.py
 src/upgini/utils/cv_utils.py
 src/upgini/utils/datetime_utils.py
+src/upgini/utils/deduplicate_utils.py
 src/upgini/utils/display_utils.py
 src/upgini/utils/email_utils.py
 src/upgini/utils/fallback_progress_bar.py
tests/test_datetime_utils.py

@@ -127,7 +127,8 @@ def test_multivariate_timeseries_detection():


 def test_multivariate_time_series():
-    df = pd.DataFrame(…
+    df = pd.DataFrame(
+        {
             "date": [
                 "2020-01-01 00:00:00",
                 "2020-01-01 00:00:02",

@@ -135,44 +136,49 @@ def test_multivariate_time_series():
                 "2020-01-01 00:00:06",
                 "2020-01-01 00:00:08",
             ]
-    }
+        }
+    )
     assert not is_blocked_time_series(df, "date", ["date"])

-    df = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2020-02-01")
-    })
+    df = pd.DataFrame({"date": pd.date_range("2020-01-01", "2020-02-01")})
     assert not is_blocked_time_series(df, "date", ["date"])

-    df = pd.DataFrame({
-        "date": pd.date_range("2020-01-01", "2021-01-01")
-    })
+    df = pd.DataFrame({"date": pd.date_range("2020-01-01", "2021-01-01")})
     assert is_blocked_time_series(df, "date", ["date"])

-    df1 = pd.DataFrame(
-        …
+    df1 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+        }
+    )
+    df2 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+        }
+    )
     df = pd.concat([df1, df2])
     assert is_blocked_time_series(df, "date", ["date"])

-    df1 = pd.DataFrame(
-        …
+    df1 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+            "feature3": np.random.randint(0, 1000, 367),
+        }
+    )
+    df2 = pd.DataFrame(
+        {
+            "date": pd.date_range("2020-01-01", "2021-01-01"),
+            "feature1": np.random.randint(0, 1000, 367),
+            "feature2": np.random.randint(0, 1000, 367),
+            "feature3": np.random.randint(0, 1000, 367),
+        }
+    )
     df = pd.concat([df1, df2])
     assert not is_blocked_time_series(df, "date", ["date"])

tests/test_email_utils.py

@@ -58,7 +58,7 @@ def test_convertion_to_hem():
             None,
             None,
             None,
-            None
+            None,
         ],
         EmailSearchKeyConverter.EMAIL_ONE_DOMAIN_COLUMN_NAME: ["tgoogle.com", None, None, None, None, None],
         EmailSearchKeyConverter.DOMAIN_COLUMN_NAME: ["google.com", None, None, None, None, None],
tests/test_etalon_validation.py

@@ -58,9 +58,14 @@ def test_string_ip_to_int_conversion():
             {"ip": None},
         ]
     )
-    dataset = Dataset(
-        "…
+    dataset = Dataset(
+        "test",
+        df=df,
+        search_keys=[("ip",)],
+        meaning_types={
+            "ip": FileColumnMeaningType.IP_ADDRESS,
+        },
+    )
     dataset._Dataset__rename_columns()
     dataset._Dataset__convert_ip()
     assert dataset.data["ip_bb9af5_v4"].dtype == "Int64"

@@ -77,7 +82,7 @@ def test_python_ip_to_int_conversion():
             {"ip": ipaddress.ip_address("192.168.1.1")},
         ]
     )
-    dataset = Dataset("test", df=df, search_keys=[("ip",…
+    dataset = Dataset("test", df=df, search_keys=[("ip",)])
     dataset.meaning_types = {
         "ip": FileColumnMeaningType.IP_ADDRESS,
     }

@@ -91,7 +96,7 @@ def test_python_ip_to_int_conversion():

 def test_ip_v6_conversion():
     df = pd.DataFrame({"ip": ["::cf:befe:525b"]})
-    dataset = Dataset("test", df=df, search_keys=[("ip",…
+    dataset = Dataset("test", df=df, search_keys=[("ip",)])
     dataset.meaning_types = {
         "ip": FileColumnMeaningType.IP_ADDRESS,
     }

@@ -107,7 +112,7 @@ def test_int_ip_to_int_conversion():
     df = pd.DataFrame(
         {"ip": [3232235777, 892262568539]},
     )
-    dataset = Dataset("test", df=df, search_keys=[("ip",…
+    dataset = Dataset("test", df=df, search_keys=[("ip",)])  # type: ignore
     dataset.meaning_types = {
         "ip": FileColumnMeaningType.IP_ADDRESS,
     }

@@ -615,9 +620,7 @@ def test_columns_renaming():

     df = pd.concat([df1, df2], axis=1)

-    dataset = Dataset(
-        "tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)]
-    )
+    dataset = Dataset("tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)])
     dataset._Dataset__rename_columns()
     print(dataset)
     assert set(dataset.data.columns.to_list()) == {"feature1_422b73", "date_0e8763", "feature1_422b73_0"}

@@ -632,9 +635,7 @@ def test_too_long_columns():
         }
     )

-    dataset = Dataset(
-        "tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)]
-    )
+    dataset = Dataset("tds", df=df, meaning_types={"date": FileColumnMeaningType.DATE}, search_keys=[("date",)])
     dataset._Dataset__rename_columns()
     print(dataset)
     assert set(dataset.data.columns.to_list()) == {
tests/test_widget.py

@@ -417,7 +417,7 @@ def test_widget(requests_mock: Mocker):
             '<button kind="secondary"><p>Instant purchase</p></button></a></div>'
         ),
         (
-            …
+            '<div class="stButton"><a href=\'https://app.snowflake.com/marketplace/listing/GZSTZ3VDMF6/'
             "?referer=upgini' target='_blank' rel='noopener noreferrer'><button kind=\"secondary\"><p>"
             "Instant purchase</p></button></a></div>"
         ),
src/upgini/utils/ip_utils.py

@@ -5,12 +5,12 @@ import pandas as pd
 from requests import get

 from upgini.metadata import SearchKey
+
 # from upgini.resource_bundle import bundle
 # from upgini.utils.track_info import get_track_metrics


 class IpToCountrySearchKeyConverter:
-
     url = "http://ip-api.com/json/{}"

     def __init__(
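The url template targets ip-api.com's JSON endpoint, which returns fields such as countryCode on success. A rough sketch of the lookup such a converter performs (a real implementation would batch requests, cache results, and handle failures):

    import pandas as pd
    from requests import get

    def ip_to_country(ip: str) -> str:
        response = get("http://ip-api.com/json/{}".format(ip)).json()
        return response.get("countryCode", "")

    df = pd.DataFrame({"ip": ["8.8.8.8"]})
    df["country"] = df["ip"].apply(ip_to_country)  # -> ["US"]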