upgini 1.1.312a3__tar.gz → 1.1.312a4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.312a3 → upgini-1.1.312a4}/PKG-INFO +1 -1
- upgini-1.1.312a4/src/upgini/__about__.py +1 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/dataset.py +4 -2
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/datetime_utils.py +7 -4
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/phone_utils.py +4 -4
- upgini-1.1.312a3/src/upgini/__about__.py +0 -1
- {upgini-1.1.312a3 → upgini-1.1.312a4}/.gitignore +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/LICENSE +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/README.md +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/pyproject.toml +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/ads.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/errors.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/features_enricher.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/http.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/lazy_import.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/metadata.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/metrics.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/search_task.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/spinner.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.312a3 → upgini-1.1.312a4}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.312a4"
|
|
@@ -440,9 +440,11 @@ class Dataset: # (pd.DataFrame):
|
|
|
440
440
|
FileColumnMeaningType.DATETIME,
|
|
441
441
|
# FileColumnMeaningType.IP_ADDRESS,
|
|
442
442
|
}:
|
|
443
|
+
min_value = self.data[column_name].astype("Int64").min()
|
|
444
|
+
max_value = self.data[column_name].astype("Int64").max()
|
|
443
445
|
min_max_values = NumericInterval(
|
|
444
|
-
minValue=
|
|
445
|
-
maxValue=
|
|
446
|
+
minValue=min_value,
|
|
447
|
+
maxValue=max_value,
|
|
446
448
|
)
|
|
447
449
|
else:
|
|
448
450
|
min_max_values = None
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
+
import pytz
|
|
4
5
|
from typing import Dict, List, Optional
|
|
5
6
|
|
|
6
7
|
import numpy as np
|
|
@@ -28,12 +29,13 @@ DATE_FORMATS = [
|
|
|
28
29
|
"%Y-%m-%dT%H:%M:%S.%f",
|
|
29
30
|
]
|
|
30
31
|
|
|
31
|
-
DATETIME_PATTERN = r"^[\d\s\.\-:T]+$"
|
|
32
|
+
DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
|
|
32
33
|
|
|
33
34
|
|
|
34
35
|
class DateTimeSearchKeyConverter:
|
|
35
36
|
DATETIME_COL = "_date_time"
|
|
36
|
-
MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
37
|
+
# MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
38
|
+
MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
|
|
37
39
|
|
|
38
40
|
def __init__(
|
|
39
41
|
self,
|
|
@@ -106,12 +108,13 @@ class DateTimeSearchKeyConverter:
|
|
|
106
108
|
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
107
109
|
df[self.date_column] = self.parse_date(df)
|
|
108
110
|
|
|
109
|
-
df = self.clean_old_dates(df)
|
|
110
|
-
|
|
111
111
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
112
112
|
# as additional features
|
|
113
113
|
seconds = "datetime_seconds"
|
|
114
114
|
df[self.date_column] = df[self.date_column].dt.tz_localize(None)
|
|
115
|
+
|
|
116
|
+
df = self.clean_old_dates(df)
|
|
117
|
+
|
|
115
118
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
|
116
119
|
|
|
117
120
|
seconds_without_na = df[seconds].dropna()
|
|
@@ -29,21 +29,21 @@ class PhoneSearchKeyConverter:
|
|
|
29
29
|
def convert(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
30
30
|
df = self.phone_to_int(df)
|
|
31
31
|
if self.country_column is not None:
|
|
32
|
-
df = df.apply(self.add_prefix, axis=1)
|
|
32
|
+
df[self.phone_column] = df.apply(self.add_prefix, axis=1)
|
|
33
33
|
df[self.phone_column] = df[self.phone_column].astype("Int64")
|
|
34
34
|
return df
|
|
35
35
|
|
|
36
36
|
def add_prefix(self, row):
|
|
37
37
|
phone = row[self.phone_column]
|
|
38
38
|
if pd.isna(phone):
|
|
39
|
-
return row
|
|
39
|
+
return phone
|
|
40
40
|
country = row[self.country_column]
|
|
41
41
|
country_prefix_tuple = self.COUNTRIES_PREFIXES.get(country)
|
|
42
42
|
if country_prefix_tuple is not None:
|
|
43
43
|
country_prefix, number_of_digits = country_prefix_tuple
|
|
44
44
|
if len(str(phone)) == number_of_digits:
|
|
45
|
-
|
|
46
|
-
return row
|
|
45
|
+
return int(country_prefix + str(phone))
|
|
46
|
+
return phone
|
|
47
47
|
|
|
48
48
|
def phone_to_int(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
49
49
|
"""
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.1.312a3"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|