upgini 1.1.275__py3-none-any.whl → 1.1.275a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/ads.py +2 -6
- upgini/autofe/date.py +2 -9
- upgini/data_source/data_source_publisher.py +1 -1
- upgini/dataset.py +13 -6
- upgini/features_enricher.py +220 -154
- upgini/metadata.py +9 -1
- upgini/metrics.py +0 -12
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +2 -2
- upgini/utils/__init__.py +2 -3
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +2 -2
- upgini/utils/datetime_utils.py +4 -7
- upgini/utils/deduplicate_utils.py +11 -1
- upgini/utils/email_utils.py +7 -2
- upgini/utils/features_validator.py +1 -2
- upgini/utils/target_utils.py +1 -1
- upgini/utils/track_info.py +13 -25
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/METADATA +2 -2
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/RECORD +23 -23
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/top_level.txt +0 -0
upgini/metadata.py
CHANGED
|
@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Set
|
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
|
|
6
6
|
SYSTEM_RECORD_ID = "system_record_id"
|
|
7
|
+
ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
|
|
8
|
+
SEARCH_KEY_UNNEST = "search_key_unnest"
|
|
7
9
|
SORT_ID = "sort_id"
|
|
8
10
|
EVAL_SET_INDEX = "eval_set_index"
|
|
9
11
|
TARGET = "target"
|
|
@@ -11,7 +13,7 @@ COUNTRY = "country_iso_code"
|
|
|
11
13
|
RENAMED_INDEX = "index_col"
|
|
12
14
|
DEFAULT_INDEX = "index"
|
|
13
15
|
ORIGINAL_INDEX = "original_index"
|
|
14
|
-
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY
|
|
16
|
+
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
class FileColumnMeaningType(Enum):
|
|
@@ -37,6 +39,8 @@ class FileColumnMeaningType(Enum):
|
|
|
37
39
|
POSTAL_CODE = "POSTAL_CODE"
|
|
38
40
|
SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
|
|
39
41
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
42
|
+
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
43
|
+
UNNEST_KEY = "UNNEST_KEY"
|
|
40
44
|
|
|
41
45
|
|
|
42
46
|
class SearchKey(Enum):
|
|
@@ -182,6 +186,10 @@ class FileColumnMetadata(BaseModel):
|
|
|
182
186
|
meaningType: FileColumnMeaningType
|
|
183
187
|
minMaxValues: Optional[NumericInterval] = None
|
|
184
188
|
originalName: Optional[str]
|
|
189
|
+
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
190
|
+
isUnnest: bool = False,
|
|
191
|
+
# list of original etalon key column names like msisdn1, msisdn2
|
|
192
|
+
unnestKeyNames: Optional[list[str]]
|
|
185
193
|
|
|
186
194
|
|
|
187
195
|
class FileMetadata(BaseModel):
|
upgini/metrics.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import inspect
|
|
2
1
|
import logging
|
|
3
2
|
import re
|
|
4
3
|
from copy import deepcopy
|
|
@@ -382,11 +381,6 @@ class EstimatorWrapper:
|
|
|
382
381
|
kwargs["estimator"] = estimator_copy
|
|
383
382
|
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
384
383
|
if cat_features is not None:
|
|
385
|
-
for cat_feature in cat_features:
|
|
386
|
-
if cat_feature not in X.columns:
|
|
387
|
-
logger.error(
|
|
388
|
-
f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
|
|
389
|
-
)
|
|
390
384
|
estimator_copy.set_params(
|
|
391
385
|
cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
392
386
|
)
|
|
@@ -653,12 +647,6 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
653
647
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
654
648
|
if isinstance(scoring, str) and scoring is not None:
|
|
655
649
|
_get_scorer_by_name(scoring)
|
|
656
|
-
elif isinstance(scoring, Callable):
|
|
657
|
-
spec = inspect.getfullargspec(scoring)
|
|
658
|
-
if len(spec.args) < 3:
|
|
659
|
-
raise ValidationError(
|
|
660
|
-
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
|
|
661
|
-
)
|
|
662
650
|
|
|
663
651
|
|
|
664
652
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -44,7 +44,7 @@ class PhoneNormalizer:
|
|
|
44
44
|
Method will remove all non numeric chars from string and convert it to int.
|
|
45
45
|
None will be set for phone numbers that couldn"t be converted to int
|
|
46
46
|
"""
|
|
47
|
-
if is_string_dtype(self.df[self.phone_column_name])
|
|
47
|
+
if is_string_dtype(self.df[self.phone_column_name]):
|
|
48
48
|
convert_func = self.phone_str_to_int_safe
|
|
49
49
|
elif is_float_dtype(self.df[self.phone_column_name]):
|
|
50
50
|
convert_func = self.phone_float_to_int_safe
|
|
@@ -38,7 +38,6 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
|
|
|
38
38
|
loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
|
|
39
39
|
multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
40
40
|
group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
|
|
41
|
-
current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
42
41
|
|
|
43
42
|
# Errors
|
|
44
43
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
@@ -88,6 +87,7 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
|
88
87
|
search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
89
88
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
90
89
|
single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
90
|
+
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
91
91
|
unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
92
92
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
93
93
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
@@ -159,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
|
|
|
159
159
|
dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
|
|
160
160
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
161
161
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
162
|
-
dataset_rarest_class_less_min=
|
|
162
|
+
dataset_rarest_class_less_min=Frequency of the rarest class `{}` is {}, minimum frequency must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
163
163
|
dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
164
164
|
dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
165
165
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
upgini/utils/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ import itertools
|
|
|
2
2
|
from typing import List, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
from pandas.api.types import is_string_dtype
|
|
5
|
+
from pandas.api.types import is_string_dtype
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
|
|
@@ -20,6 +20,5 @@ def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
20
20
|
return [
|
|
21
21
|
col
|
|
22
22
|
for col in tmp.columns
|
|
23
|
-
if
|
|
24
|
-
and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
|
|
23
|
+
if is_string_dtype(tmp[col]) and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
|
|
25
24
|
]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
11
|
raise NotImplementedError()
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
|
|
14
|
+
return [
|
|
15
|
+
column_name
|
|
16
|
+
for column_name in column_names
|
|
17
|
+
if self._is_search_key_by_name(column_name)
|
|
18
|
+
]
|
|
17
19
|
|
|
18
|
-
def
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
for column_name in df.columns:
|
|
20
|
+
def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
|
|
21
|
+
other_columns = [col for col in df.columns if col not in existing_search_keys]
|
|
22
|
+
columns_by_names = self._get_search_keys_by_name(other_columns)
|
|
23
|
+
columns_by_values = []
|
|
24
|
+
for column_name in other_columns:
|
|
24
25
|
if self._is_search_key_by_values(df[column_name]):
|
|
25
|
-
|
|
26
|
+
columns_by_values.append(column_name)
|
|
27
|
+
return list(set(columns_by_names + columns_by_values))
|
upgini/utils/country_utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from pandas.api.types import is_string_dtype
|
|
2
|
+
from pandas.api.types import is_string_dtype
|
|
3
3
|
|
|
4
4
|
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
5
5
|
|
|
@@ -9,7 +9,7 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
|
|
|
9
9
|
return "country" in str(column_name).lower()
|
|
10
10
|
|
|
11
11
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
12
|
-
if not is_string_dtype(column)
|
|
12
|
+
if not is_string_dtype(column):
|
|
13
13
|
return False
|
|
14
14
|
|
|
15
15
|
all_count = len(column)
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -6,10 +6,7 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import
|
|
10
|
-
is_numeric_dtype,
|
|
11
|
-
is_period_dtype,
|
|
12
|
-
)
|
|
9
|
+
from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
13
10
|
|
|
14
11
|
from upgini.errors import ValidationError
|
|
15
12
|
from upgini.metadata import SearchKey
|
|
@@ -81,6 +78,9 @@ class DateTimeSearchKeyConverter:
|
|
|
81
78
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
82
79
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
83
80
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
81
|
+
elif is_string_dtype(df[self.date_column]):
|
|
82
|
+
df[self.date_column] = df[self.date_column].apply(self.clean_date)
|
|
83
|
+
df[self.date_column] = self.parse_date(df)
|
|
84
84
|
elif is_period_dtype(df[self.date_column]):
|
|
85
85
|
df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
|
|
86
86
|
elif is_numeric_dtype(df[self.date_column]):
|
|
@@ -100,9 +100,6 @@ class DateTimeSearchKeyConverter:
|
|
|
100
100
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
101
101
|
self.logger.warning(msg)
|
|
102
102
|
raise ValidationError(msg)
|
|
103
|
-
else:
|
|
104
|
-
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
105
|
-
df[self.date_column] = self.parse_date(df)
|
|
106
103
|
|
|
107
104
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
108
105
|
# as additional features
|
|
@@ -3,7 +3,15 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import
|
|
6
|
+
from upgini.metadata import (
|
|
7
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
+
EVAL_SET_INDEX,
|
|
9
|
+
SORT_ID,
|
|
10
|
+
SYSTEM_RECORD_ID,
|
|
11
|
+
TARGET,
|
|
12
|
+
ModelTaskType,
|
|
13
|
+
SearchKey,
|
|
14
|
+
)
|
|
7
15
|
from upgini.resource_bundle import ResourceBundle
|
|
8
16
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
17
|
from upgini.utils.target_utils import define_task
|
|
@@ -143,6 +151,8 @@ def clean_full_duplicates(
|
|
|
143
151
|
unique_columns = df.columns.tolist()
|
|
144
152
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
145
153
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
154
|
+
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
155
|
+
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
146
156
|
if SORT_ID in unique_columns:
|
|
147
157
|
unique_columns.remove(SORT_ID)
|
|
148
158
|
if EVAL_SET_INDEX in unique_columns:
|
upgini/utils/email_utils.py
CHANGED
|
@@ -4,7 +4,7 @@ from hashlib import sha256
|
|
|
4
4
|
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
|
-
from pandas.api.types import is_string_dtype
|
|
7
|
+
from pandas.api.types import is_string_dtype
|
|
8
8
|
from upgini.resource_bundle import bundle
|
|
9
9
|
|
|
10
10
|
from upgini.metadata import SearchKey
|
|
@@ -18,7 +18,7 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
|
|
|
18
18
|
return str(column_name).lower() in ["email", "e_mail", "e-mail"]
|
|
19
19
|
|
|
20
20
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
21
|
-
if not is_string_dtype(column)
|
|
21
|
+
if not is_string_dtype(column):
|
|
22
22
|
return False
|
|
23
23
|
if not column.astype("string").str.contains("@").any():
|
|
24
24
|
return False
|
|
@@ -38,11 +38,13 @@ class EmailSearchKeyConverter:
|
|
|
38
38
|
email_column: str,
|
|
39
39
|
hem_column: Optional[str],
|
|
40
40
|
search_keys: Dict[str, SearchKey],
|
|
41
|
+
unnest_search_keys: Optional[List[str]] = None,
|
|
41
42
|
logger: Optional[logging.Logger] = None,
|
|
42
43
|
):
|
|
43
44
|
self.email_column = email_column
|
|
44
45
|
self.hem_column = hem_column
|
|
45
46
|
self.search_keys = search_keys
|
|
47
|
+
self.unnest_search_keys = unnest_search_keys
|
|
46
48
|
if logger is not None:
|
|
47
49
|
self.logger = logger
|
|
48
50
|
else:
|
|
@@ -80,9 +82,12 @@ class EmailSearchKeyConverter:
|
|
|
80
82
|
del self.search_keys[self.email_column]
|
|
81
83
|
return df
|
|
82
84
|
self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
|
|
85
|
+
self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
|
|
83
86
|
self.email_converted_to_hem = True
|
|
84
87
|
|
|
85
88
|
del self.search_keys[self.email_column]
|
|
89
|
+
if self.email_column in self.unnest_search_keys:
|
|
90
|
+
self.unnest_search_keys.remove(self.email_column)
|
|
86
91
|
|
|
87
92
|
df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
|
|
88
93
|
|
|
@@ -81,8 +81,7 @@ class FeaturesValidator:
|
|
|
81
81
|
return [
|
|
82
82
|
i
|
|
83
83
|
for i in df
|
|
84
|
-
if (
|
|
85
|
-
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
84
|
+
if (is_string_dtype(df[i]) or is_integer_dtype(df[i])) and (df[i].nunique(dropna=False) / row_count >= 0.95)
|
|
86
85
|
]
|
|
87
86
|
|
|
88
87
|
@staticmethod
|
upgini/utils/target_utils.py
CHANGED
|
@@ -107,7 +107,7 @@ def balance_undersample(
|
|
|
107
107
|
min_class_count = vc[min_class_value]
|
|
108
108
|
|
|
109
109
|
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold =
|
|
110
|
+
min_class_threshold = min_class_percent * count
|
|
111
111
|
|
|
112
112
|
resampled_data = df
|
|
113
113
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
upgini/utils/track_info.py
CHANGED
|
@@ -55,7 +55,7 @@ def _get_execution_ide() -> str:
|
|
|
55
55
|
def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
56
56
|
# default values
|
|
57
57
|
track = {"ide": _get_execution_ide()}
|
|
58
|
-
ident_res = "https://
|
|
58
|
+
ident_res = "https://api.ipify.org"
|
|
59
59
|
|
|
60
60
|
try:
|
|
61
61
|
track["hostname"] = socket.gethostname()
|
|
@@ -74,20 +74,17 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
74
74
|
display(
|
|
75
75
|
Javascript(
|
|
76
76
|
"""
|
|
77
|
-
|
|
78
|
-
return import('https://upgini.github.io/upgini/js/a.js')
|
|
77
|
+
import('https://upgini.github.io/upgini/js/a.js')
|
|
79
78
|
.then(FingerprintJS => FingerprintJS.load())
|
|
80
79
|
.then(fp => fp.get())
|
|
81
|
-
.then(result => result.visitorId);
|
|
82
|
-
}
|
|
80
|
+
.then(result => window.visitorId = result.visitorId);
|
|
83
81
|
"""
|
|
84
82
|
)
|
|
85
83
|
)
|
|
86
|
-
track["visitorId"] = output.eval_js("
|
|
84
|
+
track["visitorId"] = output.eval_js("window.visitorId", timeout_sec=10)
|
|
87
85
|
except Exception as e:
|
|
88
86
|
track["err"] = str(e)
|
|
89
|
-
|
|
90
|
-
track["visitorId"] = "None"
|
|
87
|
+
track["visitorId"] = "None"
|
|
91
88
|
if client_ip:
|
|
92
89
|
track["ip"] = client_ip
|
|
93
90
|
else:
|
|
@@ -98,19 +95,16 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
98
95
|
display(
|
|
99
96
|
Javascript(
|
|
100
97
|
f"""
|
|
101
|
-
|
|
102
|
-
return fetch("{ident_res}")
|
|
98
|
+
fetch("{ident_res}")
|
|
103
99
|
.then(response => response.text())
|
|
104
|
-
.then(data => data);
|
|
105
|
-
}}
|
|
100
|
+
.then(data => window.clientIP = data);
|
|
106
101
|
"""
|
|
107
102
|
)
|
|
108
103
|
)
|
|
109
|
-
track["ip"] = output.eval_js("
|
|
104
|
+
track["ip"] = output.eval_js("window.clientIP", timeout_sec=10)
|
|
110
105
|
except Exception as e:
|
|
111
106
|
track["err"] = str(e)
|
|
112
|
-
|
|
113
|
-
track["ip"] = "0.0.0.0"
|
|
107
|
+
track["ip"] = "0.0.0.0"
|
|
114
108
|
|
|
115
109
|
elif track["ide"] == "binder":
|
|
116
110
|
try:
|
|
@@ -122,10 +116,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
122
116
|
track["visitorId"] = sha256(os.environ["CLIENT_IP"].encode()).hexdigest()
|
|
123
117
|
except Exception as e:
|
|
124
118
|
track["err"] = str(e)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
if "visitorId" not in track:
|
|
128
|
-
track["visitorId"] = "None"
|
|
119
|
+
track["ip"] = "0.0.0.0"
|
|
120
|
+
track["visitorId"] = "None"
|
|
129
121
|
|
|
130
122
|
elif track["ide"] == "kaggle":
|
|
131
123
|
try:
|
|
@@ -144,8 +136,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
144
136
|
raise Exception(err)
|
|
145
137
|
except Exception as e:
|
|
146
138
|
track["err"] = str(e)
|
|
147
|
-
|
|
148
|
-
|
|
139
|
+
track["ip"] = "0.0.0.0"
|
|
140
|
+
track["visitorId"] = "None"
|
|
149
141
|
else:
|
|
150
142
|
try:
|
|
151
143
|
if client_ip:
|
|
@@ -158,9 +150,5 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
158
150
|
track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
|
|
159
151
|
except Exception as e:
|
|
160
152
|
track["err"] = str(e)
|
|
161
|
-
if "visitorId" not in track:
|
|
162
|
-
track["visitorId"] = "None"
|
|
163
|
-
if "ip" not in track:
|
|
164
|
-
track["ip"] = "0.0.0.0"
|
|
165
153
|
|
|
166
154
|
return track
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.275a1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil >=2.8.0
|
|
30
30
|
Requires-Dist: requests >=2.8.0
|
|
31
|
-
Requires-Dist: pandas <
|
|
31
|
+
Requires-Dist: pandas <2.0.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy >=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn >=1.3.0
|
|
34
34
|
Requires-Dist: pydantic <2.0.0,>=1.8.2
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
|
-
upgini/ads.py,sha256=
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
2
|
+
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
|
+
upgini/dataset.py,sha256=g10BnbayclZMno9mAabpz_Zu0iyMiW0f_jOwt_xJr8U,45947
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=CgUBRCPW_itgBfaup3Tg_yfPYMbQpufoOqu4yYvn6VU,179316
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
7
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
8
|
-
upgini/metadata.py,sha256=
|
|
9
|
-
upgini/metrics.py,sha256=
|
|
8
|
+
upgini/metadata.py,sha256=FFwTnoMxdJ-7oKXbRgght1yk7e2u90WpeqljKDWUj18,10106
|
|
9
|
+
upgini/metrics.py,sha256=VmxVc-plbRPZ1U3Ve3E-FZkhYqi0X2r7x8H5L-shux4,29058
|
|
10
10
|
upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
|
|
11
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
12
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
@@ -15,49 +15,49 @@ upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0P
|
|
|
15
15
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
|
|
17
17
|
upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
|
|
18
|
-
upgini/autofe/date.py,sha256=
|
|
18
|
+
upgini/autofe/date.py,sha256=cc0GMAJR0QZOI_Qp2V5UDklaXLNS_79O1GhU6GlOYzg,3895
|
|
19
19
|
upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
|
|
20
20
|
upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
|
|
21
21
|
upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
|
|
22
22
|
upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
|
|
23
23
|
upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
|
|
24
24
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
25
|
+
upgini/data_source/data_source_publisher.py,sha256=J2lrpPuysUHPeqTSfoybBtPRTBCFu7R5KzaakhjaRDc,16485
|
|
26
26
|
upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
|
|
27
27
|
upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
|
|
28
28
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
|
-
upgini/normalizer/phone_normalizer.py,sha256=
|
|
29
|
+
upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
|
|
30
30
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
31
31
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
32
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
32
|
+
upgini/resource_bundle/strings.properties,sha256=AK5xktWWYa0smEa_ZVT7BFlXPSx7M_NTMIfXhgsnE2Y,26177
|
|
33
33
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
34
34
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
35
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
36
36
|
upgini/sampler/random_under_sampler.py,sha256=XU4c2swPIFxVXHOPpxgM2bUao0Xm-aoMmd6fKjIuV5s,4068
|
|
37
37
|
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
38
|
-
upgini/utils/__init__.py,sha256=
|
|
39
|
-
upgini/utils/base_search_key_detector.py,sha256=
|
|
38
|
+
upgini/utils/__init__.py,sha256=dQ4-s8-sZ5eOBZ-mH3gEwDHTdI0wI1bUAVgVqUKKPx4,786
|
|
39
|
+
upgini/utils/base_search_key_detector.py,sha256=VvEdamjJT1wypsH6NAfOkPp7dHo7nxhl7LhwX7Z9N5w,1025
|
|
40
40
|
upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
|
|
41
|
-
upgini/utils/country_utils.py,sha256=
|
|
41
|
+
upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
|
|
42
42
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
43
43
|
upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
|
|
44
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
44
|
+
upgini/utils/datetime_utils.py,sha256=4ii5WphAHlb_NRmdJx35VZpTarJbAr-AnDw3XSzUSow,10346
|
|
45
|
+
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
46
46
|
upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
|
|
47
|
-
upgini/utils/email_utils.py,sha256=
|
|
47
|
+
upgini/utils/email_utils.py,sha256=0EPCxMU-huzTgb_vySiAQ8tmSUhS31Mz2BpaHGwwYO4,3772
|
|
48
48
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
49
|
-
upgini/utils/features_validator.py,sha256=
|
|
49
|
+
upgini/utils/features_validator.py,sha256=P-dfjBLAMxgzOcUX1Jo1bhVp8-8WyTyF3Ef0YZ5nfRI,3269
|
|
50
50
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
51
51
|
upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
|
|
52
52
|
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
53
53
|
upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
|
|
54
54
|
upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
|
|
55
55
|
upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
|
|
56
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
-
upgini/utils/track_info.py,sha256=
|
|
56
|
+
upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
|
|
57
|
+
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
58
58
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
59
|
+
upgini-1.1.275a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
60
|
+
upgini-1.1.275a1.dist-info/METADATA,sha256=ocZUhdmjsYXKoCXt0W3M4gfPGQ8UlFtQlYIjdD_6_w0,48158
|
|
61
|
+
upgini-1.1.275a1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
62
|
+
upgini-1.1.275a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
63
|
+
upgini-1.1.275a1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|