upgini 1.1.237a2__py3-none-any.whl → 1.1.239a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini might be problematic.
- upgini/autofe/operand.py +11 -1
- upgini/autofe/unary.py +6 -6
- upgini/data_source/data_source_publisher.py +7 -0
- upgini/dataset.py +2 -13
- upgini/features_enricher.py +25 -32
- upgini/http.py +24 -12
- upgini/resource_bundle/strings.properties +0 -1
- upgini/search_task.py +7 -2
- upgini/utils/datetime_utils.py +3 -16
- upgini/utils/track_info.py +18 -25
- {upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/METADATA +4 -4
- {upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/RECORD +15 -16
- upgini/utils/deduplicate_utils.py +0 -72
- {upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/top_level.txt +0 -0
upgini/autofe/operand.py
CHANGED
@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 import abc
 import pandas as pd
 import numpy as np
@@ -59,6 +59,16 @@ class PandasOperand(Operand, abc.ABC):
         df_from.loc[np.nan] = np.nan
         return df_to.fillna(np.nan).apply(lambda x: df_from.loc[x])

+    def _round_value(self, value: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
+        if isinstance(value, pd.DataFrame):
+            return value.apply(self._round_value, axis=1)
+
+        if np.issubdtype(value.dtype, np.floating):
+            precision = np.finfo(value.dtype).precision
+            return np.trunc(value * 10**precision) / (10**precision)
+        else:
+            return value
+

 class VectorizableMixin(Operand):
     def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:

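Note: the new `_round_value` helper truncates floating-point results to the decimal precision reported by `np.finfo` for the column dtype (15 digits for float64), presumably to keep generated feature values stable across environments. A minimal sketch of the same arithmetic on a standalone Series (not the packaged method, just the computation it performs):

```python
import numpy as np
import pandas as pd

def round_value(value: pd.Series) -> pd.Series:
    # Mirror of PandasOperand._round_value for a float Series: truncate to
    # np.finfo(dtype).precision decimal digits, leave non-float dtypes alone.
    if np.issubdtype(value.dtype, np.floating):
        precision = np.finfo(value.dtype).precision  # 15 for float64
        return np.trunc(value * 10**precision) / (10**precision)
    return value

print(round_value(pd.Series([np.log(3.0), np.sqrt(2.0)])))
```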
upgini/autofe/unary.py
CHANGED
@@ -22,10 +22,10 @@ class Log(PandasOperand):
     output_type = "float"

     def calculate_unary(self, data: pd.Series) -> pd.Series:
-        return np.log(np.abs(data.replace(0, np.nan)))
+        return self._round_value(np.log(np.abs(data.replace(0, np.nan))))

     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-        return np.log(data.replace(0, np.nan).abs())
+        return self._round_value(np.log(data.replace(0, np.nan).abs()))


 class Sqrt(PandasOperand):
@@ -35,10 +35,10 @@ class Sqrt(PandasOperand):
     output_type = "float"

     def calculate_unary(self, data: pd.Series) -> pd.Series:
-        return np.sqrt(np.abs(data))
+        return self._round_value(np.sqrt(np.abs(data)))

     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-        return np.sqrt(data.abs())
+        return self._round_value(np.sqrt(data.abs()))


 class Square(PandasOperand):
@@ -60,10 +60,10 @@ class Sigmoid(PandasOperand):
     output_type = "float"

     def calculate_unary(self, data: pd.Series) -> pd.Series:
-        return 1 / (1 + np.exp(-data))
+        return self._round_value(1 / (1 + np.exp(-data)))

     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-        return 1 / (1 + np.exp(-data))
+        return self._round_value(1 / (1 + np.exp(-data)))


 class Floor(PandasOperand):

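Note: Log, Sqrt and Sigmoid now route their results through `_round_value` before returning them. Outside the package, the Log transform itself is equivalent to the following standalone function (a sketch of the formula only; the real `Log` operand lives in `upgini.autofe.unary` and additionally applies the truncation shown above):

```python
import numpy as np
import pandas as pd

def log_feature(data: pd.Series) -> pd.Series:
    # Zeros become NaN before the log and negatives go through abs(),
    # mirroring Log.calculate_unary.
    return np.log(np.abs(data.replace(0, np.nan)))

print(log_feature(pd.Series([0.0, -1.0, np.e])))  # NaN, 0.0, 1.0
```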
upgini/data_source/data_source_publisher.py
CHANGED

@@ -62,6 +62,7 @@ class DataSourcePublisher:
         trace_id = str(uuid.uuid4())

         with MDC(trace_id=trace_id):
+            task_id = None
             try:
                 if data_table_uri is None or not data_table_uri.startswith("bq://"):
                     raise ValidationError(
@@ -148,6 +149,12 @@ class DataSourcePublisher:
                 self.logger.info(msg)
                 print(msg)
                 return data_table_id
+            except KeyboardInterrupt:
+                if task_id is not None:
+                    msg = f"Stopping AdsManagementTask {task_id}"
+                    print(msg)
+                    self.logger.info(msg)
+                    self._rest_client.stop_ads_management_task(task_id, trace_id)
             except Exception:
                 self.logger.exception("Failed to register data table")
                 raise

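Note: the publisher now remembers the ads-management task id and asks the backend to stop that task when the user interrupts registration. A hedged sketch of the same control flow in isolation; `start_task` and `wait_for_task` are placeholders for the real registration and polling steps, and the final re-raise is an assumption of the sketch, not something the diff shows:

```python
import uuid

def register_with_cancellation(rest_client, start_task, wait_for_task, logger):
    # start_task returns the backend task id; wait_for_task is the
    # long-running step the user may interrupt with Ctrl-C.
    trace_id = str(uuid.uuid4())
    task_id = None
    try:
        task_id = start_task(trace_id)
        return wait_for_task(task_id)
    except KeyboardInterrupt:
        if task_id is not None:
            logger.info(f"Stopping AdsManagementTask {task_id}")
            rest_client.stop_ads_management_task(task_id, trace_id)
        raise
```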
upgini/dataset.py
CHANGED
@@ -36,14 +36,12 @@ from upgini.metadata import (
     NumericInterval,
     RuntimeParameters,
     SearchCustomization,
-    SearchKey,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
-from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.email_utils import EmailSearchKeyConverter

 try:
@@ -83,7 +81,6 @@ class Dataset: # (pd.DataFrame):
         endpoint: Optional[str] = None,
         api_key: Optional[str] = None,
         logger: Optional[logging.Logger] = None,
-        client_ip: Optional[str] = None,
         warning_counter: Optional[WarningCounter] = None,
         **kwargs,
     ):
@@ -127,7 +124,6 @@ class Dataset: # (pd.DataFrame):
         else:
             self.logger = logging.getLogger()
             self.logger.setLevel("FATAL")
-        self.client_ip = client_ip
         self.warning_counter = warning_counter or WarningCounter()

     def __len__(self):
@@ -822,13 +818,6 @@ class Dataset: # (pd.DataFrame):
         self.__validate_dataset(validate_target, silent_mode)

         if validate_target:
-            search_keys = {
-                col: SearchKey.from_meaning_type(key_type)
-                for col, key_type in self.meaning_types.items()
-                if SearchKey.from_meaning_type(key_type) is not None
-            }
-            self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
-
             self.__validate_target()

         self.__resample()
@@ -1028,7 +1017,7 @@ class Dataset: # (pd.DataFrame):
             task_type=self.task_type,
             endpoint=self.endpoint,
             api_key=self.api_key,
-
+            logger=self.logger,
         )

     def validation(
@@ -1098,7 +1087,7 @@ class Dataset: # (pd.DataFrame):
             initial_search_task_id=initial_search_task_id,
             endpoint=self.endpoint,
             api_key=self.api_key,
-
+            logger=self.logger,
         )

     def prepare_uploading_file(self, base_path: str) -> str:

upgini/features_enricher.py
CHANGED
@@ -21,8 +21,8 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator
-from upgini.autofe.feature import Feature

+from upgini.autofe.feature import Feature
 from upgini.data_source.data_source_publisher import CommercialSchema
 from upgini.dataset import Dataset
 from upgini.errors import HttpError, ValidationError
@@ -64,7 +64,6 @@ from upgini.utils.datetime_utils import (
     is_blocked_time_series,
     is_time_series,
 )
-from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.display_utils import (
     display_html_dataframe,
     do_without_pandas_limits,
@@ -181,17 +180,19 @@ class FeaturesEnricher(TransformerMixin):
         exclude_columns: Optional[List[str]] = None,
         baseline_score_column: Optional[Any] = None,
         client_ip: Optional[str] = None,
+        client_visitorid: Optional[str] = None,
         **kwargs,
     ):
         self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
         if api_key is not None and not isinstance(api_key, str):
             raise ValidationError(f"api_key should be `string`, but passed: `{api_key}`")
-        self.rest_client = get_rest_client(endpoint, self._api_key)
+        self.rest_client = get_rest_client(endpoint, self._api_key, client_ip, client_visitorid)
         self.client_ip = client_ip
+        self.client_visitorid = client_visitorid

         self.logs_enabled = logs_enabled
         if logs_enabled:
-            self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip)
+            self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip, client_visitorid)
         else:
             self.logger = logging.getLogger()
             self.logger.setLevel("FATAL")
@@ -232,7 +233,7 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_importances_ = []
         self.search_id = search_id
         if search_id:
-            search_task = SearchTask(search_id, endpoint=self.endpoint, api_key=self._api_key,
+            search_task = SearchTask(search_id, endpoint=self.endpoint, api_key=self._api_key, logger=self.logger)

             print(bundle.get("search_by_task_id_start"))
             trace_id = str(uuid.uuid4())
@@ -296,7 +297,7 @@ class FeaturesEnricher(TransformerMixin):
     def _set_api_key(self, api_key: str):
         self._api_key = api_key
         if self.logs_enabled:
-            self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key, self.client_ip)
+            self.logger = LoggerFactory().get_logger(self.endpoint, self._api_key, self.client_ip, self.client_visitorid)

     api_key = property(_get_api_key, _set_api_key)

@@ -679,7 +680,7 @@ class FeaturesEnricher(TransformerMixin):
             return None

         if not metrics_calculation:
-            transform_usage =
+            transform_usage = self.rest_client.get_current_transform_usage(trace_id)
             self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
             if transform_usage.has_limit:
                 if len(X) > transform_usage.rest_rows:
@@ -1184,8 +1185,8 @@ class FeaturesEnricher(TransformerMixin):
             converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger)
             extended_X = converter.convert(extended_X, keep_time=True)
             generated_features.extend(converter.generated_features)
-        email_column = self.
-        hem_column = self.
+        email_column = self.__get_email_column(search_keys)
+        hem_column = self.__get_hem_column(search_keys)
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
             extended_X = converter.convert(extended_X)
@@ -1451,7 +1452,9 @@ class FeaturesEnricher(TransformerMixin):
             rows_to_drop=rows_to_drop,
         )

-        original_df_sampled = self.df_with_original_index[
+        original_df_sampled = self.df_with_original_index[
+            self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
+        ]
         enriched_X = drop_existing_columns(enriched_Xy, TARGET)
         if EVAL_SET_INDEX in original_df_sampled.columns:
             Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
@@ -1506,8 +1509,6 @@ class FeaturesEnricher(TransformerMixin):
             eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
-
-        df_with_eval_set_index = remove_fintech_duplicates(df_with_eval_set_index, self.search_keys, self.logger)

         # downsample if need to eval_set threshold
         num_samples = _num_samples(df_with_eval_set_index)
@@ -1744,8 +1745,8 @@ class FeaturesEnricher(TransformerMixin):
             generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
-        email_column = self.
-        hem_column = self.
+        email_column = self.__get_email_column(search_keys)
+        hem_column = self.__get_hem_column(search_keys)
         email_converted_to_hem = False
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
@@ -1806,7 +1807,6 @@ class FeaturesEnricher(TransformerMixin):
             api_key=self.api_key,  # type: ignore
             date_format=self.date_format,  # type: ignore
             logger=self.logger,
-            client_ip=self.client_ip,
         )
         dataset.meaning_types = meaning_types
         dataset.search_keys = combined_search_keys
@@ -1869,7 +1869,7 @@ class FeaturesEnricher(TransformerMixin):
                 progress = self.get_progress(trace_id, validation_task)
             except KeyboardInterrupt as e:
                 print(bundle.get("search_stopping"))
-
+                self.rest_client.stop_search_task_v2(
                     trace_id, validation_task.search_task_id
                 )
                 self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
@@ -2084,8 +2084,8 @@ class FeaturesEnricher(TransformerMixin):
             self.fit_generated_features.extend(converter.generated_features)
         else:
             self.logger.info("Input dataset hasn't date column")
-        email_column = self.
-        hem_column = self.
+        email_column = self.__get_email_column(self.fit_search_keys)
+        hem_column = self.__get_hem_column(self.fit_search_keys)
         email_converted_to_hem = False
         if email_column:
             converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
@@ -2141,7 +2141,6 @@ class FeaturesEnricher(TransformerMixin):
             date_format=self.date_format,  # type: ignore
             random_state=self.random_state,  # type: ignore
             logger=self.logger,
-            client_ip=self.client_ip,
         )
         dataset.meaning_types = meaning_types
         dataset.search_keys = combined_search_keys
@@ -2198,7 +2197,7 @@ class FeaturesEnricher(TransformerMixin):
             progress = self.get_progress(trace_id)
         except KeyboardInterrupt as e:
             print(bundle.get("search_stopping"))
-
+            self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
             self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
             print(bundle.get("search_stopped"))
             raise e
@@ -2618,22 +2617,16 @@ class FeaturesEnricher(TransformerMixin):
         return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]

     @staticmethod
-    def
+    def __get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.EMAIL:
                 return col

     @staticmethod
-    def
+    def __get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.HEM:
                 return col
-
-    @staticmethod
-    def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
-        for col, t in search_keys.items():
-            if t == SearchKey.PHONE:
-                return col

     def __add_fit_system_record_id(
         self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
@@ -3190,7 +3183,7 @@ class FeaturesEnricher(TransformerMixin):
             metrics_df=self.metrics,
             autofe_descriptions_df=self.get_autofe_features_description(),
             search_id=self._search_task.search_task_id,
-            email=
+            email=self.rest_client.get_current_email(),
             search_keys=[str(sk) for sk in self.search_keys.values()],
         )
     except Exception:
@@ -3374,7 +3367,7 @@ class FeaturesEnricher(TransformerMixin):
                 pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
             with open(f"{tmp_dir}/eval_y.pickle", "wb") as eval_y_file:
                 pickle.dump(sample(eval_set[0][1], eval_xy_sample_index), eval_y_file)
-
+            self.rest_client.dump_input_files(
                 trace_id,
                 f"{tmp_dir}/x.pickle",
                 f"{tmp_dir}/y.pickle",
@@ -3382,13 +3375,13 @@ class FeaturesEnricher(TransformerMixin):
                 f"{tmp_dir}/eval_y.pickle",
             )
         else:
-
+            self.rest_client.dump_input_files(
                 trace_id,
                 f"{tmp_dir}/x.pickle",
                 f"{tmp_dir}/y.pickle",
             )
     else:
-
+        self.rest_client.dump_input_files(
             trace_id,
             f"{tmp_dir}/x.pickle",
         )

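Note: `FeaturesEnricher` now accepts an explicit `client_visitorid` next to `client_ip` and forwards both into the REST client and the backend logger. A sketch of the constructor call (values are placeholders; other arguments follow the upgini documentation):

```python
from upgini import FeaturesEnricher, SearchKey

# client_ip / client_visitorid override the autodetected IP address and the
# fingerprint-derived visitor id in telemetry sent with search requests.
enricher = FeaturesEnricher(
    search_keys={"registration_date": SearchKey.DATE},
    client_ip="203.0.113.10",
    client_visitorid="my-visitor-id",
)
```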
upgini/http.py
CHANGED
@@ -289,7 +289,7 @@ class _RestClient:
     GET_ALL_ADS_DESCRIPTIONS_URI = "private/api/v2/ads/descriptions"
     GET_ACTIVE_ADS_DEFINITIONS_URI = "private/api/v2/ads/definitions"
     UPLOAD_ONLINE_URI = "private/api/v2/ads/upload-online"
-
+    STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"

     ACCESS_TOKEN_HEADER_NAME = "Authorization"
     CONTENT_TYPE_HEADER_NAME = "Content-Type"
@@ -301,11 +301,13 @@ class _RestClient:
     USER_AGENT_HEADER_VALUE = "pyupgini/" + __version__
     SEARCH_KEYS_HEADER_NAME = "Search-Keys"

-    def __init__(self, service_endpoint, refresh_token, silent_mode=False):
+    def __init__(self, service_endpoint, refresh_token, silent_mode=False, client_ip=None, client_visitorid=None):
        # debug_requests_on()
        self._service_endpoint = service_endpoint
        self._refresh_token = refresh_token
        self.silent_mode = silent_mode
+       self.client_ip = client_ip
+       self.client_visitorid = client_visitorid
        self._access_token = self._refresh_access_token()
        # self._access_token: Optional[str] = None # self._refresh_access_token()
        self.last_refresh_time = time.time()
@@ -470,7 +472,7 @@ class _RestClient:
         )
         files["tracking"] = (
             "tracking.json",
-            dumps(get_track_metrics()).encode(),
+            dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
             "application/json",
         )
         additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
@@ -554,7 +556,7 @@ class _RestClient:
         )
         files["tracking"] = (
             "ide",
-            dumps(get_track_metrics()).encode(),
+            dumps(get_track_metrics(self.client_ip, self.client_visitorid)).encode(),
             "application/json",
         )

@@ -662,7 +664,7 @@ class _RestClient:
         return ProviderTaskMetadataV2.parse_obj(response)

     def get_current_transform_usage(self, trace_id) -> TransformUsage:
-        track_metrics = get_track_metrics()
+        track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
         visitor_id = track_metrics.get("visitorId")
         response = self._with_unauth_retry(
             lambda: self._send_get_req(
@@ -751,6 +753,10 @@ class _RestClient:
         response = self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
         return response["adsManagementTaskId"]

+    def stop_ads_management_task(self, ads_management_task_id: str, trace_id: str):
+        api_path = self.STOP_ADS_MANAGEMENT_TASK_URI_FMT.format(ads_management_task_id)
+        self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id))
+
     # ---

     def _send_get_req(self, api_path: str, trace_id: Optional[str], additional_headers: Optional[dict] = None):
@@ -901,11 +907,12 @@ def resolve_api_token(api_token: Optional[str]) -> str:
     return DEMO_API_KEY


-def get_rest_client(backend_url: Optional[str] = None, api_token: Optional[str] = None
+def get_rest_client(backend_url: Optional[str] = None, api_token: Optional[str] = None,
+                    client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> _RestClient:
     url = _resolve_backend_url(backend_url)
     token = resolve_api_token(api_token)

-    return _get_rest_client(url, token)
+    return _get_rest_client(url, token, client_ip, client_visitorid)


 def is_demo_api_key(api_token: Optional[str]) -> bool:
@@ -913,23 +920,27 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:


 @lru_cache()
-def _get_rest_client(backend_url: str, api_token: str
+def _get_rest_client(backend_url: str, api_token: str,
+                     client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> _RestClient:
     return _RestClient(backend_url, api_token)


 class BackendLogHandler(logging.Handler):
-    def __init__(self, rest_client: _RestClient,
+    def __init__(self, rest_client: _RestClient,
+                 client_ip: Optional[str] = None, client_visitorid: Optional[str] = None,
+                 *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.rest_client = rest_client
         self.track_metrics = None
         self.hostname = "0.0.0.0"
         self.client_ip = client_ip
+        self.client_visitorid = client_visitorid

     def emit(self, record: logging.LogRecord) -> None:
         def task():
             try:
                 if self.track_metrics is None or len(self.track_metrics) == 0:
-                    self.track_metrics = get_track_metrics(self.client_ip)
+                    self.track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
                 self.hostname = self.track_metrics.get("ip") or "0.0.0.0"
                 text = self.format(record)
                 tags = self.track_metrics
@@ -971,7 +982,8 @@ class LoggerFactory:
         root.handlers.clear()

     def get_logger(
-        self, backend_url: Optional[str] = None, api_token: Optional[str] = None,
+        self, backend_url: Optional[str] = None, api_token: Optional[str] = None,
+        client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
     ) -> logging.Logger:
         url = _resolve_backend_url(backend_url)
         token = resolve_api_token(api_token)
@@ -983,7 +995,7 @@ class LoggerFactory:
         upgini_logger = logging.getLogger(f"upgini.{hash(key)}")
         upgini_logger.handlers.clear()
         rest_client = get_rest_client(backend_url, api_token)
-        datadog_handler = BackendLogHandler(rest_client, client_ip)
+        datadog_handler = BackendLogHandler(rest_client, client_ip, client_visitorid)
         json_formatter = jsonlogger.JsonFormatter(
             "%(asctime)s %(threadName)s %(name)s %(levelname)s %(message)s",
             timestamp=True,

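Note: the tracking attachment that `_RestClient` uploads is just the JSON-encoded dict returned by `get_track_metrics`, so the new constructor arguments end up as the `ip` and `visitorId` fields of that payload. A sketch of the call with explicit overrides (values are placeholders):

```python
from json import dumps
from upgini.utils.track_info import get_track_metrics

# With overrides supplied, get_track_metrics reports the caller-provided
# values instead of the autodetected IP and machine-derived visitor id.
payload = dumps(get_track_metrics("203.0.113.10", "my-visitor-id")).encode()
```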
upgini/resource_bundle/strings.properties
CHANGED

@@ -142,7 +142,6 @@ dataset_empty_column_names=Some column names are empty. Add names please
 dataset_too_long_column_name=Column {} is too long: {} characters. Remove this column or trim length to 50 characters
 dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
-dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
 dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
 dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool

upgini/search_task.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import tempfile
 import time
 from functools import lru_cache
@@ -43,7 +44,7 @@ class SearchTask:
         task_type: Optional[ModelTaskType] = None,
         endpoint: Optional[str] = None,
         api_key: Optional[str] = None,
-
+        logger: Optional[logging.Logger] = None,
     ):
         self.search_task_id = search_task_id
         self.initial_search_task_id = initial_search_task_id
@@ -55,7 +56,11 @@ class SearchTask:
         self.summary = None
         self.endpoint = endpoint
         self.api_key = api_key
-
+        if logger is not None:
+            self.logger = logger
+        else:
+            self.logger = logging.getLogger()
+            self.logger.setLevel("FATAL")
         self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
         self.unused_features_for_generation: Optional[List[str]] = None

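Note: `SearchTask` now takes an optional logger and falls back to a silenced root logger when none is given. The fallback pattern in isolation (a sketch; `_init_logger` is not a function in the package):

```python
import logging
from typing import Optional

def _init_logger(logger: Optional[logging.Logger] = None) -> logging.Logger:
    # Same fallback as SearchTask.__init__: use the caller's logger when given,
    # otherwise return the root logger raised to FATAL so nothing is emitted.
    if logger is not None:
        return logger
    fallback = logging.getLogger()
    fallback.setLevel("FATAL")
    return fallback
```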
upgini/utils/datetime_utils.py
CHANGED
@@ -61,22 +61,9 @@ class DateTimeSearchKeyConverter:
         elif is_period_dtype(df[self.date_column]):
             df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
         elif is_numeric_dtype(df[self.date_column]):
-
-
-
-            # 315532801000000000 - 2524608001000000000 - nanoseconds
-            if df[self.date_column].apply(lambda x: 10**16 < x).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
-            elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
-            elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
-            elif df[self.date_column].apply(lambda x: 0 < x < 10*11).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
-            else:
-                msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
-                self.logger.warning(msg)
-                raise ValidationError(msg)
+            msg = f"Unsupported type of date column {self.date_column}. Convert to datetime please."
+            self.logger.warning(msg)
+            raise ValidationError(msg)

         # If column with date is datetime then extract seconds of the day and minute of the hour
         # as additional features

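Note: the magnitude-based autodetection of epoch units (nanoseconds down to seconds) is gone; numeric date columns are now rejected outright with a `ValidationError`. Callers have to convert such columns themselves before passing the frame to upgini, for example (column name and unit are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"event_date": [1609459200, 1612137600]})  # unix seconds
# Choose the unit explicitly ("s", "ms", "us" or "ns"); upgini no longer
# guesses it from the magnitude of the values.
df["event_date"] = pd.to_datetime(df["event_date"], unit="s")
```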
upgini/utils/track_info.py
CHANGED
@@ -36,18 +36,22 @@ def _env_contains(envs) -> bool:


 def _get_execution_ide() -> str:
-
-
-
-
-
-
-
+    try:
+        if "google.colab" in sys.modules and _env_contains(_ide_env_variables["colab"]):
+            return "colab"
+        elif os.path.exists("/kaggle") and _check_installed("kaggle") and _env_contains(_ide_env_variables["kaggle"]):
+            return "kaggle"
+        elif getuser() == "jovyan" and _env_contains(_ide_env_variables["binder"]):
+            return "binder"
+        elif "widget" in socket.gethostname():
+            return "widget"
+        else:
+            return "other"
+    except Exception:
         return "other"

-
 @lru_cache()
-def get_track_metrics(client_ip: Optional[str] = None) -> dict:
+def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
     # default values
     track = {"ide": _get_execution_ide()}
     ident_res = "https://api.ipify.org"
@@ -66,24 +70,10 @@ def get_track_metrics(client_ip: Optional[str] = None) -> dict:
             from google.colab import output  # type: ignore
             from IPython.display import Javascript, display

-            # path_to_script = Path(__file__).parent.parent.resolve() / "fingerprint.js"
-            # with open(path_to_script) as f:
-            #     js_content = f.read()
-            # print(f"JS loaded. Length: {len(js_content)}")
-
             display(
                 Javascript(
-                    # """
-                    # async function loadModuleFromString(code) {
-                    #     const blob = new Blob([code], { type: 'application/javascript' });
-                    #     const url = URL.createObjectURL(blob);
-                    #     const module = await import(url);
-                    #     URL.revokeObjectURL(url); // Clean URL-object after module load
-                    #     return module;
-                    # }
-                    # window.visitorId = loadModuleFromString(""" + js_content + """)
                     """
-                    window.visitorId = import('https://
+                    window.visitorId = import('https://upgini.github.io/upgini/js/visitorid.js')
                         .then(FingerprintJS => FingerprintJS.load())
                         .then(fp => fp.get())
                         .then(result => result.visitorId);
@@ -153,7 +143,10 @@ def get_track_metrics(client_ip: Optional[str] = None) -> dict:
             track["ip"] = client_ip
         else:
             track["ip"] = get(ident_res, timeout=10).text
-
+        if client_visitorid:
+            track["visitorId"] = client_visitorid
+        else:
+            track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
     except Exception as e:
         track["err"] = str(e)

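Note: when no explicit `client_visitorid` is passed, `get_track_metrics` now falls back to a hash of the machine's MAC address for the `visitorId` field. The fallback expression on its own (same code as in the diff above):

```python
from hashlib import sha256
from uuid import getnode

# getnode() returns the hardware MAC address as an integer; hashing it yields
# a stable, non-reversible identifier for the current machine.
visitor_id = sha256(str(getnode()).encode()).hexdigest()
print(visitor_id)
```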
{upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.
+Version: 1.1.239a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
@@ -823,11 +823,11 @@ Requests and support, in preferred order
 - **scoped to a Single Bug** - one bug per report.

 ## 🧩 Contributing
-We are a
+We are not a large team, so we probably won't be able to:
 - implement smooth integration with most common low-code ML libraries and platforms ([PyCaret](https://www.github.com/pycaret/pycaret), [H2O AutoML](https://github.com//h2oai/h2o-3/blob/master/h2o-docs/src/product/automl.rst), etc. )
-- implement all possible data verification and normalization capabilities for different types of search keys
+- implement all possible data verification and normalization capabilities for different types of search keys
+And we need some help from the community!

-And we need some help from community)
 So, we'll be happy about every **pull request** you open and **issue** you find to make this library **more incredible**. Please note that it might sometimes take us a while to get back to you.
 **For major changes**, please open an issue first to discuss what you would like to change
 #### Developing

{upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/RECORD
CHANGED

@@ -1,13 +1,13 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=qSjv09LKzCYayucb_JlhExw9uSRcscLWTaD8hqATE3s,49676
 upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=9RJi8NwYbXPK-vgWiMcYoD4I2wO0D91Uk-tvL_1nJ-8,160271
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
-upgini/http.py,sha256=
+upgini/http.py,sha256=RG93QmV3mqKixQsSHqYeM1Mtucp-EpdavcpCuhufnGE,42141
 upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
 upgini/metrics.py,sha256=YeYHJtEIs8OG-EzidG-nbSYB919pjZ4MMbdcZ_jfV2s,23639
-upgini/search_task.py,sha256=
+upgini/search_task.py,sha256=sqgb5MfwWXg6YAbVhLOPcVJ5tDCUyzxFRWfd9aWj8SM,17236
 upgini/spinner.py,sha256=yhakBaydMNS8E8TRAwTdCMdnWrHeWT0cR1M8c9hP6jA,1157
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -17,18 +17,18 @@ upgini/autofe/all_operands.py,sha256=du44N6ISWe3ikb0y9ZzSOHNbLiyEYrJPwoBo0Z6xp2s
 upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
 upgini/autofe/feature.py,sha256=d_iikjQJYgTOkZrXON_IWY5S22OkSpCsk6lfbmVA9ts,11825
 upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
-upgini/autofe/operand.py,sha256=
-upgini/autofe/unary.py,sha256=
+upgini/autofe/operand.py,sha256=GpSx-nL2XKnTJ7kvRr_SIFoUMchqYian6SftJ82zsN4,2719
+upgini/autofe/unary.py,sha256=WB-Ovwaz2a-Jscpshg1Om7Ttx6DJ6gQ_fgqtXx_UHuw,2845
 upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=
+upgini/data_source/data_source_publisher.py,sha256=xvHi4N4m32eqB_h_qtY1wAt1dXekM5PdNL2T9JzFQD4,14051
 upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
 upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/normalizer/phone_normalizer.py,sha256=VIgLXuDuzzjPEXiy_LyDVLZKGaS7-le6Fh6T4D-TQDU,9930
 upgini/resource_bundle/__init__.py,sha256=M7GtS7KPQw9pinz8P2aQWXpSkD2YFwUPVGk1w92Pn84,7888
 upgini/resource_bundle/exceptions.py,sha256=KT-OnqA2J4OTfLjhbEl3KFZM2ci7EOPjqJuY_rXp3vs,622
-upgini/resource_bundle/strings.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=1mpOkd_wkKIJGwWRBgfXz0mLx4lqdDro5IUoj8BBxuE,24527
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=X2PVsfZ3Rl7twpFDh5UWyxqY2K_jcMGxZ2NcHLwFRj4,6489
 upgini/sampler/random_under_sampler.py,sha256=whX_f_TtalHH8Seyn_7n3sX_TSiDHeYfALmme9saqDg,4082
@@ -39,8 +39,7 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
 upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o,6436
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
 upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
-upgini/utils/datetime_utils.py,sha256=
-upgini/utils/deduplicate_utils.py,sha256=o-XY0hbqikQTzwpX0nyl34j_oiBQTefCvRgLHkZRkTE,2795
+upgini/utils/datetime_utils.py,sha256=P56e7gcgAogJYfs2Blzk1uypxb9yrFzNaeJpMCRm6Zc,7716
 upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
 upgini/utils/email_utils.py,sha256=MhCLUAWqbp81xRyKizauNhVx6t_MFeJQRQ8pFM7EpFo,3480
 upgini/utils/fallback_progress_bar.py,sha256=f-VzVbiO6oU9WoKzEgoegYotixdiKanGlvdQCOGC-NY,1128
@@ -52,10 +51,10 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
 upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
 upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,43847
 upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
-upgini/utils/track_info.py,sha256=
+upgini/utils/track_info.py,sha256=jPOiIGpAG_zvHgeiFe_pQ4TWC9ZPjnd_5hSOu5tzLi4,5207
 upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
+upgini-1.1.239a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.239a1.dist-info/METADATA,sha256=Sl4XSdmxJTR9080xw55QKFkoMDFMHuspXT_54E07mm0,48264
+upgini-1.1.239a1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+upgini-1.1.239a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.239a1.dist-info/RECORD,,

upgini/utils/deduplicate_utils.py
DELETED

@@ -1,72 +0,0 @@
-from logging import Logger
-from typing import Dict, List, Optional, Union
-
-import pandas as pd
-
-from upgini.metadata import TARGET, ModelTaskType, SearchKey
-from upgini.resource_bundle import bundle
-from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
-from upgini.utils.target_utils import define_task
-
-
-def remove_fintech_duplicates(df: pd.DataFrame,
-                              search_keys: Dict[str, SearchKey],
-                              logger: Optional[Logger] = None) -> pd.DataFrame:
-    if define_task(df.target, silent=True) != ModelTaskType.BINARY:
-        return df
-
-    date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
-    if date_col is None:
-        return df
-
-    personal_cols = []
-    phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
-    if phone_col:
-        personal_cols.append(phone_col)
-    email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
-    if email_col:
-        personal_cols.append(email_col)
-    hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
-    if hem_col:
-        personal_cols.append(hem_col)
-    if len(personal_cols) == 0:
-        return df
-
-    duplicates = df.duplicated(personal_cols, keep=False)
-    duplicate_rows = df[duplicates]
-    if len(duplicate_rows) == 0:
-        return df
-
-    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
-
-    uniques = grouped_by_personal_cols[date_col].nunique()
-    total = len(uniques)
-    diff_dates = len(uniques[uniques > 1])
-    if diff_dates / total >= 0.6:
-        return df
-
-    if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
-        return df
-
-    def has_diff_target_within_60_days(rows):
-        rows = rows.sort_values(by=date_col)
-        return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
-
-    df = DateTimeSearchKeyConverter(date_col).convert(df)
-    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
-    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
-    if len(rows_with_diff_target) > 0:
-        perc = len(rows_with_diff_target) * 100 / len(df)
-        msg = bundle.get("dataset_diff_target_duplicates_fintech").format(perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list())
-        print(msg)
-        if logger:
-            logger.warning(msg)
-        df = df[~df.index.isin(rows_with_diff_target.index)]
-
-    return df
-
-
-def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
-    for col, key_type in search_keys.items():
-        if (isinstance(keys, list) and key_type in keys) or key_type == keys:
-            return col

{upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/LICENSE
File without changes

{upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/WHEEL
File without changes

{upgini-1.1.237a2.dist-info → upgini-1.1.239a1.dist-info}/top_level.txt
File without changes