upgini 1.1.236a1__py3-none-any.whl → 1.1.237__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/data_source/data_source_publisher.py +7 -0
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +11 -9
- upgini/http.py +5 -1
- upgini/utils/deduplicate_utils.py +72 -0
- {upgini-1.1.236a1.dist-info → upgini-1.1.237.dist-info}/METADATA +1 -1
- {upgini-1.1.236a1.dist-info → upgini-1.1.237.dist-info}/RECORD +10 -9
- {upgini-1.1.236a1.dist-info → upgini-1.1.237.dist-info}/LICENSE +0 -0
- {upgini-1.1.236a1.dist-info → upgini-1.1.237.dist-info}/WHEEL +0 -0
- {upgini-1.1.236a1.dist-info → upgini-1.1.237.dist-info}/top_level.txt +0 -0
|
@@ -62,6 +62,7 @@ class DataSourcePublisher:
|
|
|
62
62
|
trace_id = str(uuid.uuid4())
|
|
63
63
|
|
|
64
64
|
with MDC(trace_id=trace_id):
|
|
65
|
+
task_id = None
|
|
65
66
|
try:
|
|
66
67
|
if data_table_uri is None or not data_table_uri.startswith("bq://"):
|
|
67
68
|
raise ValidationError(
|
|
@@ -148,6 +149,12 @@ class DataSourcePublisher:
|
|
|
148
149
|
self.logger.info(msg)
|
|
149
150
|
print(msg)
|
|
150
151
|
return data_table_id
|
|
152
|
+
except KeyboardInterrupt:
|
|
153
|
+
if task_id is not None:
|
|
154
|
+
msg = f"Stopping AdsManagementTask {task_id}"
|
|
155
|
+
print(msg)
|
|
156
|
+
self.logger.info(msg)
|
|
157
|
+
self._rest_client.stop_ads_management_task(task_id, trace_id)
|
|
151
158
|
except Exception:
|
|
152
159
|
self.logger.exception("Failed to register data table")
|
|
153
160
|
raise
|
upgini/dataset.py
CHANGED
|
@@ -382,7 +382,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
382
382
|
|
|
383
383
|
if is_string_dtype(self.data[postal_code]):
|
|
384
384
|
try:
|
|
385
|
-
self.data[postal_code] = self.data[postal_code].astype("
|
|
385
|
+
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
386
386
|
except Exception:
|
|
387
387
|
pass
|
|
388
388
|
elif is_float_dtype(self.data[postal_code]):
|
upgini/features_enricher.py
CHANGED
|
@@ -21,8 +21,8 @@ from scipy.stats import ks_2samp
|
|
|
21
21
|
from sklearn.base import TransformerMixin
|
|
22
22
|
from sklearn.exceptions import NotFittedError
|
|
23
23
|
from sklearn.model_selection import BaseCrossValidator
|
|
24
|
-
from upgini.autofe.feature import Feature
|
|
25
24
|
|
|
25
|
+
from upgini.autofe.feature import Feature
|
|
26
26
|
from upgini.data_source.data_source_publisher import CommercialSchema
|
|
27
27
|
from upgini.dataset import Dataset
|
|
28
28
|
from upgini.errors import HttpError, ValidationError
|
|
@@ -1450,15 +1450,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1450
1450
|
rows_to_drop=rows_to_drop,
|
|
1451
1451
|
)
|
|
1452
1452
|
|
|
1453
|
-
original_df_sampled = self.df_with_original_index[
|
|
1453
|
+
original_df_sampled = self.df_with_original_index[
|
|
1454
|
+
self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
|
|
1455
|
+
]
|
|
1454
1456
|
enriched_X = drop_existing_columns(enriched_Xy, TARGET)
|
|
1455
1457
|
if EVAL_SET_INDEX in original_df_sampled.columns:
|
|
1456
|
-
|
|
1458
|
+
Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
|
|
1457
1459
|
else:
|
|
1458
|
-
|
|
1459
|
-
X_sampled = drop_existing_columns(
|
|
1460
|
+
Xy_sampled = original_df_sampled
|
|
1461
|
+
X_sampled = drop_existing_columns(Xy_sampled, [SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET])
|
|
1460
1462
|
search_keys = self.fit_search_keys
|
|
1461
|
-
y_sampled =
|
|
1463
|
+
y_sampled = Xy_sampled[TARGET].copy()
|
|
1462
1464
|
|
|
1463
1465
|
self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
|
|
1464
1466
|
self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
|
|
@@ -1472,9 +1474,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1472
1474
|
|
|
1473
1475
|
for idx in range(len(eval_set)):
|
|
1474
1476
|
enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
|
|
1475
|
-
|
|
1476
|
-
eval_X_sampled = drop_existing_columns(
|
|
1477
|
-
eval_y_sampled =
|
|
1477
|
+
eval_Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1478
|
+
eval_X_sampled = drop_existing_columns(eval_Xy_sampled, [SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET])
|
|
1479
|
+
eval_y_sampled = eval_Xy_sampled[TARGET].copy()
|
|
1478
1480
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1479
1481
|
|
|
1480
1482
|
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
upgini/http.py
CHANGED
|
@@ -289,7 +289,7 @@ class _RestClient:
|
|
|
289
289
|
GET_ALL_ADS_DESCRIPTIONS_URI = "private/api/v2/ads/descriptions"
|
|
290
290
|
GET_ACTIVE_ADS_DEFINITIONS_URI = "private/api/v2/ads/definitions"
|
|
291
291
|
UPLOAD_ONLINE_URI = "private/api/v2/ads/upload-online"
|
|
292
|
-
|
|
292
|
+
STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"
|
|
293
293
|
|
|
294
294
|
ACCESS_TOKEN_HEADER_NAME = "Authorization"
|
|
295
295
|
CONTENT_TYPE_HEADER_NAME = "Content-Type"
|
|
@@ -751,6 +751,10 @@ class _RestClient:
|
|
|
751
751
|
response = self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
|
|
752
752
|
return response["adsManagementTaskId"]
|
|
753
753
|
|
|
754
|
+
def stop_ads_management_task(self, ads_management_task_id: str, trace_id: str):
|
|
755
|
+
api_path = self.STOP_ADS_MANAGEMENT_TASK_URI_FMT.format(ads_management_task_id)
|
|
756
|
+
self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id))
|
|
757
|
+
|
|
754
758
|
# ---
|
|
755
759
|
|
|
756
760
|
def _send_get_req(self, api_path: str, trace_id: Optional[str], additional_headers: Optional[dict] = None):
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from logging import Logger
|
|
2
|
+
from typing import Dict, List, Optional, Union
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from upgini.metadata import TARGET, ModelTaskType, SearchKey
|
|
7
|
+
from upgini.resource_bundle import bundle
|
|
8
|
+
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
|
+
from upgini.utils.target_utils import define_task
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def remove_fintech_duplicates(df: pd.DataFrame,
|
|
13
|
+
search_keys: Dict[str, SearchKey],
|
|
14
|
+
logger: Optional[Logger] = None) -> pd.DataFrame:
|
|
15
|
+
if define_task(df.target, silent=True) != ModelTaskType.BINARY:
|
|
16
|
+
return df
|
|
17
|
+
|
|
18
|
+
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
19
|
+
if date_col is None:
|
|
20
|
+
return df
|
|
21
|
+
|
|
22
|
+
personal_cols = []
|
|
23
|
+
phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
|
|
24
|
+
if phone_col:
|
|
25
|
+
personal_cols.append(phone_col)
|
|
26
|
+
email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
|
|
27
|
+
if email_col:
|
|
28
|
+
personal_cols.append(email_col)
|
|
29
|
+
hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
|
|
30
|
+
if hem_col:
|
|
31
|
+
personal_cols.append(hem_col)
|
|
32
|
+
if len(personal_cols) == 0:
|
|
33
|
+
return df
|
|
34
|
+
|
|
35
|
+
duplicates = df.duplicated(personal_cols, keep=False)
|
|
36
|
+
duplicate_rows = df[duplicates]
|
|
37
|
+
if len(duplicate_rows) == 0:
|
|
38
|
+
return df
|
|
39
|
+
|
|
40
|
+
grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
|
|
41
|
+
|
|
42
|
+
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
43
|
+
total = len(uniques)
|
|
44
|
+
diff_dates = len(uniques[uniques > 1])
|
|
45
|
+
if diff_dates / total >= 0.6:
|
|
46
|
+
return df
|
|
47
|
+
|
|
48
|
+
if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
|
|
49
|
+
return df
|
|
50
|
+
|
|
51
|
+
def has_diff_target_within_60_days(rows):
|
|
52
|
+
rows = rows.sort_values(by=date_col)
|
|
53
|
+
return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
|
|
54
|
+
|
|
55
|
+
df = DateTimeSearchKeyConverter(date_col).convert(df)
|
|
56
|
+
grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
|
|
57
|
+
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
58
|
+
if len(rows_with_diff_target) > 0:
|
|
59
|
+
perc = len(rows_with_diff_target) * 100 / len(df)
|
|
60
|
+
msg = bundle.get("dataset_diff_target_duplicates_fintech").format(perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list())
|
|
61
|
+
print(msg)
|
|
62
|
+
if logger:
|
|
63
|
+
logger.warning(msg)
|
|
64
|
+
df = df[~df.index.isin(rows_with_diff_target.index)]
|
|
65
|
+
|
|
66
|
+
return df
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
|
|
70
|
+
for col, key_type in search_keys.items():
|
|
71
|
+
if (isinstance(keys, list) and key_type in keys) or key_type == keys:
|
|
72
|
+
return col
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
3
|
+
upgini/dataset.py,sha256=BCgqbol6se2ZhVeaHC0DOOEnak-3jjeCAsHvjCQ_qRE,49764
|
|
4
4
|
upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=D-Rm00P8Cnl4A_730jAflWcGyEo425p0r1azwoU__B0,160381
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
|
-
upgini/http.py,sha256=
|
|
7
|
+
upgini/http.py,sha256=b1nF0YL4n-kgoz3Ju_r0pKWn_-cGs56rugr8YtA08HA,41476
|
|
8
8
|
upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
|
|
9
9
|
upgini/metrics.py,sha256=YeYHJtEIs8OG-EzidG-nbSYB919pjZ4MMbdcZ_jfV2s,23639
|
|
10
10
|
upgini/search_task.py,sha256=7YxH1zrUHMmePO0VbPBBCJjeoer7jAC0Gltc9EVAOIg,17126
|
|
@@ -21,7 +21,7 @@ upgini/autofe/operand.py,sha256=8WqEoSIA5rEWCK1xuC303E4NW5a72GZ5jUMAEj4skII,2291
|
|
|
21
21
|
upgini/autofe/unary.py,sha256=7TBe7PCt7l_XQEqu_G5g_TC2cW3tppL7uPDcX8xsqz0,2731
|
|
22
22
|
upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
|
|
23
23
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
24
|
+
upgini/data_source/data_source_publisher.py,sha256=xvHi4N4m32eqB_h_qtY1wAt1dXekM5PdNL2T9JzFQD4,14051
|
|
25
25
|
upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
|
|
26
26
|
upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
|
|
27
27
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -40,6 +40,7 @@ upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o
|
|
|
40
40
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
41
41
|
upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
|
|
42
42
|
upgini/utils/datetime_utils.py,sha256=P56e7gcgAogJYfs2Blzk1uypxb9yrFzNaeJpMCRm6Zc,7716
|
|
43
|
+
upgini/utils/deduplicate_utils.py,sha256=o-XY0hbqikQTzwpX0nyl34j_oiBQTefCvRgLHkZRkTE,2795
|
|
43
44
|
upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
|
|
44
45
|
upgini/utils/email_utils.py,sha256=MhCLUAWqbp81xRyKizauNhVx6t_MFeJQRQ8pFM7EpFo,3480
|
|
45
46
|
upgini/utils/fallback_progress_bar.py,sha256=f-VzVbiO6oU9WoKzEgoegYotixdiKanGlvdQCOGC-NY,1128
|
|
@@ -53,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,4
|
|
|
53
54
|
upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
|
|
54
55
|
upgini/utils/track_info.py,sha256=DVNVZmXUb4f25DSPEuUNEFx49hNEBfmuY9iSW5jkMnI,5708
|
|
55
56
|
upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
|
|
56
|
-
upgini-1.1.
|
|
57
|
-
upgini-1.1.
|
|
58
|
-
upgini-1.1.
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
57
|
+
upgini-1.1.237.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
58
|
+
upgini-1.1.237.dist-info/METADATA,sha256=A5a-AaJ1JMDFrGIwcV3sjlF3zMfWB9yrEvzKx2ZM6HE,48344
|
|
59
|
+
upgini-1.1.237.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
|
60
|
+
upgini-1.1.237.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
61
|
+
upgini-1.1.237.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|