upgini 1.1.236a2__py3-none-any.whl → 1.1.237__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/data_source/data_source_publisher.py CHANGED
@@ -62,6 +62,7 @@ class DataSourcePublisher:
62
62
  trace_id = str(uuid.uuid4())
63
63
 
64
64
  with MDC(trace_id=trace_id):
65
+ task_id = None
65
66
  try:
66
67
  if data_table_uri is None or not data_table_uri.startswith("bq://"):
67
68
  raise ValidationError(
@@ -148,6 +149,12 @@ class DataSourcePublisher:
148
149
  self.logger.info(msg)
149
150
  print(msg)
150
151
  return data_table_id
152
+ except KeyboardInterrupt:
153
+ if task_id is not None:
154
+ msg = f"Stopping AdsManagementTask {task_id}"
155
+ print(msg)
156
+ self.logger.info(msg)
157
+ self._rest_client.stop_ads_management_task(task_id, trace_id)
151
158
  except Exception:
152
159
  self.logger.exception("Failed to register data table")
153
160
  raise
upgini/dataset.py CHANGED
@@ -382,7 +382,7 @@ class Dataset: # (pd.DataFrame):
382
382
 
383
383
  if is_string_dtype(self.data[postal_code]):
384
384
  try:
385
- self.data[postal_code] = self.data[postal_code].astype("Float64").astype("Int64").astype("string")
385
+ self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
386
386
  except Exception:
387
387
  pass
388
388
  elif is_float_dtype(self.data[postal_code]):
upgini/features_enricher.py CHANGED
@@ -21,8 +21,8 @@ from scipy.stats import ks_2samp
21
21
  from sklearn.base import TransformerMixin
22
22
  from sklearn.exceptions import NotFittedError
23
23
  from sklearn.model_selection import BaseCrossValidator
24
- from upgini.autofe.feature import Feature
25
24
 
25
+ from upgini.autofe.feature import Feature
26
26
  from upgini.data_source.data_source_publisher import CommercialSchema
27
27
  from upgini.dataset import Dataset
28
28
  from upgini.errors import HttpError, ValidationError
@@ -1450,7 +1450,9 @@ class FeaturesEnricher(TransformerMixin):
1450
1450
  rows_to_drop=rows_to_drop,
1451
1451
  )
1452
1452
 
1453
- original_df_sampled = self.df_with_original_index[self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])]
1453
+ original_df_sampled = self.df_with_original_index[
1454
+ self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
1455
+ ]
1454
1456
  enriched_X = drop_existing_columns(enriched_Xy, TARGET)
1455
1457
  if EVAL_SET_INDEX in original_df_sampled.columns:
1456
1458
  Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
upgini/http.py CHANGED
@@ -289,7 +289,7 @@ class _RestClient:
289
289
  GET_ALL_ADS_DESCRIPTIONS_URI = "private/api/v2/ads/descriptions"
290
290
  GET_ACTIVE_ADS_DEFINITIONS_URI = "private/api/v2/ads/definitions"
291
291
  UPLOAD_ONLINE_URI = "private/api/v2/ads/upload-online"
292
- UPLOAD_ONLINE_ALL_URI = "private/api/v2/ads/upload-online-all"
292
+ STOP_ADS_MANAGEMENT_TASK_URI_FMT = "private/api/v2/ads/management-task/{0}/stop"
293
293
 
294
294
  ACCESS_TOKEN_HEADER_NAME = "Authorization"
295
295
  CONTENT_TYPE_HEADER_NAME = "Content-Type"
@@ -751,6 +751,10 @@ class _RestClient:
751
751
  response = self._with_unauth_retry(lambda: self._send_post_req(api_path, trace_id, request, result_format=None))
752
752
  return response["adsManagementTaskId"]
753
753
 
754
def stop_ads_management_task(self, ads_management_task_id: str, trace_id: str):
    """Send a stop request for an ADS management task.

    The task id is substituted into ``STOP_ADS_MANAGEMENT_TASK_URI_FMT`` and
    the resulting path is POSTed through ``_send_post_req`` with no request
    body argument. The call is wrapped in ``_with_unauth_retry``
    (presumably re-authenticating and retrying on an unauthorized
    response - confirm against that helper).

    Args:
        ads_management_task_id: Identifier of the task to stop.
        trace_id: Trace identifier forwarded to the POST helper.

    Returns:
        None. Whatever the POST helper returns is discarded.
    """
    stop_path = self.STOP_ADS_MANAGEMENT_TASK_URI_FMT.format(ads_management_task_id)

    def post_stop_request():
        return self._send_post_req(stop_path, trace_id)

    self._with_unauth_retry(post_stop_request)
757
+
754
758
  # ---
755
759
 
756
760
  def _send_get_req(self, api_path: str, trace_id: Optional[str], additional_headers: Optional[dict] = None):
upgini/utils/deduplicate_utils.py ADDED
@@ -0,0 +1,72 @@
1
+ from logging import Logger
2
+ from typing import Dict, List, Optional, Union
3
+
4
+ import pandas as pd
5
+
6
+ from upgini.metadata import TARGET, ModelTaskType, SearchKey
7
+ from upgini.resource_bundle import bundle
8
+ from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
+ from upgini.utils.target_utils import define_task
10
+
11
+
12
def remove_fintech_duplicates(
    df: pd.DataFrame,
    search_keys: Dict[str, SearchKey],
    logger: Optional[Logger] = None,
) -> pd.DataFrame:
    """Drop personal-key groups whose target flips between rows close in time.

    The frame is returned unchanged when any of these holds:

    * ``define_task`` does not classify the target as ``ModelTaskType.BINARY``;
    * ``search_keys`` has no DATE/DATETIME column;
    * ``search_keys`` has no PHONE, EMAIL or HEM column;
    * no combination of the personal-key columns occurs more than once;
    * no personal-key group survives ``groupby`` (all keys are NaN);
    * at least 60% of the personal-key groups contain more than one distinct
      date;
    * every personal-key group has a single target value.

    Otherwise the date column is run through
    ``DateTimeSearchKeyConverter.convert`` and every group in which two
    date-adjacent rows have different targets and are less than
    ``60 * 24 * 60 * 60 * 1000`` date units apart is removed as a whole.
    A message listing the removed index labels is printed and, if a logger
    was given, logged as a warning.

    Args:
        df: Input frame; must contain the ``TARGET`` column and the columns
            named in ``search_keys``.
        search_keys: Mapping of column name to its search key type.
        logger: Optional logger that receives the warning message.

    Returns:
        ``df`` itself on the early-exit paths, otherwise the converted frame
        with the conflicting groups filtered out.
    """
    # Share of multi-date groups at or above which the frame is left alone.
    multi_date_share_limit = 0.6
    # 60 days; assumes the converter yields epoch milliseconds - TODO confirm.
    sixty_days = 60 * 24 * 60 * 60 * 1000

    # Index by the TARGET constant, like every other access in this function,
    # instead of relying on attribute access to a column literally named so.
    if define_task(df[TARGET], silent=True) != ModelTaskType.BINARY:
        return df

    date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
    if date_col is None:
        return df

    personal_cols = [
        col
        for col in (
            _get_column_by_key(search_keys, SearchKey.PHONE),
            _get_column_by_key(search_keys, SearchKey.EMAIL),
            _get_column_by_key(search_keys, SearchKey.HEM),
        )
        if col
    ]
    if not personal_cols:
        return df

    if not df.duplicated(personal_cols, keep=False).any():
        return df

    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)

    dates_per_group = grouped_by_personal_cols[date_col].nunique()
    total = len(dates_per_group)
    if total == 0:
        # duplicated() treats NaN keys as equal, but groupby drops NaN keys by
        # default, so duplicates made only of missing keys leave no groups.
        # Nothing can be compared; also avoids dividing by zero below.
        return df
    diff_dates = int((dates_per_group > 1).sum())
    if diff_dates / total >= multi_date_share_limit:
        return df

    if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
        return df

    def has_diff_target_within_60_days(rows):
        rows = rows.sort_values(by=date_col)
        # The first row always differs from its (missing) predecessor, but its
        # date diff is NaN, which compares False, so it never matches alone.
        target_changed = rows[TARGET].ne(rows[TARGET].shift())
        close_in_time = rows[date_col].diff() < sixty_days
        return bool((target_changed & close_in_time).any())

    # NOTE(review): the converted frame is what gets returned from here on;
    # whether convert() also mutates the caller's frame in place is not
    # visible in this module - confirm against DateTimeSearchKeyConverter.
    df = DateTimeSearchKeyConverter(date_col).convert(df)
    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
    if len(rows_with_diff_target) > 0:
        perc = len(rows_with_diff_target) * 100 / len(df)
        msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
            perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list()
        )
        print(msg)
        if logger is not None:
            logger.warning(msg)
        # Removal is by index label, so rows sharing a label with a flagged
        # row are removed too if the index is not unique.
        df = df[~df.index.isin(rows_with_diff_target.index)]

    return df
67
+
68
+
69
+ def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
70
+ for col, key_type in search_keys.items():
71
+ if (isinstance(keys, list) and key_type in keys) or key_type == keys:
72
+ return col
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.236a2
3
+ Version: 1.1.237
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -1,10 +1,10 @@
1
1
  upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
- upgini/dataset.py,sha256=7z9zbVvd1_MiufmoZlCwEHwQ25Q2DX_0g9PFcSMlqMY,49764
3
+ upgini/dataset.py,sha256=BCgqbol6se2ZhVeaHC0DOOEnak-3jjeCAsHvjCQ_qRE,49764
4
4
  upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
5
- upgini/features_enricher.py,sha256=-lRmAOJcRWwI08nddfC8_vTV_qBS9NryGmrnJg4ahf4,160355
5
+ upgini/features_enricher.py,sha256=D-Rm00P8Cnl4A_730jAflWcGyEo425p0r1azwoU__B0,160381
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
- upgini/http.py,sha256=HzUSZudCdISJGUqHC1gAT1v_x1n_dIFVDJW4z3Q7DCs,41204
7
+ upgini/http.py,sha256=b1nF0YL4n-kgoz3Ju_r0pKWn_-cGs56rugr8YtA08HA,41476
8
8
  upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
9
9
  upgini/metrics.py,sha256=YeYHJtEIs8OG-EzidG-nbSYB919pjZ4MMbdcZ_jfV2s,23639
10
10
  upgini/search_task.py,sha256=7YxH1zrUHMmePO0VbPBBCJjeoer7jAC0Gltc9EVAOIg,17126
@@ -21,7 +21,7 @@ upgini/autofe/operand.py,sha256=8WqEoSIA5rEWCK1xuC303E4NW5a72GZ5jUMAEj4skII,2291
21
21
  upgini/autofe/unary.py,sha256=7TBe7PCt7l_XQEqu_G5g_TC2cW3tppL7uPDcX8xsqz0,2731
22
22
  upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
23
23
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- upgini/data_source/data_source_publisher.py,sha256=zFu0WMKwPM11gPZHq8dpsBP7s4wmTtBqYoDEakgNxoY,13725
24
+ upgini/data_source/data_source_publisher.py,sha256=xvHi4N4m32eqB_h_qtY1wAt1dXekM5PdNL2T9JzFQD4,14051
25
25
  upgini/mdc/__init__.py,sha256=CuKmWYCqAnmiq1S7wgMzJhSCTsXuoeiZWXSfzw0lyig,1152
26
26
  upgini/mdc/context.py,sha256=eVNEubcgkiAP139Vna2qtUBZJWoy15rWWAuB0TFv54E,1484
27
27
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -40,6 +40,7 @@ upgini/utils/country_utils.py,sha256=9BXSXoGm3nVoOZE_bRENY-KMkwMUFvAF3Au0zxUNA1o
40
40
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
41
41
  upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
42
42
  upgini/utils/datetime_utils.py,sha256=P56e7gcgAogJYfs2Blzk1uypxb9yrFzNaeJpMCRm6Zc,7716
43
+ upgini/utils/deduplicate_utils.py,sha256=o-XY0hbqikQTzwpX0nyl34j_oiBQTefCvRgLHkZRkTE,2795
43
44
  upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
44
45
  upgini/utils/email_utils.py,sha256=MhCLUAWqbp81xRyKizauNhVx6t_MFeJQRQ8pFM7EpFo,3480
45
46
  upgini/utils/fallback_progress_bar.py,sha256=f-VzVbiO6oU9WoKzEgoegYotixdiKanGlvdQCOGC-NY,1128
@@ -53,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,4
53
54
  upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
54
55
  upgini/utils/track_info.py,sha256=DVNVZmXUb4f25DSPEuUNEFx49hNEBfmuY9iSW5jkMnI,5708
55
56
  upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
56
- upgini-1.1.236a2.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
57
- upgini-1.1.236a2.dist-info/METADATA,sha256=bdbyXyxTfP4vY8JO6Cu6ABCtNAEzDKdSJJ0cCfNo7CI,48346
58
- upgini-1.1.236a2.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
59
- upgini-1.1.236a2.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
60
- upgini-1.1.236a2.dist-info/RECORD,,
57
+ upgini-1.1.237.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
58
+ upgini-1.1.237.dist-info/METADATA,sha256=A5a-AaJ1JMDFrGIwcV3sjlF3zMfWB9yrEvzKx2ZM6HE,48344
59
+ upgini-1.1.237.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
60
+ upgini-1.1.237.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
61
+ upgini-1.1.237.dist-info/RECORD,,