upgini 1.1.274a1__py3-none-any.whl → 1.1.274a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/autofe/date.py CHANGED
@@ -46,6 +46,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
46
46
  future = right + (left.dt.year - right.dt.year).apply(
47
47
  lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
48
48
  )
49
+ future = pd.to_datetime(future)
49
50
  before = future[future < left]
50
51
  future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
51
52
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
upgini/data_source/data_source_publisher.py CHANGED
@@ -48,6 +48,7 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
+ exclude_from_autofe_generation: Optional[List[str]],
51
52
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
52
53
  sort_column: Optional[str] = None,
53
54
  date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
57
58
  join_date_abs_limit_days: Optional[int] = None,
58
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
59
60
  data_table_id_to_replace: Optional[str] = None,
60
- exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
upgini/features_enricher.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import dataclasses
2
+ import datetime
2
3
  import gc
3
4
  import hashlib
4
5
  import itertools
@@ -146,6 +147,7 @@ class FeaturesEnricher(TransformerMixin):
146
147
  """
147
148
 
148
149
  TARGET_NAME = "target"
150
+ CURRENT_DATE = "current_date"
149
151
  RANDOM_STATE = 42
150
152
  CALCULATE_METRICS_THRESHOLD = 50_000_000
151
153
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -207,6 +209,7 @@ class FeaturesEnricher(TransformerMixin):
207
209
  client_ip: Optional[str] = None,
208
210
  client_visitorid: Optional[str] = None,
209
211
  custom_bundle_config: Optional[str] = None,
212
+ add_date_if_missing: bool = True,
210
213
  **kwargs,
211
214
  ):
212
215
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -317,6 +320,7 @@ class FeaturesEnricher(TransformerMixin):
317
320
  self.raise_validation_error = raise_validation_error
318
321
  self.exclude_columns = exclude_columns
319
322
  self.baseline_score_column = baseline_score_column
323
+ self.add_date_if_missing = add_date_if_missing
320
324
 
321
325
  def _get_api_key(self):
322
326
  return self._api_key
@@ -1804,10 +1808,13 @@ class FeaturesEnricher(TransformerMixin):
1804
1808
  else:
1805
1809
  features_section = ""
1806
1810
 
1807
- api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
1811
+ search_id = self._search_task.search_task_id
1812
+ api_example = (
1813
+ f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
1808
1814
  -H 'Authorization: {self.api_key}' \\
1809
1815
  -H 'Content-Type: application/json' \\
1810
- -d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
1816
+ -d '{{"search_keys": {keys}{features_section}}}'"""
1817
+ )
1811
1818
  return api_example
1812
1819
 
1813
1820
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -1902,6 +1909,9 @@ class FeaturesEnricher(TransformerMixin):
1902
1909
  generated_features.extend(converter.generated_features)
1903
1910
  else:
1904
1911
  self.logger.info("Input dataset hasn't date column")
1912
+ if self.add_date_if_missing:
1913
+ df = self._add_current_date_as_key(df)
1914
+ search_keys[self.CURRENT_DATE] = SearchKey.DATE
1905
1915
  email_column = self._get_email_column(search_keys)
1906
1916
  hem_column = self._get_hem_column(search_keys)
1907
1917
  email_converted_to_hem = False
@@ -2220,9 +2230,7 @@ class FeaturesEnricher(TransformerMixin):
2220
2230
  self.fit_search_keys = self.search_keys.copy()
2221
2231
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2222
2232
 
2223
- validate_dates_distribution(
2224
- validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2225
- )
2233
+ validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2226
2234
 
2227
2235
  maybe_date_column = self._get_date_column(self.fit_search_keys)
2228
2236
  has_date = maybe_date_column is not None
@@ -2273,6 +2281,9 @@ class FeaturesEnricher(TransformerMixin):
2273
2281
  self.fit_generated_features.extend(converter.generated_features)
2274
2282
  else:
2275
2283
  self.logger.info("Input dataset hasn't date column")
2284
+ if self.add_date_if_missing:
2285
+ df = self._add_current_date_as_key(df)
2286
+ self.fit_search_keys[self.CURRENT_DATE] = SearchKey.DATE
2276
2287
  email_column = self._get_email_column(self.fit_search_keys)
2277
2288
  hem_column = self._get_hem_column(self.fit_search_keys)
2278
2289
  email_converted_to_hem = False
@@ -2853,6 +2864,11 @@ class FeaturesEnricher(TransformerMixin):
2853
2864
  if t in [SearchKey.DATE, SearchKey.DATETIME]:
2854
2865
  return col
2855
2866
 
2867
+ @staticmethod
2868
+ def _add_current_date_as_key(df: pd.DataFrame) -> pd.DataFrame:
2869
+ df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2870
+ return df
2871
+
2856
2872
  @staticmethod
2857
2873
  def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
2858
2874
  return [
@@ -2903,9 +2919,7 @@ class FeaturesEnricher(TransformerMixin):
2903
2919
  [
2904
2920
  c
2905
2921
  for c in df.columns
2906
- if c not in sort_columns
2907
- and c not in sort_exclude_columns
2908
- and df[c].nunique() > 1
2922
+ if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
2909
2923
  ]
2910
2924
  # [
2911
2925
  # sk
upgini/metrics.py CHANGED
@@ -645,6 +645,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
645
645
 
646
646
 
647
647
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
648
+ # TODO validate that if it is Callable then it accepts 3 arguments
648
649
  if isinstance(scoring, str) and scoring is not None:
649
650
  _get_scorer_by_name(scoring)
650
651
 
upgini/normalizer/phone_normalizer.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
@@ -44,7 +44,7 @@ class PhoneNormalizer:
44
44
  Method will remove all non numeric chars from string and convert it to int.
45
45
  None will be set for phone numbers that couldn"t be converted to int
46
46
  """
47
- if is_string_dtype(self.df[self.phone_column_name]):
47
+ if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
48
48
  convert_func = self.phone_str_to_int_safe
49
49
  elif is_float_dtype(self.df[self.phone_column_name]):
50
50
  convert_func = self.phone_float_to_int_safe
upgini/utils/datetime_utils.py CHANGED
@@ -100,6 +100,9 @@ class DateTimeSearchKeyConverter:
100
100
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
101
101
  self.logger.warning(msg)
102
102
  raise ValidationError(msg)
103
+ else:
104
+ df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
105
+ df[self.date_column] = self.parse_date(df)
103
106
 
104
107
  # If column with date is datetime then extract seconds of the day and minute of the hour
105
108
  # as additional features
upgini-1.1.274a3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.274a1
3
+ Version: 1.1.274a3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
upgini-1.1.274a3.dist-info/RECORD CHANGED
@@ -2,11 +2,11 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=LPYSCGq89WLaL5iQNikTyhICUs_APtqEvhn5XRENn1U,174105
5
+ upgini/features_enricher.py,sha256=QjXTklqC7_UZTG4zYXYQPMybdoGWnsCcK-uKIIxUVWY,174740
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
8
8
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
9
- upgini/metrics.py,sha256=VmxVc-plbRPZ1U3Ve3E-FZkhYqi0X2r7x8H5L-shux4,29058
9
+ upgini/metrics.py,sha256=U3VJKbKmuWACqI4jTcszXo0WqeXFtV8bWyY9VLBL-rw,29129
10
10
  upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
11
11
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
12
12
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -15,18 +15,18 @@ upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0P
15
15
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
17
17
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
18
- upgini/autofe/date.py,sha256=cc0GMAJR0QZOI_Qp2V5UDklaXLNS_79O1GhU6GlOYzg,3895
18
+ upgini/autofe/date.py,sha256=_6RoEJZ5Kf-Q_aMOFucS6YSIZpCcelgpw-edV4qmRIM,3935
19
19
  upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
20
20
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
21
21
  upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
22
22
  upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
23
23
  upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
24
24
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- upgini/data_source/data_source_publisher.py,sha256=J2lrpPuysUHPeqTSfoybBtPRTBCFu7R5KzaakhjaRDc,16485
25
+ upgini/data_source/data_source_publisher.py,sha256=taRzyGgrPrTTSGw4Y-Ca5k4bf30aiTa68rxqT9zfqeI,16478
26
26
  upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
27
27
  upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
28
28
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
- upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
29
+ upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
30
30
  upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
31
31
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
32
32
  upgini/resource_bundle/strings.properties,sha256=TM9OykiEXNpcgFN3DpqBGbQs4N9m4mzHBn-k6aazc30,26111
@@ -41,7 +41,7 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
41
41
  upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
42
42
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
43
43
  upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
44
- upgini/utils/datetime_utils.py,sha256=4ii5WphAHlb_NRmdJx35VZpTarJbAr-AnDw3XSzUSow,10346
44
+ upgini/utils/datetime_utils.py,sha256=XciFOIYI4Zi7PqQS8dHxuPDEtdtwXbOrWsiAa04v2J4,10511
45
45
  upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
46
46
  upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
47
47
  upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
@@ -56,8 +56,8 @@ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,4
56
56
  upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
57
57
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
58
58
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
59
- upgini-1.1.274a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
- upgini-1.1.274a1.dist-info/METADATA,sha256=h6qNl44oOoxx3KE4tSeuxbx3JeXUUzSXdxmg6Fy0VMo,48158
61
- upgini-1.1.274a1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
- upgini-1.1.274a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
- upgini-1.1.274a1.dist-info/RECORD,,
59
+ upgini-1.1.274a3.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
+ upgini-1.1.274a3.dist-info/METADATA,sha256=lvF0tpzmSFKi-fAm2xL1NhIZa4SVTg98OcPENHWt6u8,48158
61
+ upgini-1.1.274a3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
+ upgini-1.1.274a3.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
+ upgini-1.1.274a3.dist-info/RECORD,,